{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T02:29:30Z","timestamp":1768271370678,"version":"3.49.0"},"reference-count":57,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Parallel and Distributed Computing"],"published-print":{"date-parts":[[2024,3]]},"DOI":"10.1016\/j.jpdc.2023.104811","type":"journal-article","created":{"date-parts":[[2023,11,24]],"date-time":"2023-11-24T04:33:38Z","timestamp":1700800418000},"page":"104811","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":11,"special_numbering":"C","title":["Sketch-fusion: A gradient compression method with multi-layer fusion for communication-efficient distributed training"],"prefix":"10.1016","volume":"185","author":[{"given":"Lingfei","family":"Dai","sequence":"first","affiliation":[]},{"given":"Luqi","family":"Gong","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7593-8293","authenticated-orcid":false,"given":"Zhulin","family":"An","sequence":"additional","affiliation":[]},{"given":"Yongjun","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8360-7718","authenticated-orcid":false,"given":"Boyu","family":"Diao","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.jpdc.2023.104811_br0010","author":"Abadi"},{"key":"10.1016\/j.jpdc.2023.104811_br0020","series-title":"Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing","first-page":"440","article-title":"Sparse communication for distributed gradient descent","author":"Aji","year":"2017"},{"key":"10.1016\/j.jpdc.2023.104811_br0030","first-page":"1709","article-title":"QSGD: communication-efficient SGD via gradient quantization and encoding","volume":"30","author":"Alistarh","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jpdc.2023.104811_br0040","first-page":"5973","article-title":"The convergence of sparsified gradient methods","volume":"31","author":"Alistarh","year":"2018","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jpdc.2023.104811_br0050","series-title":"2019 IEEE High Performance Extreme Computing Conference","first-page":"1","article-title":"Low overhead instruction latency characterization for nvidia gpgpus","author":"Arafa","year":"2019"},{"key":"10.1016\/j.jpdc.2023.104811_br0060","article-title":"Qsparse-local-SGD: distributed SGD with quantization, sparsification and local computations","volume":"32","author":"Basu","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jpdc.2023.104811_br0070","series-title":"International Conference on Machine Learning","first-page":"560","article-title":"signSGD: compressed optimisation for non-convex problems","author":"Bernstein","year":"2018"},{"key":"10.1016\/j.jpdc.2023.104811_br0080","author":"Chen"},{"key":"10.1016\/j.jpdc.2023.104811_br0090","series-title":"2020 IEEE Hot Chips 32 Symposium","first-page":"1","article-title":"NVIDIA A100 GPU: performance & innovation for GPU computing","author":"Choquette","year":"2020"},{"key":"10.1016\/j.jpdc.2023.104811_br0100","author":"Dettmers"},{"key":"10.1016\/j.jpdc.2023.104811_br0110","doi-asserted-by":"crossref","first-page":"2341","DOI":"10.1137\/120880811","article-title":"Stochastic first- and zeroth-order methods for nonconvex stochastic programming","volume":"23","author":"Ghadimi","year":"2013","journal-title":"SIAM J. Optim."},{"key":"10.1016\/j.jpdc.2023.104811_br0120","author":"Goyal"},{"key":"10.1016\/j.jpdc.2023.104811_br0130","series-title":"Proceedings of the 29th ACM International Conference on Information & Knowledge Management","first-page":"3507","article-title":"Compression of deep learning models for NLP","author":"Gupta","year":"2020"},{"key":"10.1016\/j.jpdc.2023.104811_br0140","author":"Gupta"},{"key":"10.1016\/j.jpdc.2023.104811_br0150","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.jpdc.2023.104811_br0160","first-page":"6869","article-title":"Quantized neural networks: training neural networks with low precision weights and activations","volume":"18","author":"Hubara","year":"2017","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.jpdc.2023.104811_br0170","first-page":"13144","article-title":"Communication-efficient distributed SGD with sketching","volume":"32","author":"Ivkin","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jpdc.2023.104811_br0180","author":"Jia"},{"key":"10.1016\/j.jpdc.2023.104811_br0190","series-title":"Proceedings of the 32nd International Conference on Neural Information Processing Systems","first-page":"2530","article-title":"A linear speedup analysis of distributed deep learning with sparse and quantized communication","author":"Jiang","year":"2018"},{"key":"10.1016\/j.jpdc.2023.104811_br0200","series-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation","first-page":"463","article-title":"A unified architecture for accelerating distributed DNN training in heterogeneous GPU\/CPU clusters","author":"Jiang","year":"2020"},{"key":"10.1016\/j.jpdc.2023.104811_br0210","series-title":"2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture","first-page":"1","article-title":"In-datacenter performance analysis of a tensor processing unit","author":"Jouppi","year":"2017"},{"key":"10.1016\/j.jpdc.2023.104811_br0220","author":"Kirillov"},{"key":"10.1016\/j.jpdc.2023.104811_br0230","series-title":"Cifar-10","author":"Krizhevsky","year":"2010"},{"key":"10.1016\/j.jpdc.2023.104811_br0240","doi-asserted-by":"crossref","first-page":"436","DOI":"10.1038\/nature14539","article-title":"Deep learning","volume":"521","author":"LeCun","year":"2015","journal-title":"Nature"},{"key":"10.1016\/j.jpdc.2023.104811_br0250","series-title":"11th {USENIX} Symposium on Operating Systems Design and Implementation","first-page":"583","article-title":"Scaling distributed machine learning with the parameter server","author":"Li","year":"2014"},{"key":"10.1016\/j.jpdc.2023.104811_br0260","series-title":"Proceedings of the 27th International Conference on Neural Information Processing Systems","first-page":"19","article-title":"Communication efficient distributed machine learning with the parameter server","author":"Li","year":"2014"},{"key":"10.1016\/j.jpdc.2023.104811_br0270","author":"Li"},{"key":"10.1016\/j.jpdc.2023.104811_br0280","series-title":"2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture","first-page":"175","article-title":"A network-centric hardware\/algorithm co-design to accelerate distributed training of deep neural networks","author":"Li","year":"2018"},{"key":"10.1016\/j.jpdc.2023.104811_br0290","article-title":"Can decentralized algorithms outperform centralized algorithms? A case study for decentralized parallel stochastic gradient descent","volume":"30","author":"Lian","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jpdc.2023.104811_br0300","author":"Lin"},{"key":"10.1016\/j.jpdc.2023.104811_br0310","series-title":"2018 IEEECVF Conference on Computer Vision and Pattern Recognition","first-page":"5419","article-title":"Event-based vision meets deep learning on steering prediction for self-driving cars","author":"Maqueda","year":"2018"},{"key":"10.1016\/j.jpdc.2023.104811_br0320","series-title":"Building a large annotated corpus of English: the Penn Treebank","author":"Marcus","year":"1993"},{"key":"10.1016\/j.jpdc.2023.104811_br0330","author":"Mishchenko"},{"key":"10.1016\/j.jpdc.2023.104811_br0340","doi-asserted-by":"crossref","first-page":"117","DOI":"10.1016\/j.jpdc.2008.09.002","article-title":"Bandwidth optimal all-reduce algorithms for clusters of workstations","volume":"69","author":"Patarasuk","year":"2009","journal-title":"J. Parallel Distrib. Comput."},{"key":"10.1016\/j.jpdc.2023.104811_br0350","series-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles","first-page":"16","article-title":"A generic communication scheduler for distributed DNN training acceleration","author":"Peng","year":"2019"},{"key":"10.1016\/j.jpdc.2023.104811_br0360","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"779","article-title":"You only look once: unified, real-time object detection","author":"Redmon","year":"2016"},{"key":"10.1016\/j.jpdc.2023.104811_br0370","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"Imagenet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.jpdc.2023.104811_br0380","author":"Sapio"},{"key":"10.1016\/j.jpdc.2023.104811_br0390","author":"Sergeev"},{"key":"10.1016\/j.jpdc.2023.104811_br0400","series-title":"IEEE INFOCOM 2021-IEEE Conference on Computer Communications","first-page":"1","article-title":"Exploiting simultaneous communications to accelerate data parallel distributed deep learning","author":"Shi","year":"2021"},{"key":"10.1016\/j.jpdc.2023.104811_br0410","series-title":"IEEE INFOCOM 2020-IEEE Conference on Computer Communications","first-page":"406","article-title":"Communication-efficient distributed deep learning with merged gradient sparsification on GPUs","author":"Shi","year":"2020"},{"key":"10.1016\/j.jpdc.2023.104811_br0420","series-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems","first-page":"802","article-title":"Convolutional lstm network: a machine learning approach for precipitation nowcasting","volume":"vol. 1","author":"Shi","year":"2015"},{"key":"10.1016\/j.jpdc.2023.104811_br0430","author":"Simonyan"},{"key":"10.1016\/j.jpdc.2023.104811_br0440","first-page":"4452","article-title":"Sparsified SGD with memory","volume":"31","author":"Stich","year":"2018","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jpdc.2023.104811_br0450","series-title":"Interspeech 2015","first-page":"1488","article-title":"Scalable distributed DNN training using commodity GPU cloud computing","author":"Strom","year":"2015"},{"key":"10.1016\/j.jpdc.2023.104811_br0460","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1177\/1094342005051521","article-title":"Optimization of collective communication operations in mpich","volume":"19","author":"Thakur","year":"2005","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"10.1016\/j.jpdc.2023.104811_br0470","author":"Tsuzuku"},{"key":"10.1016\/j.jpdc.2023.104811_br0480","first-page":"14236","article-title":"PowerSGD: practical low-rank gradient compression for distributed optimization","volume":"32","author":"Vogels","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jpdc.2023.104811_br0490","series-title":"Algorithmic Learning Theory","first-page":"857","article-title":"Stochastic nonconvex optimization with large minibatches","author":"Wang","year":"2019"},{"key":"10.1016\/j.jpdc.2023.104811_br0500","series-title":"International Conference on Machine Learning","first-page":"5325","article-title":"Error compensated quantized SGD and its applications to large-scale distributed optimization","author":"Wu","year":"2018"},{"key":"10.1016\/j.jpdc.2023.104811_br0510","author":"Xu"},{"key":"10.1016\/j.jpdc.2023.104811_br0520","series-title":"Compressed communication for distributed deep learning: Survey and quantitative evaluation","author":"Xu","year":"2020"},{"key":"10.1016\/j.jpdc.2023.104811_br0530","article-title":"Artificial intelligence: a powerful paradigm for scientific research","volume":"2","author":"Xu","year":"2021","journal-title":"Innovation"},{"key":"10.1016\/j.jpdc.2023.104811_br0540","series-title":"Proceedings of the 47th International Conference on Parallel Processing","first-page":"1","article-title":"Imagenet training in minutes","author":"You","year":"2018"},{"key":"10.1016\/j.jpdc.2023.104811_br0550","author":"Zhang"},{"key":"10.1016\/j.jpdc.2023.104811_br0560","series-title":"Proceedings of the Workshop on Network Meets AI & ML","first-page":"8","article-title":"Is network the bottleneck of distributed training?","author":"Zhang","year":"2020"},{"key":"10.1016\/j.jpdc.2023.104811_br0570","author":"Zong"}],"container-title":["Journal of Parallel and Distributed Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731523001818?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731523001818?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T22:39:38Z","timestamp":1760049578000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0743731523001818"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3]]},"references-count":57,"alternative-id":["S0743731523001818"],"URL":"https:\/\/doi.org\/10.1016\/j.jpdc.2023.104811","relation":{},"ISSN":["0743-7315"],"issn-type":[{"value":"0743-7315","type":"print"}],"subject":[],"published":{"date-parts":[[2024,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Sketch-fusion: A gradient compression method with multi-layer fusion for communication-efficient distributed training","name":"articletitle","label":"Article Title"},{"value":"Journal of Parallel and Distributed Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.jpdc.2023.104811","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2023 Elsevier Inc. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"104811"}}