{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T14:57:50Z","timestamp":1773154670818,"version":"3.50.1"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,7,1]],"date-time":"2022-07-01T00:00:00Z","timestamp":1656633600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,7,1]],"date-time":"2022-07-01T00:00:00Z","timestamp":1656633600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,7]]},"DOI":"10.1109\/icdcs54860.2022.00087","type":"proceedings-article","created":{"date-parts":[[2022,10,13]],"date-time":"2022-10-13T19:44:10Z","timestamp":1665690250000},"page":"853-863","source":"Crossref","is-referenced-by-count":5,"title":["AIACC-Training: Optimizing Distributed Deep Learning Training through Multi-streamed and Concurrent Gradient Communications"],"prefix":"10.1109","author":[{"given":"Lixiang","family":"Lin","sequence":"first","affiliation":[{"name":"Alibaba Group"}]},{"given":"Shenghao","family":"Qiu","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Ziqi","family":"Yu","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Liang","family":"You","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Long","family":"Xin","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Xiaoyang","family":"Sun","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Jie","family":"Xu","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]}],"member":"263","reference":[{"key":"ref39","article-title":"Priority-based parameter propagation for distributed dnn training","author":"jayarajan","year":"2019"},{"key":"ref38","article-title":"Effect of batch size on training dynamics","year":"0"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref32","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-008-0141-y"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3497776.3517769"},{"key":"ref37","article-title":"Insightface: an open source 2d&#x2013;3d deep face analysis library","year":"0"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.14778\/3503585.3503590"},{"key":"ref35","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"},{"key":"ref34","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref10","article-title":"Tensorflow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"OSDI"},{"key":"ref40","article-title":"Optimizing sparse matrix multiplications for graph neural networks","author":"qiu","year":"2021"},{"key":"ref11","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"NIPS"},{"key":"ref12","article-title":"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems","author":"chen","year":"2015"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s10472-010-9213-y"},{"key":"ref14","article-title":"Parameter server for distributed machine learning","author":"li","year":"2013","journal-title":"NIPS Big Learning Workshop"},{"key":"ref15","author":"gibiansky","year":"2017"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220619"},{"key":"ref18","article-title":"Nvlink and nvswitch the building blocks of advanced multi-gpu communication","year":"0"},{"key":"ref19","article-title":"Evaluating modern gpu interconnect: Pcie, nvlink, nv-sli, nvswitch and gpudirect","author":"li","year":"2019","journal-title":"IEEE TPDS"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628092"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3405671.3405810"},{"key":"ref27","article-title":"Hyperband: A novel bandit-based approach to hyperparameter optimization","author":"li","year":"2017","journal-title":"The Journal of Machine Learning Research"},{"key":"ref3","article-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref6","article-title":"Blink: Fast and generic collectives for distributed ml","author":"wang","year":"2020","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2817118"},{"key":"ref5","article-title":"Blueconnect: Novel hierarchical all-reduce on multi-tired network for deep learning","author":"cho","year":"2019","journal-title":"MLSys"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11728"},{"key":"ref7","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2018","journal-title":"ICLRE"},{"key":"ref2","article-title":"A unified architecture for accelerating distributed {DNN} training in heterogeneous gpu\/cpu clusters","author":"jiang","year":"2020","journal-title":"OSDI"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"ref9","article-title":"Horovod: fast and easy distributed deep learning in tensorflow","author":"sergeev","year":"2018"},{"key":"ref20","author":"shanley","year":"2003","journal-title":"InfiniBand Network Architecture"},{"key":"ref46","article-title":"Habana homepage","year":"0"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref22","article-title":"A stochastic approximation method","author":"robbins","year":"0","journal-title":"The Annals of Mathematical Statistics"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00085"},{"key":"ref21","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014"},{"key":"ref47","article-title":"E3: Energy-efficient microservices on smartnic-accelerated servers","author":"liu","year":"2019","journal-title":"ATC"},{"key":"ref24","article-title":"Dawnbench: An end-to-end deep learning benchmark and competition","author":"coleman","year":"2017","journal-title":"Training"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref23","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2017"},{"key":"ref41","article-title":"Tictac: Accelerating distributed deep learning with communication scheduling","author":"hashemi","year":"2019","journal-title":"SysML"},{"key":"ref26","article-title":"Bayesian optimization with unknown search space","author":"ha","year":"2019","journal-title":"NIPS"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3052862"},{"key":"ref25","article-title":"Population based training of neural networks","author":"jaderberg","year":"2017"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"}],"event":{"name":"2022 IEEE 42nd International Conference on Distributed Computing Systems (ICDCS)","location":"Bologna, Italy","start":{"date-parts":[[2022,7,10]]},"end":{"date-parts":[[2022,7,13]]}},"container-title":["2022 IEEE 42nd International Conference on Distributed Computing Systems (ICDCS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9912136\/9912128\/09912175.pdf?arnumber=9912175","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,4]],"date-time":"2022-11-04T01:27:11Z","timestamp":1667525231000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9912175\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/icdcs54860.2022.00087","relation":{},"subject":[],"published":{"date-parts":[[2022,7]]}}}