{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T04:21:37Z","timestamp":1777350097487,"version":"3.51.4"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,5,1]],"date-time":"2023-05-01T00:00:00Z","timestamp":1682899200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,5,1]],"date-time":"2023-05-01T00:00:00Z","timestamp":1682899200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,5]]},"DOI":"10.1109\/ipdps54959.2023.00042","type":"proceedings-article","created":{"date-parts":[[2023,7,18]],"date-time":"2023-07-18T17:29:11Z","timestamp":1689701351000},"page":"344-355","source":"Crossref","is-referenced-by-count":43,"title":["ByteTransformer: A High-Performance Transformer Boosted for Variable-Length Inputs"],"prefix":"10.1109","author":[{"given":"Yujia","family":"Zhai","sequence":"first","affiliation":[{"name":"University of California,Riverside"}]},{"given":"Chengquan","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance Ltd."}]},{"given":"Leyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance Ltd."}]},{"given":"Xiaoying","family":"Jia","sequence":"additional","affiliation":[{"name":"ByteDance Ltd."}]},{"given":"Shang","family":"Zhang","sequence":"additional","affiliation":[{"name":"NVIDIA Corporation"}]},{"given":"Zizhong","family":"Chen","sequence":"additional","affiliation":[{"name":"University of California,Riverside"}]},{"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Ltd."}]},{"given":"Yibo","family":"Zhu","sequence":"additional","affiliation":[{"name":"ByteDance Ltd."}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref35","article-title":"Albert: A lite bert for self-supervised learning of language representations","author":"lan","year":"2019"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref34","year":"2022"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-industry.15"},{"key":"ref37","article-title":"Deberta: Decoding-enhanced bert with disentangled attention","author":"he","year":"2020"},{"key":"ref14","article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"shoeybi","year":"2019"},{"key":"ref36","article-title":"Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter","author":"sanh","year":"2019"},{"key":"ref31","article-title":"Gaussian error linear units (gelus)","author":"hendrycks","year":"2016"},{"key":"ref30","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","author":"dao","year":"2022"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"ref33","year":"2022"},{"key":"ref10","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref32","year":"2022"},{"key":"ref2","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"},{"key":"ref1","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref17","year":"2022"},{"key":"ref16","first-page":"265","article-title":"{TensorFlow}: a system for {Large-Scale} machine learning","author":"abadi","year":"2016","journal-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref18","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref24","year":"2022"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00074"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476138"},{"key":"ref25","year":"2022"},{"key":"ref20","year":"2022"},{"key":"ref22","article-title":"Algorithm-based fault tolerance for convolutional neural networks","author":"zhao","year":"2020","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3447818.3460364"},{"key":"ref28","year":"2022"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"ref29","year":"2022"},{"key":"ref8","year":"2023"},{"key":"ref7","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"radford","year":"2019","journal-title":"OpenAIRE blog"},{"key":"ref9","article-title":"Boosting distributed training performance of the unpadded bert model","author":"zeng","year":"2022"},{"key":"ref4","article-title":"Understanding backtranslation at scale","author":"edunov","year":"2018"},{"key":"ref3","article-title":"Xlnet: Generalized autoregressive pretraining for language understanding","volume":"32","author":"yang","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357895"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3326937.3341261"}],"event":{"name":"2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","location":"St. Petersburg, FL, USA","start":{"date-parts":[[2023,5,15]]},"end":{"date-parts":[[2023,5,19]]}},"container-title":["2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10177277\/10177383\/10177488.pdf?arnumber=10177488","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,7]],"date-time":"2023-08-07T17:39:08Z","timestamp":1691429948000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10177488\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/ipdps54959.2023.00042","relation":{},"subject":[],"published":{"date-parts":[[2023,5]]}}}