{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:13:12Z","timestamp":1775326392577,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":30,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T00:00:00Z","timestamp":1723420800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2023YFB3001801"],"award-info":[{"award-number":["2023YFB3001801"]}]},{"name":"National Natural Science Foundation of China","award":["62322201, 62072018, U23B2020, U22A2028"],"award-info":[{"award-number":["62322201, 62072018, U23B2020, U22A2028"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["YWF-23-L-1121"],"award-info":[{"award-number":["YWF-23-L-1121"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,12]]},"DOI":"10.1145\/3673038.3673116","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T18:29:01Z","timestamp":1723141741000},"page":"822-832","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["PRoof: A Comprehensive Hierarchical Profiling Framework for Deep Neural Networks with Roofline Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4295-9983","authenticated-orcid":false,"given":"Siyu","family":"Wu","sequence":"first","affiliation":[{"name":"Beihang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1101-7927","authenticated-orcid":false,"given":"Hailong","family":"Yang","sequence":"additional","affiliation":[{"name":"Beihang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5163-4607","authenticated-orcid":false,"given":"Xin","family":"You","sequence":"additional","affiliation":[{"name":"Beihang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0081-5395","authenticated-orcid":false,"given":"Ruihao","family":"Gong","sequence":"additional","affiliation":[{"name":"SenseTime Research, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1829-2817","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7186-0556","authenticated-orcid":false,"given":"Zhongzhi","family":"Luan","sequence":"additional","affiliation":[{"name":"Beihang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5382-1473","authenticated-orcid":false,"given":"Depei","family":"Qian","sequence":"additional","affiliation":[{"name":"Beihang University, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,12]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Intel Corporation. 2024. Intel VTune Profiler. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/vtune-profiler.html"},{"key":"e_1_3_2_2_2_1","unstructured":"NVIDIA Corporation. 2024. DLProf User Guide. https:\/\/docs.nvidia.com\/deeplearning\/frameworks\/dlprof-user-guide\/index.html"},{"key":"e_1_3_2_2_3_1","unstructured":"NVIDIA Corporation. 2024. Nsight Systems | NVIDIA Developer. https:\/\/developer.nvidia.com\/nsight-systems"},{"key":"e_1_3_2_2_4_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA Nsight Compute | NVIDIA Developer. https:\/\/developer.nvidia.com\/nsight-compute"},{"key":"e_1_3_2_2_5_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA TensorRT. https:\/\/developer.nvidia.com\/tensorrt"},{"key":"e_1_3_2_2_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_7_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arxiv:2010.11929\u00a0[cs.CV]"},{"key":"e_1_3_2_2_8_1","unstructured":"The\u00a0Linux Foundation. 2024. Open Neural Network Exchange. https:\/\/onnx.ai\/"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_10_1","volume-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861","author":"Howard G","year":"2017","unstructured":"Andrew\u00a0G Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, and Hartwig Adam. 2017. Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861 (2017)."},{"key":"e_1_3_2_2_11_1","first-page":"623","article-title":"dPRO: A Generic Performance Diagnosis and Optimization Toolkit for Expediting Distributed DNN Training","volume":"4","author":"Hu Hanpeng","year":"2022","unstructured":"Hanpeng Hu, Chenyu Jiang, Yuchen Zhong, Yanghua Peng, Chuan Wu, Yibo Zhu, Haibin Lin, and Chuanxiong Guo. 2022. dPRO: A Generic Performance Diagnosis and Optimization Toolkit for Expediting Distributed DNN Training. Proceedings of Machine Learning and Systems 4 (2022), 623\u2013637.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_2_12_1","unstructured":"Intel. 2024. OpenVINO\u2122 toolkit: An open source AI toolkit that makes it easier to write once deploy anywhere. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/openvino-toolkit\/overview.html"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00053"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00042"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"e_1_3_2_2_17_1","volume-title":"MD-Roofline: A Training Performance Analysis Model for Distributed Deep Learning. In 2022 IEEE Symposium on Computers and Communications (ISCC). IEEE, 1\u20138.","author":"Miao Tianhao","year":"2022","unstructured":"Tianhao Miao, Qinghua Wu, Ting Liu, Penglai Cui, Rui Ren, Zhenyu Li, and Gaogang Xie. 2022. MD-Roofline: A Training Performance Analysis Model for Distributed Deep Learning. In 2022 IEEE Symposium on Computers and Communications (ISCC). IEEE, 1\u20138."},{"key":"e_1_3_2_2_18_1","unstructured":"Microsoft. 2024. Optimize and Accelerate Machine Learning Inferencing and Training. https:\/\/onnxruntime.ai\/"},{"key":"e_1_3_2_2_19_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2018.8451355"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00016"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_2_24_1","volume-title":"International conference on machine learning. PMLR, 6105\u20136114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. PMLR, 6105\u20136114."},{"key":"e_1_3_2_2_25_1","volume-title":"International conference on machine learning. PMLR, 10096\u201310106","author":"Tan Mingxing","year":"2021","unstructured":"Mingxing Tan and Quoc Le. 2021. Efficientnetv2: Smaller models and faster training. In International conference on machine learning. PMLR, 10096\u201310106."},{"key":"e_1_3_2_2_26_1","volume-title":"Mlp-mixer: An all-mlp architecture for vision. Advances in neural information processing systems 34","author":"Tolstikhin O","year":"2021","unstructured":"Ilya\u00a0O Tolstikhin, Neil Houlsby, Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Thomas Unterthiner, Jessica Yung, Andreas Steiner, Daniel Keysers, Jakob Uszkoreit, 2021. Mlp-mixer: An all-mlp architecture for vision. Advances in neural information processing systems 34 (2021), 24261\u201324272."},{"key":"e_1_3_2_2_27_1","volume-title":"Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-80126-7_35"},{"key":"e_1_3_2_2_30_1","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Zhu Hongyu","year":"2020","unstructured":"Hongyu Zhu, Amar Phanishayee, and Gennady Pekhimenko. 2020. Daydream: Accurately estimating the efficacy of optimizations for { DNN} training. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). 337\u2013352."}],"event":{"name":"ICPP '24: the 53rd International Conference on Parallel Processing","location":"Gotland Sweden","acronym":"ICPP '24"},"container-title":["Proceedings of the 53rd International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673116","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3673038.3673116","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T17:30:21Z","timestamp":1758648621000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673116"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,12]]},"references-count":30,"alternative-id":["10.1145\/3673038.3673116","10.1145\/3673038"],"URL":"https:\/\/doi.org\/10.1145\/3673038.3673116","relation":{},"subject":[],"published":{"date-parts":[[2024,8,12]]},"assertion":[{"value":"2024-08-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}