{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:13Z","timestamp":1755870013619,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3728489","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"928-942","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Leonid: Exploring Automated Kernel Fusion in Performance-Portable Programming Models for Scientific Computation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4276-0510","authenticated-orcid":false,"given":"Chenchen","family":"Zhang","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1038-944X","authenticated-orcid":false,"given":"Hao","family":"Luo","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7426-6248","authenticated-orcid":false,"given":"Chao","family":"Yang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China and PKU-Changsha Institute of Computing and Digital Economy, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"AMD HIP homepage","year":"2024","unstructured":"2024. AMD HIP homepage. https:\/\/www.amd.com\/zh-cn\/developer\/resources\/rocm-hub\/hip-sdk.html"},{"key":"e_1_3_3_1_3_2","volume-title":"NVIDIA CUDA homepage","year":"2024","unstructured":"2024. NVIDIA CUDA homepage. https:\/\/developer.nvidia.com\/cuda-toolkit"},{"key":"e_1_3_3_1_4_2","unstructured":"2024. ONNX: Open Neural Network Exchange. https:\/\/github.com\/onnx\/onnx."},{"key":"e_1_3_3_1_5_2","volume-title":"OpenACC homepage","year":"2024","unstructured":"2024. OpenACC homepage. https:\/\/www.openacc.org\/"},{"key":"e_1_3_3_1_6_2","volume-title":"Top 500 supercomputer lists","year":"2024","unstructured":"2024. Top 500 supercomputer lists. https:\/\/www.top500.org\/lists\/top500\/2024\/06\/"},{"key":"e_1_3_3_1_7_2","unstructured":"Mart\u00edn Abadi Ashish Agarwal Paul Barham et\u00a0al. 2016. TensorFlow: Large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1603.04467 (2016)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Usman Ahmed Jerry Chun-Wei Lin and Gautam Srivastava. 2022. A ML-based resource utilization OpenCL GPU-kernel fusion model. Sustainable Computing: Informatics and Systems 35 (2022) 100683.","DOI":"10.1016\/j.suscom.2022.100683"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"B.\u00a0J. Alder and T.\u00a0E. Wainwright. 1959. Studies in Molecular Dynamics. I. General Method. The Journal of Chemical Physics 31 2 (1959) 459\u2013466.","DOI":"10.1063\/1.1730376"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC49587.2019.00012"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"P.\u00a0L. Bhatnagar E.\u00a0P. Gross and M. Krook. 1954. A Model for Collision Processes in Gases. I. Small Amplitude Processes in Charged and Neutral One-Component Systems. Physical Review 94 3 (1954) 511.","DOI":"10.1103\/PhysRev.94.511"},{"key":"e_1_3_3_1_12_2","volume-title":"Prefix Sums and Their Applications","author":"Blelloch Guy\u00a0E.","year":"1990","unstructured":"Guy\u00a0E. Blelloch. 1990. Prefix Sums and Their Applications. Technical Report CMU-CS-90-190. Carnegie Mellon University."},{"key":"e_1_3_3_1_13_2","unstructured":"Huanqi Cao and Jiajie Chen. 2022. Design and Implementation of Shenwei Universal C\/C++. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.00607 (2022)."},{"key":"e_1_3_3_1_14_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, et\u00a0al. 2018. TVM: An automated End-to-End optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_3_1_15_2","volume-title":"NVIDIA H100 Tensor Core GPU Architecture: Exceptional Performance, Scalability, and Security for the Data Center","author":"Corporation NVIDIA","year":"2022","unstructured":"NVIDIA Corporation. 2022. NVIDIA H100 Tensor Core GPU Architecture: Exceptional Performance, Scalability, and Security for the Data Center. White Paper. NVIDIA Corporation."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Jose\u00a0Monsalve Diaz Kyle Friedline Swaroop Pophale Oscar Hernandez David\u00a0E. Bernholdt and Sunita Chandrasekaran. 2019. Analysis of OpenMP 4.5 offloading in implementations: correctness and overhead. Parallel Comput. 89 (2019) 102546.","DOI":"10.1016\/j.parco.2019.102546"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"H.\u00a0Carter Edwards Daniel Sunderland Vicki Porter Chris Amsler and Sam Mish. 2012. Manycore performance-portability: Kokkos multidimensional array library. Scientific Programming 20 2 (2012) 89\u2013114.","DOI":"10.1155\/2012\/917630"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"H.\u00a0Carter Edwards Christian\u00a0R. Trott and Daniel Sunderland. 2014. Kokkos: Enabling manycore performance portability through polymorphic memory access patterns. J. Parallel and Distrib. Comput. 74 12 (2014) 3202\u20133216.","DOI":"10.1016\/j.jpdc.2014.07.003"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Jan Fousek Ji\u0159i Filipovi\u010d and Matu\u0161 Madzin. 2011. Automatic fusions of CUDA-GPU kernels for parallel map. ACM SIGARCH Computer Architecture News 39 4 (2011) 98\u201399.","DOI":"10.1145\/2082156.2082183"},{"key":"e_1_3_3_1_20_2","unstructured":"G. Ga\u00ebl et\u00a0al. 2021. Eigen v3. http:\/\/eigen.tuxfamily.org."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"A. Hart R. Ansaloni and A. Gray. 2012. Porting and scaling OpenACC applications on massively-parallel GPU-accelerated supercomputers. The European Physical Journal Special Topics 210 1 (2012) 5\u201316.","DOI":"10.1140\/epjst\/e2012-01634-y"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Paul Havlak and Ken Kennedy. 1991. An implementation of interprocedural bounded regular section analysis. IEEE Transactions on Parallel and Distributed Systems 2 3 (1991) 350\u2013360.","DOI":"10.1109\/71.86110"},{"key":"e_1_3_3_1_23_2","first-page":"981","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Heinecke Alexander","year":"2016","unstructured":"Alexander Heinecke, Greg Henry, Maxwell Hutchinson, and Hans Pabst. 2016. LIBXSMM: Accelerating small matrix multiplications by runtime code generation. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 981\u2013991."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.2172\/1089988"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"W.\u00a0DANIEL HILLIS and JR. GUY L.\u00a0STEELE. 1986. Data Parallel Algorithms. Commun. ACM 29 12 (1986).","DOI":"10.1145\/7902.7903"},{"key":"e_1_3_3_1_26_2","unstructured":"Lee Howes and Maria Rovatsou. 2015. SYCL integrates OpenCL devices with modern C++."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Xiaomeng Huang Xing Huang Dong Wang et\u00a0al. 2019. OpenArray v1.0: a simple operator library for the decoupling of ocean modeling and parallel computing. Geoscientific Model Development 12 11 (2019) 4729\u20134749.","DOI":"10.5194\/gmd-12-4729-2019"},{"key":"e_1_3_3_1_28_2","volume-title":"SYCL 2020 Specification","author":"Group Khronos SYCL Working","year":"2021","unstructured":"Khronos SYCL Working Group. 2021. SYCL 2020 Specification. Technical Report. Khronos Group."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Dmitrii Kochkov Jamie\u00a0A. Smith Anastasiya Alieva et\u00a0al. 2021. Machine learning\u2013accelerated computational fluid dynamics. Proceedings of the National Academy of Sciences 118 21 (2021) e2101784118.","DOI":"10.1073\/pnas.2101784118"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/LLVMHPCHiPar51896.2020.00010"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO53902.2022.9741270"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Kai Lu Yaohua Wang Yang Guo et\u00a0al. 2022. MT-3000: a heterogeneous multi-zone processor for HPC. CCF Transactions on High Performance Computing 4 2 (2022) 150\u2013164.","DOI":"10.1007\/s42514-022-00095-y"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Aristeidis Mastoras Sotiris Anagnostidis and Albert-Jan\u00a0N. Yzelman. 2022. Design and implementation for nonblocking execution in GraphBLAS: Tradeoffs and performance. ACM Transactions on Architecture and Code Optimization 20 1 (2022) 1\u201323.","DOI":"10.1145\/3561652"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.5555\/2388996.2389108"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Richard\u00a0Tran Mills Mark\u00a0F. Adams Satish Balay et\u00a0al. 2021. Toward performance-portable PETSc for GPU-based exascale systems. Parallel Comput. 108 (2021) 102831.","DOI":"10.1016\/j.parco.2021.102831"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2009.7478342"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Thomas Norrie Nishant Patil Doe\u00a0Hyun Yoon et\u00a0al. 2021. The design process for Google\u2019s training chips: TPUv2 and TPUv3. IEEE Micro 41 2 (2021) 56\u201363.","DOI":"10.1109\/MM.2021.3058217"},{"key":"e_1_3_3_1_38_2","volume-title":"OpenMP Application Programming Interface (version 5.2 ed.)","year":"2021","unstructured":"OpenMP Architecture Review Board 2021. OpenMP Application Programming Interface (version 5.2 ed.). OpenMP Architecture Review Board."},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"S.\u00a0J. Plimpton S.\u00a0G. Moore A. Borner et\u00a0al. 2019. Direct simulation Monte Carlo on petaflop supercomputers and beyond. Physics of Fluids 31 8 (2019) 080607.","DOI":"10.1063\/1.5108534"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"V\u00edctor P\u00e9rez Lukas Sommer Victor Lom\u00fcller Kumudha Narasimhan and Mehdi Goli. 2023. User-driven online kernel fusion for SYCL. ACM Transactions on Architecture and Code Optimization 20 2 (2023) 1\u201325.","DOI":"10.1145\/3571284"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661176"},{"key":"e_1_3_3_1_42_2","unstructured":"Milan Radulovi\u0107. 2019. Memory bandwidth and latency in HPC: system requirements and performance impact. Tesi doctoral UPC Departament d\u2019Arquitectura de Computadors (2019)."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Jonathan Ragan-Kelley Connelly Barnes Andrew Adams Sylvain Paris Fr\u00e9do Durand and Saman Amarasinghe. 2013. Halide: a language and compiler for optimizing parallelism locality and recomputation in image processing pipelines. ACM SIGPLAN Notices 48 6 (2013) 519\u2013530.","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Francis\u00a0P. Russell Michael\u00a0R. Mellor Paul\u00a0H.J. Kelly and Olav Beckmann. 2011. DESOLA: An active linear algebra library using delayed evaluation and runtime code generation. Science of Computer Programming 76 4 (2011) 227\u2013242.","DOI":"10.1016\/j.scico.2008.06.002"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Mehmet Sahin and Robert\u00a0G. Owens. 2003. A novel fully implicit finite volume method applied to the lid-driven cavity problem\u2014Part I: High Reynolds number flow calculations. International Journal for Numerical Methods in Fluids 42 1 (2003) 57\u201377.","DOI":"10.1002\/fld.442"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1093\/oso\/9780198503989.001.0001"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Siham Tabik G. Ortega and Ester\u00a0M. Garz\u00f3n. 2014. Performance evaluation of kernel fusion BLAS routines on the GPU: iterative solvers as case study. The Journal of Supercomputing 70 2 (2014) 577\u2013587.","DOI":"10.1007\/s11227-014-1102-4"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Andrij Trokhymchuk and Jos\u00e9 Alejandre. 1999. Computer Simulations of Liquid\/Vapor Interface in Lennard-Jones Fluids: Some Questions and Answers. The Journal of Chemical Physics 111 18 (1999) 8510\u20138523.","DOI":"10.1063\/1.480192"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Christian\u00a0R. Trott Damien Lebrun-Grandi\u00e9 Daniel Arndt Jan Ciesko Vinh Dang and Nathan Ellingwood. 2021. Kokkos 3: Programming model extensions for the exascale era. IEEE Transactions on Parallel and Distributed Systems 33 4 (2021) 805\u2013817.","DOI":"10.1109\/TPDS.2021.3097283"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Loup Verlet. 1967. Computer \"Experiments\" on Classical Fluids. I. Thermodynamical Properties of Lennard-Jones Molecules. Physical Review 159 1 (1967) 98.","DOI":"10.1103\/PhysRev.159.98"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.21"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/2749246.2749255"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"Chen Wang Jian Xia and Long Chen. 2024. A heterogeneous hybrid-precision finite volume method for compressible flow on unstructured grids. Computers & Fluids (2024) 106505.","DOI":"10.1016\/j.compfluid.2024.106505"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/GreenCom-CPSCom.2010.102"},{"key":"e_1_3_3_1_55_2","unstructured":"Junlin Wei Pengfei Lin Jinrong Jiang et\u00a0al. 2024. Accelerating LASG\/IAP climate system ocean model version 3 for performance portability using Kokkos. Future Generation Computer Systems (2024)."},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"crossref","unstructured":"Chuanfu Xu Xi Wang Dali Li Yonggang Che and Zhenghua Wang. 2019. OpenMP 4.5-enabled large-scale heterogeneous Lattice Boltzmann multiphase flow simulations. 2019 IEEE Intl Conf on Parallel & Distributed Processing with Applications Big Data & Cloud Computing Sustainable Computing & Communications Social Computing & Networking (ISPA\/BDCloud\/SocialCom\/SustainCom) 1007\u20131016.","DOI":"10.1109\/ISPA-BDCloud-SustainCom-SocialCom48970.2019.00145"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476158"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3728489","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:02:53Z","timestamp":1755867773000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3728489"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":56,"alternative-id":["10.1145\/3721145.3728489","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3728489","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}