{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T07:21:32Z","timestamp":1772695292663,"version":"3.50.1"},"reference-count":60,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408572","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-15","source":"Crossref","is-referenced-by-count":0,"title":["TENET-v2: Applying Relation-Centric Notation to Model and Optimize Data Swizzle in the Cache of Modern NPU"],"prefix":"10.1109","author":[{"given":"Hanyu","family":"Zhang","sequence":"first","affiliation":[{"name":"Zhejiang University"}]},{"given":"Fangxu","family":"Guo","sequence":"additional","affiliation":[{"name":"Zhejiang University"}]},{"given":"Liqiang","family":"Lu","sequence":"additional","affiliation":[{"name":"Zhejiang University"}]},{"given":"Long","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd."}]},{"given":"Yunfei","family":"Du","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd."}]},{"given":"Zhe","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd."}]},{"given":"Jinghan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd."}]},{"given":"Jie","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University"}]},{"given":"Chenli","family":"Xue","sequence":"additional","affiliation":[{"name":"Zhejiang University"}]},{"given":"Chengpeng","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University"}]},{"given":"Ziyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University"}]},{"given":"Yun","family":"Liang","sequence":"additional","affiliation":[{"name":"Peking University"}]},{"given":"Size","family":"Zheng","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Jianwei","family":"Yin","sequence":"additional","affiliation":[{"name":"Zhejiang University"}]}],"member":"263","reference":[{"key":"ref1","author":"AI","year":"2023","journal-title":"Chatglm"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.1993.397164"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446067"},{"key":"ref5","first-page":"578","article-title":"\\{TVM\\}: An automated \\{End-toEnd\\} optimizing compiler for deep learning","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen","year":"2018"},{"key":"ref6","article-title":"Learning to optimize tensor programs","volume":"31","author":"Chen","year":"2018","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1023\/A:1008012332212"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.58"},{"key":"ref9","author":"Corporation","year":"2025","journal-title":"a collection of c++ cuda template abstractions for defining and operating on hierarchically multidimensional layouts of threads and data"},{"key":"ref10","author":"Corporation","year":"2025","journal-title":"Nvidia cutlass library"},{"key":"ref11","author":"Corporation","year":"2025","journal-title":"Nvidia\u00ae tensorrt TM is an sdk for highperformance deep learning inference on nvidia gpus"},{"key":"ref12","article-title":"DeepSeek-AI","year":"2024","journal-title":"Deepseek-v3"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19\u20131423"},{"key":"ref14","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3314221.3314606"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00017"},{"key":"ref19","article-title":"Mixtral of experts","author":"Jiang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358286"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623790"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567075"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4613-1705-0"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356218"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446759"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00071"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00062"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.11"},{"key":"ref30","article-title":"Meta","year":"2024","journal-title":"Llama 3"},{"key":"ref31","article-title":"Microsoft","year":"2025","journal-title":"Onnx runtime: cross-platform, high performance ml inferencing and training accelerator"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582069"},{"key":"ref33","article-title":"OpenAI","year":"2023","journal-title":"Gpt-4"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577479"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"ref36","article-title":"Lut-gemm: Quantized matrix multiplication based on luts for efficient inference in large-scale generative language models","author":"Park","year":"2022","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"ref38","first-page":"89","article-title":"Kosmo: Efficient online miss ratio curve generation for eviction policy evaluation","volume-title":"22nd USENIX Conference on File and Storage Technologies (FAST 24)","author":"Shakiba","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"ref40","first-page":"701","article-title":"Welder: Scheduling deep learning memory access via tile-graph","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Shi","year":"2023"},{"key":"ref41","article-title":"Thunderkittens: Simple, fast, and adorable ai kernels","author":"Spector","year":"2024","journal-title":"arXiv preprint"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00068"},{"key":"ref43","article-title":"Stumm Lab and kosmo-fast24 Contributors","year":"2025","journal-title":"Kosmo: Efficient online miss ratio curve generation for eviction policy evaluation"},{"key":"ref44","article-title":"Efficientnet: Rethinking model scaling for convolutional neural networks","volume":"abs\/1905.11946","author":"Tan","year":"2019","journal-title":"ArXiv"},{"key":"ref45","author":"Team","year":"2024","journal-title":"Introducing dbrx: A new state-of-the-art open llm"},{"key":"ref46","author":"Technologies","year":"2025","journal-title":"Huawei compute architecture for neural networks ascend operator library (aol)"},{"key":"ref47","author":"Technologies","year":"2025","journal-title":"Huawei heterogeneous compute architecture for neural networks (cann)"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"ref49","article-title":"Ultralytics","year":"2025","journal-title":"Yolov8"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3431920.3439292"},{"key":"ref51","article-title":"Tilelang: A composable tiled programming model for ai systems","author":"Wang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00066"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2105.15203"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00022"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378514"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00072"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446702"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454106"},{"key":"ref59","first-page":"863","article-title":"Ansor: Generating \\{High-Performance\\} tensor programs for deep learning","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zheng","year":"2020"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731657"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408572.pdf?arnumber=11408572","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:49:59Z","timestamp":1772693399000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408572\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408572","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}