{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,7]],"date-time":"2026-01-07T07:39:11Z","timestamp":1767771551869,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755574","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"582-591","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Detect Any Sound: Open-Vocabulary Sound Event Detection with Multi-Modal Queries"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5936-7042","authenticated-orcid":false,"given":"Pengfei","family":"Cai","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5668-9068","authenticated-orcid":false,"given":"Yan","family":"Song","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8493-7464","authenticated-orcid":false,"given":"Qing","family":"Gu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8754-8766","authenticated-orcid":false,"given":"Nan","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, HeFei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8567-1413","authenticated-orcid":false,"given":"Haoyu","family":"Song","sequence":"additional","affiliation":[{"name":"Singapore Institute of Technology, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7111-2008","authenticated-orcid":false,"given":"Ian","family":"McLoughlin","sequence":"additional","affiliation":[{"name":"Singapore Institute of Technology, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Keyu An Qian Chen Chong Deng Zhihao Du Changfeng Gao Zhifu Gao Yue Gu Ting He Hangrui Hu Kai Hu Shengpeng Ji Yabin Li Zerui Li Heng Lu Haoneng Luo Xiang Lv Bin Ma Ziyang Ma Chongjia Ni Changhe Song Jiaqi Shi Xian Shi Hao Wang Wen Wang Yuxuan Wang Zhangyu Xiao Zhijie Yan Yexin Yang Bin Zhang Qinglin Zhang Shiliang Zhang Nan Zhao and Siqi Zheng. 2024. FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs. arXiv:2407.04051 [cs.SD] https:\/\/arxiv.org\/abs\/2407.04051"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_24"},{"volume-title":"Thirty-Eighth AAAI Conference on Artificial Intelligence","author":"Bhosale Swapnil","key":"e_1_3_2_1_3_1","unstructured":"Swapnil Bhosale, Sauradip Nag, Diptesh Kanojia, Jiankang Deng, and Xiatian Zhu. 2024. DiffSED: Sound Event Detection with Denoising Diffusion. In Thirty-Eighth AAAI Conference on Artificial Intelligence, Michael J. Wooldridge, Jennifer G. Dy, and Sriraam Natarajan (Eds.). AAAI Press, 792--800."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052995"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024--714"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2690575"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings, Part I.","author":"Carion Nicolas","year":"2020","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-End Object Detection with Transformers. In Computer Vision - ECCV 2020 - 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2023.109719"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Ke Chen Xingjian Du Bilei Zhu Zejun Ma Taylor Berg-Kirkpatrick and Shlomo Dubnov. 2022. HTS-AT: A Hierarchical Token-Semantic Audio Transformer for Sound Classification and Detection. In ICASSP 2022 - 2022 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 646--650.","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"5193","author":"Chen Sanyuan","year":"2023","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, Shujie Liu, Daniel Tompkins, Zhuo Chen, Wanxiang Che, Xiangzhan Yu, and Furu Wei. 2023. BEATs: Audio Pre-Training with Acoustic Tokenizers. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 5178--5193."},{"key":"e_1_3_2_1_12_1","volume-title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. arXiv preprint arXiv:2311.07919","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. 2023. Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. arXiv preprint arXiv:2311.07919 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2005.1521669"},{"key":"e_1_3_2_1_14_1","volume-title":"DCASE 2024 TASK 4: SOUND EVENT DETECTION WITH HETEROGENEOUS DATA AND MISSING LABELS. Technical Report. DCASE2024 Challenge.","author":"Cornell Samuele","year":"2024","unstructured":"Samuele Cornell, Janek Ebbers, Constance Douwes, Irene Martin-Morato, Manu Harju, Annamaria Mesaros, and Romain Serizel. 2024. DCASE 2024 TASK 4: SOUND EVENT DETECTION WITH HETEROGENEOUS DATA AND MISSING LABELS. Technical Report. DCASE2024 Challenge."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics. Association for Computational Linguistics, 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics. Association for Computational Linguistics, 4171--4186."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021--698"},{"key":"e_1_3_2_1_20_1","volume-title":"The Tenth International Conference on Learning Representations, ICLR 2022","author":"Gu Xiuye","year":"2022","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2022. Open-vocabulary Object Detection via Vision and Language Knowledge Distillation. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25--29, 2022."},{"key":"e_1_3_2_1_21_1","first-page":"5036","article-title":"Conformer","volume":"2020","author":"Gulati Anmol","year":"2020","unstructured":"Anmol Gulati, James Qin, Chung-Cheng Chiu, Niki Parmar, Yu Zhang, Jiahui Yu, Wei Han, Shibo Wang, Zhengdong Zhang, Yonghui Wu, and Ruoming Pang. 2020. Conformer: Convolution-augmented Transformer for Speech Recognition. In Proc. Interspeech 2020. 5036--5040.","journal-title":"Convolution-augmented Transformer for Speech Recognition. In Proc. Interspeech"},{"key":"e_1_3_2_1_22_1","volume-title":"The Benefit of Temporally-Strong Labels in Audio Event Classification. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Hershey Shawn","year":"2021","unstructured":"Shawn Hershey, Daniel P W Ellis, Eduardo Fonseca, Aren Jansen, Caroline Liu, R Channing Moore, and Manoj Plakal. 2021. The Benefit of Temporally-Strong Labels in Audio Event Classification. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_23_1","unstructured":"Po-Yao Huang Hu Xu Juncheng Li Alexei Baevski Michael Auli Wojciech Galuba Florian Metze and Christoph Feichtenhofer. 2022. Masked Autoencoders that Listen. In NeurIPS."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30570"},{"key":"e_1_3_2_1_25_1","volume-title":"Impact of Sound Duration and Inactive Frames on Sound Event Detection Performance. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Imoto Keisuke","year":"2021","unstructured":"Keisuke Imoto, Sakiko Mishima, Yumi Arai, and Reishi Kondo. 2021. Impact of Sound Duration and Inactive Frames on Sound Event Detection Performance. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414350"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-227"},{"key":"e_1_3_2_1_30_1","volume-title":"Language-driven Semantic Segmentation. In The Tenth International Conference on Learning Representations, ICLR 2022","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian Q. Weinberger, Serge J. Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-driven Semantic Segmentation. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25--29, 2022. OpenReview.net. https:\/\/openreview.net\/forum?id=RriDjddCLN"},{"key":"e_1_3_2_1_31_1","volume-title":"ASTSED: An Effective Sound Event Detection Method Based on Audio Spectrogram Transformer. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Li Kang","year":"2023","unstructured":"Kang Li, Yan Song, Li-Rong Dai, Ian McLoughlin, Xin Fang, and Lin Liu. 2023. ASTSED: An Effective Sound Event Detection Method Based on Audio Spectrogram Transformer. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_32_1","volume-title":"Proc. INTERSPEECH","author":"Li Kang","year":"2023","unstructured":"Kang Li, Yan Song, Ian McLoughlin, Lin Liu, Jin Li, and Li-Rong Dai. 2023. Finetuning Audio Spectrogram Transformer with Task-aware Adapters for Sound Event Detection. In Proc. INTERSPEECH 2023. 291--295."},{"key":"e_1_3_2_1_33_1","volume-title":"Grounded Language-Image Pre-training. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10955--10965","author":"Li Liunian Harold","year":"2022","unstructured":"Liunian Harold Li, Pengchuan Zhang, Haotian Zhang, Jianwei Yang, Chunyuan Li, Yiwu Zhong, Lijuan Wang, Lu Yuan, Lei Zhang, Jenq-Neng Hwang, Kai-Wei Chang, and Jianfeng Gao. 2022. Grounded Language-Image Pre-training. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10955--10965."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681145"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_36_1","volume-title":"Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations (ICLR).","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3090678"},{"key":"e_1_3_2_1_39_1","volume-title":"Convolution-Augmented Transformer For Semi- Supervised Sound Event Detection. Technical Report. DCASE2020 Challenge.","author":"Miyazaki Koichi","year":"2020","unstructured":"Koichi Miyazaki, Tatsuya Komatsu, Tomoki Hayashi, Shinji Watanabe, Tomoki Toda, and Kazuya Takeda. 2020. Convolution-Augmented Transformer For Semi- Supervised Sound Event Detection. Technical Report. DCASE2020 Challenge."},{"key":"e_1_3_2_1_40_1","volume-title":"Exploring Performance-Complexity Trade-Offs in Sound Event Detection. arXiv preprint arXiv:2503.11373","author":"Morocutti Tobias","year":"2025","unstructured":"Tobias Morocutti, Florian Schmid, Jonathan Greif, Francesco Foscarin, and Gerhard Widmer. 2025. Exploring Performance-Complexity Trade-Offs in Sound Event Detection. arXiv preprint arXiv:2503.11373 (2025)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022--10127"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019--2680"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888942"},{"key":"e_1_3_2_1_46_1","volume-title":"Fine-Tune the Pretrained ATST Model for Sound Event Detection. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Shao Nian","year":"2024","unstructured":"Nian Shao, Xian Li, and Xiaofei Li. 2024. Fine-Tune the Pretrained ATST Model for Sound Event Detection. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448497"},{"key":"e_1_3_2_1_48_1","unstructured":"Yi Su Jisheng Bai Qisheng Xu Kele Xu and Yong Dou. 2025. Audio-Language Models for Audio-Centric Tasks: A survey. arXiv:2501.15177 [cs.SD] https:\/\/arxiv.org\/abs\/2501.15177"},{"key":"e_1_3_2_1_49_1","volume-title":"Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results. Advances in neural information processing systems 30","author":"Tarvainen Antti","year":"2017","unstructured":"Antti Tarvainen and Harri Valpola. 2017. Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_50_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747762"},{"key":"e_1_3_2_1_52_1","volume-title":"Towards Open Vocabulary Learning: A Survey","author":"Wu Jianzong","year":"2024","unstructured":"Jianzong Wu, Xiangtai Li, Shilin Xu, Haobo Yuan, Henghui Ding, Yibo Yang, Xia Li, Jiangning Zhang, Yunhai Tong, Xudong Jiang, Bernard Ghanem, and Dacheng Tao. 2024. Towards Open Vocabulary Learning: A Survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","unstructured":"Yusong Wu Ke Chen Tianyu Zhang Yuchen Hui Taylor Berg-Kirkpatrick and Shlomo Dubnov. 2023. Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation. In ICASSP 2023 - 2023 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 1--5. doi:10.1109\/ICASSP49357.2023.10095969","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_54_1","unstructured":"Zhirong Ye Xiangdong Wang Hong Liu Yueliang Qian Rui Tao Long Yan and Kazushige Ouchi. 2021. Sound Event Detection Transformer: An Eventbased End-to-End Model for Sound Event Detection. arXiv:2110.02011 [cs.SD] https:\/\/arxiv.org\/abs\/2110.02011"},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings, Part IX (Lecture Notes in Computer Science","volume":"122","author":"Zang Yuhang","year":"2022","unstructured":"Yuhang Zang, Wei Li, Kaiyang Zhou, Chen Huang, and Chen Change Loy. 2022. Open-Vocabulary DETR with Conditional Matching. In Computer Vision - ECCV 2022 - 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part IX (Lecture Notes in Computer Science, Vol. 13669), Shai Avidan, Gabriel J. Brostow, Moustapha Ciss\u00e9, Giovanni Maria Farinella, and Tal Hassner (Eds.). Springer, 106--122."},{"key":"e_1_3_2_1_56_1","volume-title":"Open-Vocabulary Object Detection Using Captions. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14388--14397","author":"Zareian Alireza","year":"2021","unstructured":"Alireza Zareian, Kevin Dela Rosa, Derek Hao Hu, and Shih-Fu Chang. 2021. Open-Vocabulary Object Detection Using Captions. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14388--14397."},{"key":"e_1_3_2_1_57_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Zhang Hongyi","year":"2018","unstructured":"Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, and David Lopez-Paz. 2018. mixup: Beyond Empirical Risk Minimization. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_58_1","volume-title":"Open Vocabulary Scene Parsing. In 2017 IEEE International Conference on Computer Vision (ICCV).","author":"Zhao Hang","year":"2017","unstructured":"Hang Zhao, Xavier Puig, Bolei Zhou, Sanja Fidler, and Antonio Torralba. 2017. Open Vocabulary Scene Parsing. In 2017 IEEE International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","unstructured":"Xu Zheng Yan Song Ian McLoughlin Lin Liu and Li-Rong Dai. 2021. An Improved Mean Teacher Based Method for Large Scale Weakly Labeled Semi- Supervised Sound Event Detection. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 356--360. doi:10.1109\/ICASSP39728.2021.9414931","DOI":"10.1109\/ICASSP39728.2021.9414931"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755574","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:05:40Z","timestamp":1765343140000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755574"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":59,"alternative-id":["10.1145\/3746027.3755574","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755574","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}