{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:54Z","timestamp":1765343094294,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62525103, 62441235, 62271281, 62021002"],"award-info":[{"award-number":["62525103, 62441235, 62271281, 62021002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Natural Science Foundation","award":["L252009"],"award-info":[{"award-number":["L252009"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755607","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"4788-4797","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Advancing Reliable Test-Time Adaptation of Vision-Language Models under Visual Variations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8936-4632","authenticated-orcid":false,"given":"Yiwen","family":"Liang","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4180-5801","authenticated-orcid":false,"given":"Hui","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5233-9466","authenticated-orcid":false,"given":"Yizhe","family":"Xiong","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3658-1002","authenticated-orcid":false,"given":"Zihan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5404-4127","authenticated-orcid":false,"given":"Mengyao","family":"Lyu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1390-7424","authenticated-orcid":false,"given":"Zijia","family":"Lin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8212-1831","authenticated-orcid":false,"given":"Shuaicheng","family":"Niu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5843-6411","authenticated-orcid":false,"given":"Sicheng","family":"Zhao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4361-956X","authenticated-orcid":false,"given":"Jungong","family":"Han","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0137-9975","authenticated-orcid":false,"given":"Guiguang","family":"Ding","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Noor Hussein, Muhammad Uzair Khattak, Muhammad Muzammal Naseer, Fahad Shahbaz Khan, and Salman H Khan.","author":"Samadh Jameel Abdul","year":"2023","unstructured":"Jameel Abdul Samadh, Mohammad Hanan Gani, Noor Hussein, Muhammad Uzair Khattak, Muhammad Muzammal Naseer, Fahad Shahbaz Khan, and Salman H Khan. 2023. Align Your Prompts: Test-Time Prompting with Distribution Alignment for Zero-Shot Generalization. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 80396-80413."},{"key":"e_1_3_2_1_2_1","volume-title":"Jong Wook Kim, and Miles Brundage","author":"Agarwal Sandhini","year":"2021","unstructured":"Sandhini Agarwal, Gretchen Krueger, Jack Clark, Alec Radford, Jong Wook Kim, and Miles Brundage. 2021. Evaluating clip: towards characterization of broader capabilities and downstream implications. arXiv preprint arXiv:2108.02818 (2021)."},{"key":"e_1_3_2_1_3_1","first-page":"107","article-title":"Prompting Language-Informed Distribution for Compositional Zero-Shot Learning. In Computer Vision - ECCV 2024","author":"Bao Wentao","year":"2025","unstructured":"Wentao Bao, Lichang Chen, Heng Huang, and Yu Kong. 2025. Prompting Language-Informed Distribution for Compositional Zero-Shot Learning. In Computer Vision - ECCV 2024. Cham, 107-123.","journal-title":"Cham"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00331"},{"volume-title":"Frustratingly Easy Test-Time Adaptation of Vision-Language Models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Farina Matteo","key":"e_1_3_2_1_8_1","unstructured":"Matteo Farina, Gianni Franchi, Giovanni Iacca, Massimiliano Mancini, and Elisa Ricci. [n.d.]. Frustratingly Easy Test-Time Adaptation of Vision-Language Models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"volume-title":"2004 Conference on Computer Vision and Pattern Recognition Workshop. 178-178","author":"Fei-Fei Li","key":"e_1_3_2_1_9_1","unstructured":"Li Fei-Fei, R. Fergus, and P. Perona. 2004. Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories. In 2004 Conference on Computer Vision and Pattern Recognition Workshop. 178-178."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00255"},{"key":"e_1_3_2_1_11_1","volume-title":"Consolidator: Mergeable Adapter with Grouped Connections for Visual Adaptation. arXiv:2305.00603 [cs.CV] https:\/\/arxiv.org\/abs\/2305.00603","author":"Hao Tianxiang","year":"2023","unstructured":"Tianxiang Hao, Hui Chen, Yuchen Guo, and Guiguang Ding. 2023. Consolidator: Mergeable Adapter with Grouped Connections for Visual Adaptation. arXiv:2305.00603 [cs.CV] https:\/\/arxiv.org\/abs\/2305.00603"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Learning Representations.","author":"Dan","year":"2020","unstructured":"Dan Hendrycks*, Norman Mu*, Ekin Dogus Cubuk, Barret Zoph, Justin Gilmer, and Balaji Lakshminarayanan. 2020. AugMix: A Simple Method to Improve Robustness and Uncertainty under Data Shift. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_2_1_16_1","unstructured":"Tony Huang Jack Chu and Fangyun Wei. 2022. Unsupervised Prompt Learning for Vision-Language Models. arXiv:2204.03649 https:\/\/arxiv.org\/abs\/2204.03649"},{"key":"e_1_3_2_1_17_1","volume-title":"Retrieval-Enhanced Contrastive Vision-Text Models. In The Twelfth International Conference on Learning Representations.","author":"Iscen Ahmet","year":"2024","unstructured":"Ahmet Iscen, Mathilde Caron, Alireza Fathi, and Cordelia Schmid. 2024. Retrieval-Enhanced Contrastive Vision-Text Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 4904-4916."},{"key":"e_1_3_2_1_19_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Lu Yang","year":"2024","unstructured":"Jiahaoli, Yang Lu, Yuan Xie, and Yanyun Qu. 2024. Relationship Prompt Learning is Enough for Open-Vocabulary Semantic Segmentation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01343"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01032"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01032"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"volume-title":"Advances in Neural Information Processing Systems","author":"Liang Victor Weixin","key":"e_1_3_2_1_24_1","unstructured":"Victor Weixin Liang, Yuhui Zhang, Yongchan Kwon, Serena Yeung, and James Y Zou. 2022. Mind the Gap: Understanding the Modality Gap in Multi-modal Contrastive Representation Learning. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 17612-17625."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612055"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"e_1_3_2_1_27_1","volume-title":"Computer Vision - ECCV","author":"Lyu Mengyao","year":"2024","unstructured":"Mengyao Lyu, Tianxiang Hao, Xinhao Xu, Hui Chen, Zijia Lin, Jungong Han, and Guiguang Ding. 2025. Learn from the Learnt: Source-Free Active Domain Adaptation via Contrastive Sampling and Visual Persistence. In Computer Vision - ECCV 2024. Springer Nature Switzerland, Cham, 228-246."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02276"},{"key":"e_1_3_2_1_29_1","first-page":"65252","volume-title":"Levine (Eds.)","volume":"36","author":"XIAOSONG MA","year":"2023","unstructured":"XIAOSONG MA, Jie ZHANG, Song Guo, and Wenchao Xu. 2023. SwapPrompt: Test-Time Prompt Adaptation for Vision-Language Models. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 65252-65264."},{"key":"e_1_3_2_1_30_1","unstructured":"Subhransu Maji Esa Rahtu Juho Kannala Matthew Blaschko and Andrea Vedaldi. 2013. Fine-Grained Visual Classification of Aircraft. arXiv:1306.5151 https:\/\/arxiv.org\/abs\/1306.5151"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"volume-title":"2012 IEEE Conference on Computer Vision and Pattern Recognition. 3498-3505","author":"Parkhi Omkar M","key":"e_1_3_2_1_32_1","unstructured":"Omkar M Parkhi, Andrea Vedaldi, Andrew Zisserman, and C. V. Jawahar. 2012. Cats and dogs. In 2012 IEEE Conference on Computer Vision and Pattern Recognition. 3498-3505."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748-8763."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"5400","author":"Recht Benjamin","year":"2019","unstructured":"Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, and Vaishaal Shankar. 2019. Do ImageNet Classifiers Generalize to ImageNet?. In Proceedings of the 36th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 97), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, 5389-5400."},{"key":"e_1_3_2_1_35_1","first-page":"287","volume-title":"Proceedings of the Fifth Annual Workshop on Computational Learning Theory","author":"Seung H. S.","unstructured":"H. S. Seung, M. Opper, and H. Sompolinsky. 1992. Query by committee. In Proceedings of the Fifth Annual Workshop on Computational Learning Theory (Pittsburgh, Pennsylvania, USA) (COLT '92). Association for Computing Machinery, New York, NY, USA, 287-294."},{"key":"e_1_3_2_1_36_1","volume-title":"ICLR 2023 Workshop on Multimodal Representation Learning: Perks and Pitfalls.","author":"Shi Peiyang","year":"2023","unstructured":"Peiyang Shi, Michael C. Welle, Mra, rten Bj\u00f6rkman, and Danica Kragic. 2023. Towards understanding the modality gap in CLIP. In ICLR 2023 Workshop on Multimodal Representation Learning: Perks and Pitfalls."},{"key":"e_1_3_2_1_37_1","first-page":"14274","volume-title":"Oh (Eds.)","volume":"35","author":"Shu Manli","year":"2022","unstructured":"Manli Shu, Weili Nie, De-An Huang, Zhiding Yu, Tom Goldstein, Anima Anandkumar, and Chaowei Xiao. 2022. Test-Time Prompt Tuning for Zero-Shot Generalization in Vision-Language Models. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 14274-14289."},{"key":"e_1_3_2_1_38_1","first-page":"1","article-title":"A dataset of 101 human action classes from videos in the wild","volume":"2","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. A dataset of 101 human action classes from videos in the wild. Center for Research in Computer Vision, Vol. 2, 11 (2012), 1-7.","journal-title":"Center for Research in Computer Vision"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00090"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611988"},{"key":"e_1_3_2_1_41_1","volume-title":"Focus Small. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 9718-9729","author":"Wang Ao","year":"2025","unstructured":"Ao Wang, Hui Chen, Zijia Lin, Jungong Han, and Guiguang Ding. 2025. LSNet: See Large, Focus Small. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 9718-9729."},{"volume-title":"Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. dtextquotesingle Alch\u00e9-Buc","author":"Wang Haohan","key":"e_1_3_2_1_42_1","unstructured":"Haohan Wang, Songwei Ge, Zachary Lipton, and Eric P Xing. 2019. Learning Robust Global Representations by Penalizing Local Predictive Power. In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. dtextquotesingle Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.), Vol. 32. Curran Associates, Inc."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2017.2710978"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01846"},{"key":"e_1_3_2_1_46_1","volume-title":"PYRA: Parallel Yielding Re-activation for Training-Inference Efficient Task Adaptation. In Computer Vision - ECCV","author":"Xiong Yizhe","year":"2025","unstructured":"Yizhe Xiong, Hui Chen, Tianxiang Hao, Zijia Lin, Jungong Han, Yuesong Zhang, Guoxin Wang, Yongjun Bao, and Guiguang Ding. 2025. PYRA: Parallel Yielding Re-activation for Training-Inference Efficient Task Adaptation. In Computer Vision - ECCV 2024. Springer Nature Switzerland, Cham, 455-473."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01067"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01049"},{"key":"e_1_3_2_1_49_1","volume-title":"Dual Prototype Evolving for Test-Time Generalization of Vision-Language Models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Zhang Ce","year":"2024","unstructured":"Ce Zhang, Simon Stepputtis, Katia P. Sycara, and Yaqi Xie. 2024a. Dual Prototype Evolving for Test-Time Generalization of Vision-Language Models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_2_1_51_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Zhang Taolin","year":"2024","unstructured":"Taolin Zhang, Jinpeng Wang, Hang Guo, Tao Dai, Bin Chen, and Shu-Tao Xia. 2024b. BoostAdapter: Improving Vision-Language Test-Time Adaptation via Regional Bootstrapping. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02713"},{"key":"e_1_3_2_1_53_1","volume-title":"Test-Time Adaptation with CLIP Reward for Zero-Shot Generalization in Vision-Language Models. In The Twelfth International Conference on Learning Representations.","author":"Zhao Shuai","year":"2024","unstructured":"Shuai Zhao, Xiaohan Wang, Linchao Zhu, and Yi Yang. 2024. Test-Time Adaptation with CLIP Reward for Zero-Shot Generalization in Vision-Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680885"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755607","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:00:41Z","timestamp":1765342841000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755607"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":56,"alternative-id":["10.1145\/3746027.3755607","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755607","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}