{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T17:47:53Z","timestamp":1763142473595,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3729950","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:18:36Z","timestamp":1752455916000},"page":"823-832","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Diffusion Augmented Retrieval: A Training-Free Approach to Interactive Text-to-Image Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2008-960X","authenticated-orcid":false,"given":"Zijun","family":"Long","sequence":"first","affiliation":[{"name":"Hunan University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1367-0606","authenticated-orcid":false,"given":"Kangheng","family":"Liang","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3756-5569","authenticated-orcid":false,"given":"Gerardo","family":"Aragon Camarasa","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2751-2087","authenticated-orcid":false,"given":"Richard","family":"Mccreadie","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5198-7445","authenticated-orcid":false,"given":"Paul","family":"Henderson","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Generative adversarial networks: An overview","author":"Creswell Antonia","year":"2018","unstructured":"Antonia Creswell, Tom White, Vincent Dumoulin, Kai Arulkumaran, Biswa Sengupta, and Anil A Bharath. 2018. Generative adversarial networks: An overview. IEEE signal processing magazine, Vol. 35, 1 (2018), 53-65."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2828437"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the International Conference on Machine Learning, ICML","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, Dustin Podell, Tim Dockhorn, Zion English, and Robin Rombach. 2024. Scaling Rectified Flow Transformers for High-Resolution Image Synthesis. In Proceedings of the International Conference on Machine Learning, ICML 2024."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475634"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2024.103716"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the Advances in neural information processing systems conference, NeurIPS","volume":"27","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. In Proceedings of the Advances in neural information processing systems conference, NeurIPS 2014, Vol. 27."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_9_1","volume-title":"A Review on Generative Adversarial Networks: Algorithms, Theory, and Applications. CoRR","author":"Gui Jie","year":"2020","unstructured":"Jie Gui, Zhenan Sun, Yonggang Wen, Dacheng Tao, and Jieping Ye. 2020. A Review on Generative Adversarial Networks: Algorithms, Theory, and Applications. CoRR, Vol. abs\/2001.06937 (2020)."},{"key":"e_1_3_2_1_10_1","first-page":"6840","volume-title":"Proceedings of the Advances in neural information processing systems conference, NeurIPS","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In Proceedings of the Advances in neural information processing systems conference, NeurIPS, Vol. 33. 6840-6851."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.48084\/etasr.7200"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.46"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems, NeurIPS","author":"Levy Matan","year":"2023","unstructured":"Matan Levy, Rami Ben-Ari, Nir Darshan, and Dani Lischinski. 2023. Chatting Makes Perfect: Chat-based Image Retrieval. In Proceedings of the Advances in Neural Information Processing Systems, NeurIPS, 2023."},{"key":"e_1_3_2_1_14_1","first-page":"12888","volume-title":"Proceedings of the International Conference on Machine Learning, ICML","volume":"162","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In Proceedings of the International Conference on Machine Learning, ICML 2022, Vol. 162. PMLR, 12888-12900."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.639"},{"key":"e_1_3_2_1_16_1","first-page":"740","volume-title":"Microsoft COCO: Common Objects in Context. In Proceedings of The European Conference on Computer Vision ECCV","volume":"8693","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Proceedings of The European Conference on Computer Vision ECCV 2014, Vol. 8693. Springer, 740-755."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657741"},{"key":"e_1_3_2_1_18_1","first-page":"6580","article-title":"Multiway-Adapter: Adapting Multimodal Large Language Models for Scalable Image-Text Retrieval. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Long Zijun","year":"2024","unstructured":"Zijun Long, George Killick, Richard McCreadie, and Gerardo Aragon Camarasa. 2024b. Multiway-Adapter: Adapting Multimodal Large Language Models for Scalable Image-Text Retrieval. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6580-6584.","journal-title":"IEEE"},{"key":"e_1_3_2_1_19_1","volume-title":"Understanding and Mitigating Human-Labelling Errors in Supervised Contrastive Learning. In European Conference on Computer Vision. Springer, 435-454","author":"Long Zijun","year":"2024","unstructured":"Zijun Long, Lipeng Zhuang, et al., 2024c. Understanding and Mitigating Human-Labelling Errors in Supervised Contrastive Learning. In European Conference on Computer Vision. Springer, 435-454."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.3233\/FAIA240691"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_20"},{"key":"e_1_3_2_1_22_1","first-page":"8748","volume-title":"Proceedings of the International Conference on Machine Learning, ICML","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the International Conference on Machine Learning, ICML 2021, Vol. 139. PMLR, 8748-8763."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of The International conference on machine learning ICML, PMLR, 8821-8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In Proceedings of The International conference on machine learning ICML, PMLR, 8821-8831."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of The Advances in Neural Information Processing Systems NeurIPS","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L. Denton, Seyed Kamyar Seyed Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, Jonathan Ho, David J. Fleet, and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In Proceedings of The Advances in Neural Information Processing Systems NeurIPS 2022,."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01850"},{"key":"e_1_3_2_1_27_1","first-page":"5679","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, WACV","author":"de Pablos Antonio","year":"2024","unstructured":"Antonio Tejero-de Pablos. 2024. Complementary-Contradictory Feature Regularization Against Multimodal Overfitting. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, WACV 2024. 5679-5688."},{"key":"e_1_3_2_1_28_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. CoRR","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aur\u00e9lien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. CoRR, Vol. abs\/2302.13971 (2023)."},{"key":"e_1_3_2_1_29_1","first-page":"8384","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR","author":"Wan Yongquan","year":"2024","unstructured":"Yongquan Wan, Wenhai Wang, Guobing Zou, and Bofeng Zhang. 2024. Cross-modal feature alignment and fusion for composed image retrieval. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024. 8384-8388."},{"key":"e_1_3_2_1_30_1","volume-title":"Saksham Singhal, Subhojit Som, and Furu Wei.","author":"Wang Wenhui","year":"2022","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, and Furu Wei. 2022. Image as a Foreign Language: BEiT Pretraining for All Vision and Vision-Language Tasks. CoRR, Vol. abs\/2208.10442 (2022)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2023.123618"},{"key":"e_1_3_2_1_33_1","volume-title":"A Survey of Resource-efficient LLM and Multimodal Foundation Models. CoRR","author":"Xu Mengwei","year":"2024","unstructured":"Mengwei Xu, Wangsong Yin, Dongqi Cai, Rongjie Yi, Daliang Xu, Qipeng Wang, Bingyang Wu, Yihao Zhao, Chen Yang, Shihe Wang, Qiyang Zhang, Zhenyan Lu, Li Zhang, Shangguang Wang, Yuanchun Li, Yunxin Liu, Xin Jin, and Xuanzhe Liu. 2024. A Survey of Resource-efficient LLM and Multimodal Foundation Models. CoRR, Vol. abs\/2401.08092 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Shelby Heinecke, Huan Wang, Yejin Choi, Ludwig Schmidt, Zeyuan Chen, Silvio Savarese, Juan Carlos Niebles, Caiming Xiong, and Ran Xu.","author":"Xue Le","year":"2024","unstructured":"Le Xue, Manli Shu, Anas Awadalla, Jun Wang, An Yan, Senthil Purushwalkam, Honglu Zhou, Viraj Prabhu, Yutong Dai, Michael S. Ryoo, Shrikant Kendre, Jieyu Zhang, Can Qin, Shu Zhang, Chia-Chih Chen, Ning Yu, Juntao Tan, Tulika Manoj Awalgaonkar, Shelby Heinecke, Huan Wang, Yejin Choi, Ludwig Schmidt, Zeyuan Chen, Silvio Savarese, Juan Carlos Niebles, Caiming Xiong, and Ran Xu. 2024. xGen-MM (BLIP-3): A Family of Open Large Multimodal Models. CoRR, Vol. abs\/2408.08872 (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3718099"},{"key":"e_1_3_2_1_36_1","first-page":"378","volume-title":"Proceedings of the The European Conference on Computer Vision ECCV","volume":"15089","author":"Yoon Hee Suk","year":"2024","unstructured":"Hee Suk Yoon, Eunseop Yoon, et al., 2024. BI-MDRG: Bridging Image History in Multimodal Dialogue Response Generation. In Proceedings of the The European Conference on Computer Vision ECCV 2024., Vol. 15089. Springer, 378-396."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462881"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of The Conference on Parsimony and Learning. 202-227","author":"Zhai Yuexiang","year":"2024","unstructured":"Yuexiang Zhai, Shengbang Tong, Xiao Li, Mu Cai, Qing Qu, Yong Jae Lee, and Yi Ma. 2024. Investigating the catastrophic forgetting in multimodal large language model fine-tuning. In Proceedings of The Conference on Parsimony and Learning. 202-227."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658032"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Padua Italy","acronym":"SIGIR '25"},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3729950","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:29:57Z","timestamp":1755887397000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3729950"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":39,"alternative-id":["10.1145\/3726302.3729950","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3729950","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}