{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T16:29:58Z","timestamp":1774283398075,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":80,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100014219","name":"National Science Fund for Distinguished Young Scholars","doi-asserted-by":"publisher","award":["No.62025603"],"award-info":[{"award-number":["No.62025603"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. U21B2037, No. U22B2051, No. U23A20383, No. U21A20472, No. 62176222, No. 62176223, No. 62176226, No. 62072386, No. 62072387, No. 62072389, No. 62002305 and No. 62272401"],"award-info":[{"award-number":["No. U21B2037, No. U22B2051, No. U23A20383, No. U21A20472, No. 62176222, No. 62176223, No. 62176226, No. 62072386, No. 62072387, No. 62072389, No. 62002305 and No. 62272401"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Natural Science Foundation of Fujian Province of China","award":["No. 2021J06003?No.2022J06001"],"award-info":[{"award-number":["No. 2021J06003?No.2022J06001"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["No. 2022ZD0118201"],"award-info":[{"award-number":["No. 2022ZD0118201"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681275","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"8062-8071","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Multimodal Inplace Prompt Tuning for Open-set Object Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6885-0261","authenticated-orcid":false,"given":"Guilin","family":"Li","sequence":"first","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2911-5369","authenticated-orcid":false,"given":"Mengdan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6855-5403","authenticated-orcid":false,"given":"Xiawu","family":"Zheng","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8015-5366","authenticated-orcid":false,"given":"Peixian","family":"Chen","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1026-6476","authenticated-orcid":false,"given":"Zihan","family":"Wang","sequence":"additional","affiliation":[{"name":"East China Normal University, shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3970-7519","authenticated-orcid":false,"given":"Yunhang","family":"Shen","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2561-7712","authenticated-orcid":false,"given":"Mingchen","family":"Zhuge","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, thuwal, Saudi Arabia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5035-3168","authenticated-orcid":false,"given":"Chenglin","family":"Wu","sequence":"additional","affiliation":[{"name":"DeepWisdom, xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6928-2638","authenticated-orcid":false,"given":"Fei","family":"Chao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7998-0731","authenticated-orcid":false,"given":"Ke","family":"Li","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8132-9083","authenticated-orcid":false,"given":"Xing","family":"Sun","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9163-2932","authenticated-orcid":false,"given":"Rongrong","family":"Ji","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, xiamen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"e_1_3_2_1_2_1","unstructured":"Arnav Chavan Zhuang Liu Deepak Gupta Eric Xing and Zhiqiang Shen. 2023. One-for-All: Generalized LoRA for Parameter-Efficient Fine-tuning. arxiv: 2306.07967 [cs.LG]"},{"key":"e_1_3_2_1_3_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Dettmers Tim","year":"2024","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2024. Qlora: Efficient finetuning of quantized llms. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_5_1","volume-title":"InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model. arXiv preprint arXiv:2401.16420","author":"Dong Xiaoyi","year":"2024","unstructured":"Xiaoyi Dong, Pan Zhang, Yuhang Zang, Yuhang Cao, Bin Wang, Linke Ouyang, Xilin Wei, Songyang Zhang, Haodong Duan, Maosong Cao, Wenwei Zhang, Yining Li, Hang Yan, Yang Gao, Xinyue Zhang, Wei Li, Jingwen Li, Kai Chen, Conghui He, Xingcheng Zhang, Yu Qiao, Dahua Lin, and Jiaqi Wang. 2024. InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model. arXiv preprint arXiv:2401.16420 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"InternLM-XComposer2--4KHD: A Pioneering Large Vision-Language Model Handling Resolutions from 336 Pixels to 4K HD. arXiv preprint arXiv:2404.06512","author":"Dong Xiaoyi","year":"2024","unstructured":"Xiaoyi Dong, Pan Zhang, Yuhang Zang, Yuhang Cao, Bin Wang, Linke Ouyang, Songyang Zhang, Haodong Duan, Wenwei Zhang, Yining Li, Hang Yan, Yang Gao, Zhe Chen, Xinyue Zhang, Wei Li, Jingwen Li, Wenhai Wang, Kai Chen, Conghui He, Xingcheng Zhang, Jifeng Dai, Yu Qiao, Dahua Lin, and Jiaqi Wang. 2024. InternLM-XComposer2--4KHD: A Pioneering Large Vision-Language Model Handling Resolutions from 336 Pixels to 4K HD. arXiv preprint arXiv:2404.06512 (2024)."},{"key":"e_1_3_2_1_7_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01282"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_14_1","volume-title":"International conference on machine learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International conference on machine learning. PMLR, 2790--2799."},{"key":"e_1_3_2_1_15_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"T-Rex2: Towards Generic Object Detection via Text-Visual Prompt Synergy. arXiv preprint arXiv:2403.14610","author":"Jiang Qing","year":"2024","unstructured":"Qing Jiang, Feng Li, Zhaoyang Zeng, Tianhe Ren, Shilong Liu, and Lei Zhang. 2024. T-Rex2: Towards Generic Object Detection via Text-Visual Prompt Synergy. arXiv preprint arXiv:2403.14610 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_1_18_1","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"34","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, James Henderson, and Sebastian Ruder. 2021. Compacter: Efficient low-rank hypercomplex adapter layers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 1022--1035.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"F-vlm: Open-vocabulary object detection upon frozen vision and language models. arXiv preprint arXiv:2209.15639","author":"Kuo Weicheng","year":"2022","unstructured":"Weicheng Kuo, Yin Cui, Xiuye Gu, AJ Piergiovanni, and Anelia Angelova. 2022. F-vlm: Open-vocabulary object detection upon frozen vision and language models. arXiv preprint arXiv:2209.15639 (2022)."},{"key":"e_1_3_2_1_20_1","volume-title":"The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691","author":"Lester Brian","year":"2021","unstructured":"Brian Lester, Rami Al-Rfou, and Noah Constant. 2021. The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)."},{"key":"e_1_3_2_1_21_1","first-page":"9287","article-title":"Elevater: A benchmark and toolkit for evaluating language-augmented visual models","volume":"35","author":"Li Chunyuan","year":"2022","unstructured":"Chunyuan Li, Haotian Liu, Liunian Li, Pengchuan Zhang, Jyoti Aneja, Jianwei Yang, Ping Jin, Houdong Hu, Zicheng Liu, Yong Jae Lee, et al. 2022. Elevater: A benchmark and toolkit for evaluating language-augmented visual models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 9287--9301.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_22_1","unstructured":"Feng Li Qing Jiang Hao Zhang Tianhe Ren Shilong Liu Xueyan Zou Huaizhe Xu Hongyang Li Chunyuan Li Jianwei Yang et al. 2023. Visual In-Context Prompting. arXiv preprint arXiv:2311.13601 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_1_24_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv","author":"Li Liunian Harold","year":"1908","unstructured":"Liunian Harold Li, Mark Yatskar, D Yin, CJ Hsieh, and KW Chang. 1908. Visualbert: A simple and performant baseline for vision and language. arXiv 2019. arXiv preprint arXiv:1908.03557, Vol. 3 (1908)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_27_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190","author":"Li Xiang Lisa","year":"2021","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_28_1","volume-title":"Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models. arXiv:2403.18814","author":"Li Yanwei","year":"2023","unstructured":"Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, and Jiaya Jia. 2023. Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models. arXiv:2403.18814 (2023)."},{"key":"e_1_3_2_1_29_1","first-page":"109","article-title":"Scaling & shifting your features: A new baseline for efficient model tuning","volume":"35","author":"Lian Dongze","year":"2022","unstructured":"Dongze Lian, Daquan Zhou, Jiashi Feng, and Xinchao Wang. 2022. Scaling & shifting your features: A new baseline for efficient model tuning. Advances in Neural Information Processing Systems, Vol. 35 (2022), 109--123.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"e_1_3_2_1_31_1","volume-title":"Generative Region-Language Pretraining for Open-Ended Object Detection. arXiv preprint arXiv:2403.10191","author":"Lin Chuang","year":"2024","unstructured":"Chuang Lin, Yi Jiang, Lizhen Qu, Zehuan Yuan, and Jianfei Cai. 2024. Generative Region-Language Pretraining for Open-Ended Object Detection. arXiv preprint arXiv:2403.10191 (2024)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_1_33_1","volume-title":"Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575","author":"Lin Ziyi","year":"2023","unstructured":"Ziyi Lin, Chris Liu, Renrui Zhang, Peng Gao, Longtian Qiu, Han Xiao, Han Qiu, Chen Lin, Wenqi Shao, Keqin Chen, et al. 2023. Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575 (2023)."},{"key":"e_1_3_2_1_34_1","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et al. 2023. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Zhengxiao Du, Zhilin Yang, and Jie Tang.","author":"Liu Xiao","year":"2021","unstructured":"Xiao Liu, Kaixuan Ji, Yicheng Fu, Weng Lam Tam, Zhengxiao Du, Zhilin Yang, and Jie Tang. 2021. P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602 (2021)."},{"key":"e_1_3_2_1_36_1","volume-title":"GPT understands, too. AI Open","author":"Liu Xiao","year":"2023","unstructured":"Xiao Liu, Yanan Zheng, Zhengxiao Du, Ming Ding, Yujie Qian, Zhilin Yang, and Jie Tang. 2023. GPT understands, too. AI Open (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_39_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Luo Gen","year":"2024","unstructured":"Gen Luo, Yiyi Zhou, Tianhe Ren, Shengxin Chen, Xiaoshuai Sun, and Rongrong Ji. 2024. Cheap and quick: Efficient vision-language instruction tuning for large language models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"Unipelt: A unified framework for parameter-efficient language model tuning. arXiv preprint arXiv:2110.07577","author":"Mao Yuning","year":"2021","unstructured":"Yuning Mao, Lambert Mathias, Rui Hou, Amjad Almahairi, Hao Ma, Jiawei Han, Wen-tau Yih, and Madian Khabsa. 2021. Unipelt: A unified framework for parameter-efficient language model tuning. arXiv preprint arXiv:2110.07577 (2021)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"M Minderer A Gritsenko A Stone M Neumann D Weissenborn A Dosovitskiy A Mahendran A Arnab M Dehghani Z Shen et al. 2022. Simple open-vocabulary object detection with vision transformers. arxiv 2022. arXiv preprint arXiv:2205.06230 Vol. 2 (2022).","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"e_1_3_2_1_43_1","volume-title":"Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning. arxiv: 2402.11435 [cs.CV]","author":"Qian Long","year":"2024","unstructured":"Long Qian, Juncheng Li, Yu Wu, Yaobo Ye, Hao Fei, Tat-Seng Chua, Yueting Zhuang, and Siliang Tang. 2024. Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning. arxiv: 2402.11435 [cs.CV]"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6868"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_2_1_46_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"e_1_3_2_1_48_1","volume-title":"Aligning and Prompting Everything All at Once for Universal Visual Perception. arXiv preprint arXiv:2312.02153","author":"Shen Yunhang","year":"2023","unstructured":"Yunhang Shen, Chaoyou Fu, Peixian Chen, Mengdan Zhang, Ke Li, Xing Sun, Yunsheng Wu, Shaohui Lin, and Rongrong Ji. 2023. Aligning and Prompting Everything All at Once for Universal Visual Perception. arXiv preprint arXiv:2312.02153 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"e_1_3_2_1_50_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_51_1","volume-title":"Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079","author":"Wang Weihan","year":"2023","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, et al. 2023. Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Weiyun Wang Yiming Ren Haowen Luo Tiantong Li Chenxiang Yan Zhe Chen Wenhai Wang Qingyun Li Lewei Lu Xizhou Zhu et al. 2024. The All-Seeing Project V2: Towards General Relation Comprehension of the Open World. arXiv preprint arXiv:2402.19474 (2024).","DOI":"10.1007\/978-3-031-73414-4_27"},{"key":"e_1_3_2_1_53_1","unstructured":"Weiyun Wang Min Shi Qingyun Li Wenhai Wang Zhenhang Huang Linjie Xing Zhe Chen Hao Li Xizhou Zhu Zhiguo Cao et al. 2023. The All-Seeing Project: Towards Panoptic Visual Recognition and Understanding of the Open World. arXiv preprint arXiv:2308.01907 (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02005"},{"key":"e_1_3_2_1_55_1","unstructured":"Jianzong Wu Xiangtai Li Shilin Xu Haobo Yuan Henghui Ding Yibo Yang Xia Li Jiangning Zhang Yunhai Tong Xudong Jiang et al. 2024. Towards open vocabulary learning: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"e_1_3_2_1_58_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Xie Chi","year":"2024","unstructured":"Chi Xie, Zhao Zhang, Yixuan Wu, Feng Zhu, Rui Zhao, and Shuang Liang. 2024. Described Object Detection: Liberating Object Detection with Flexible Expressions. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Xu Yifan","year":"2024","unstructured":"Yifan Xu, Mengdan Zhang, Chaoyou Fu, Peixian Chen, Xiaoshan Yang, Ke Li, and Changsheng Xu. 2024. Multi-modal queried object detection in the wild. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02250"},{"key":"e_1_3_2_1_62_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi Chaoya Jiang Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qi Qian Ji Zhang and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arxiv: 2304.14178 [cs.CL]"},{"key":"e_1_3_2_1_63_1","unstructured":"Qinghao Ye Haiyang Xu Jiabo Ye Ming Yan Anwen Hu Haowei Liu Qi Qian Ji Zhang Fei Huang and Jingren Zhou. 2023. mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration. arxiv: 2311.04257 [cs.CL]"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Jun Zhan Junqi Dai Jiasheng Ye Yunhua Zhou Dong Zhang Zhigeng Liu Xin Zhang Ruibin Yuan Ge Zhang Linyang Li et al. 2024. AnyGPT: Unified Multimodal LLM with Discrete Sequence Modeling. arXiv preprint arXiv:2402.12226 (2024).","DOI":"10.18653\/v1\/2024.acl-long.521"},{"key":"e_1_3_2_1_66_1","volume-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv","author":"Zhang H","year":"2022","unstructured":"H Zhang, F Li, S Liu, L Zhang, H Su, J Zhu, LM Ni, and HY Shum. 2022. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv 2022. arXiv preprint arXiv:2203.03605, Vol. 5 (2022)."},{"key":"e_1_3_2_1_67_1","first-page":"36067","article-title":"Glipv2: Unifying localization and vision-language understanding","volume":"35","author":"Zhang Haotian","year":"2022","unstructured":"Haotian Zhang, Pengchuan Zhang, Xiaowei Hu, Yen-Chun Chen, Liunian Li, Xiyang Dai, Lijuan Wang, Lu Yuan, Jenq-Neng Hwang, and Jianfeng Gao. 2022. Glipv2: Unifying localization and vision-language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36067--36080.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_68_1","volume-title":"InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition. arXiv preprint arXiv:2309.15112","author":"Zhang Pan","year":"2023","unstructured":"Pan Zhang, Xiaoyi Dong, Bin Wang, Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Wenwei Zhang, Hang Yan, Xinyue Zhang, Wei Li, Jingwen Li, Kai Chen, Conghui He, Xingcheng Zhang, Yu Qiao, Dahua Lin, and Jiaqi Wang. 2023. InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition. arXiv preprint arXiv:2309.15112 (2023)."},{"key":"e_1_3_2_1_69_1","volume-title":"Ideal: Influence-driven selective annotations empower in-context learners in large language models. arXiv preprint arXiv:2310.10873","author":"Zhang Shaokun","year":"2023","unstructured":"Shaokun Zhang, Xiaobo Xia, Zhaoqing Wang, Ling-Hao Chen, Jiale Liu, Qingyun Wu, and Tongliang Liu. 2023. Ideal: Influence-driven selective annotations empower in-context learners in large language models. arXiv preprint arXiv:2310.10873 (2023)."},{"key":"e_1_3_2_1_70_1","volume-title":"Training Language Model Agents without Modifying Language Models. arXiv preprint arXiv:2402.11359","author":"Zhang Shaokun","year":"2024","unstructured":"Shaokun Zhang, Jieyu Zhang, Jiale Liu, Linxin Song, Chi Wang, Ranjay Krishna, and Qingyun Wu. 2024. Training Language Model Agents without Modifying Language Models. arXiv preprint arXiv:2402.11359 (2024)."},{"key":"e_1_3_2_1_71_1","volume-title":"You only compress once: Towards effective and elastic bert compression via exploit-explore stochastic nature gradient. Neurocomputing","author":"Zhang Shaokun","year":"2024","unstructured":"Shaokun Zhang, Xiawu Zheng, Guilin Li, Chenyi Yang, Yuchao Li, Yan Wang, Fei Chao, Mengdi Wang, Shen Li, and Rongrong Ji. 2024. You only compress once: Towards effective and elastic bert compression via exploit-explore stochastic nature gradient. Neurocomputing (2024), 128140."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01753-6"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01393"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_35"},{"key":"e_1_3_2_1_78_1","volume-title":"Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"e_1_3_2_1_80_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zou Xueyan","year":"2024","unstructured":"Xueyan Zou, Jianwei Yang, Hao Zhang, Feng Li, Linjie Li, Jianfeng Wang, Lijuan Wang, Jianfeng Gao, and Yong Jae Lee. 2024. Segment everything everywhere all at once. Advances in Neural Information Processing Systems, Vol. 36 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681275","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681275","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:43Z","timestamp":1750295863000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681275"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":80,"alternative-id":["10.1145\/3664647.3681275","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681275","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}