{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T14:32:55Z","timestamp":1777300375099,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792099","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:34Z","timestamp":1775771674000},"page":"1911-1922","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LoVR: A Benchmark for Long Video Retrieval in Multimodal Contexts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2963-2210","authenticated-orcid":false,"given":"Hao","family":"Liang","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China and Zhongguancun Academy, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5086-8285","authenticated-orcid":false,"given":"Qifeng","family":"Cai","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1837-9071","authenticated-orcid":false,"given":"Zhaoyang","family":"Han","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5579-8812","authenticated-orcid":false,"given":"Hejun","family":"Dong","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3009-2886","authenticated-orcid":false,"given":"Meiyi","family":"Qiang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3758-4335","authenticated-orcid":false,"given":"Ruichuan","family":"An","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8989-9662","authenticated-orcid":false,"given":"Quanqing","family":"Xu","sequence":"additional","affiliation":[{"name":"OceanBase, Ant Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-4677","authenticated-orcid":false,"given":"Bin","family":"Cui","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7532-5550","authenticated-orcid":false,"given":"Wentao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China and Zhongguancun Academy, Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_3_1","volume-title":"Andrea Madotto, Chen Wei, Tengyu Ma, Jiale Zhi, Jathushan Rajasegaran, Hanoona Rasheed, et al.","author":"Bolya Daniel","year":"2025","unstructured":"Daniel Bolya, Po-Yao Huang, Peize Sun, Jang Hyun Cho, Andrea Madotto, Chen Wei, Tengyu Ma, Jiale Zhi, Jathushan Rajasegaran, Hanoona Rasheed, et al., 2025. Perception encoder: The best visual embeddings are not at the output of the network. 
arXiv preprint arXiv:2504.13181 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"Auroracap: Efficient, performant video detailed captioning and a new benchmark. arXiv preprint arXiv:2410.03051","author":"Chai Wenhao","year":"2024","unstructured":"Wenhao Chai, Enxin Song, Yilun Du, Chenlin Meng, Vashisht Madhavan, Omer Bar-Tal, Jenq-Neng Hwang, Saining Xie, and Christopher D Manning. 2024. Auroracap: Efficient, performant video detailed captioning and a new benchmark. arXiv preprint arXiv:2410.03051 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_6_1","unstructured":"Junxiang Chen Wenbin Yao et al. 2025. Expertized Caption Auto-Enhancement for Video-Text Retrieval. arXiv preprint arXiv:2502.02885 (2025)."},{"key":"e_1_3_2_1_7_1","volume-title":"Multimodal Lengthy Videos Retrieval Framework and Evaluation Metric. arXiv preprint arXiv:2504.04572","author":"Eltahir Mohamed","year":"2025","unstructured":"Mohamed Eltahir, Osamah Sarraj, Mohammed Bremoo, Mohammed Khurd, Abdulrahman Alfrihidi, Taha Alshatiri, Mohammad Almatrafi, and Tanveer Hussain. 2025. Multimodal Lengthy Videos Retrieval Framework and Evaluation Metric. arXiv preprint arXiv:2504.04572 (2025)."},{"key":"e_1_3_2_1_8_1","volume-title":"SPECTRUM: Semantic Processing and Emotion-informed video-Captioning Through Retrieval and Understanding Modalities. arXiv preprint arXiv:2411.01975","author":"Faghihi Ehsan","year":"2024","unstructured":"Ehsan Faghihi, Mohammedreza Zarenejad, and Ali-Asghar Beheshti Shirazi. 2024. SPECTRUM: Semantic Processing and Emotion-informed video-Captioning Through Retrieval and Understanding Modalities. arXiv preprint arXiv:2411.01975 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.105171"},{"key":"e_1_3_2_1_10_1","unstructured":"Zijian Gao Jingyu Liu Weiqi Sun Sheng Chen Dedan Chang and Lili Zhao. 2022. CLIP2TV: Align Match and Distill for Video-Text Retrieval. arXiv:2111.05610 [cs.CV] https:\/\/arxiv.org\/abs\/2111.05610"},{"key":"e_1_3_2_1_11_1","volume-title":"V2PE: Improving Multimodal Long-Context Capability of Vision-Language Models with Variable Visual Position Encoding. arXiv preprint arXiv:2412.09616","author":"Ge Junqi","year":"2024","unstructured":"Junqi Ge, Ziyi Chen, Jintao Lin, Jinguo Zhu, Xihui Liu, Jifeng Dai, and Xizhou Zhu. 2024. V2PE: Improving Multimodal Long-Context Capability of Vision-Language Models with Variable Visual Position Encoding. arXiv preprint arXiv:2412.09616 (2024)."},{"key":"e_1_3_2_1_12_1","unstructured":"Ning Han Jingjing Chen Chuhao Shi Yawen Zeng Guangyi Xiao and Hao Chen. 2022. BiC-Net: Learning Efficient Spatio-Temporal Relation for Text-Video Retrieval. arXiv:2110.15609 [cs.CV] https:\/\/arxiv.org\/abs\/2110.15609"},{"key":"e_1_3_2_1_13_1","volume-title":"TGIF: A New Dataset and Benchmark on Animated GIF Description. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Li Yuncheng","year":"2016","unstructured":"Yuncheng Li, Yale Song, Liangliang Cao, Joel Tetreault, Larry Goldberg, Alejandro Jaimes, and Jiebo Luo. 2016. TGIF: A New Dataset and Benchmark on Animated GIF Description. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_14_1","volume-title":"EVQAScore: Efficient Video Question Answering Data Evaluation. arXiv preprint arXiv:2411.06908","author":"Liang Hao","year":"2024","unstructured":"Hao Liang, Zirong Chen, and Wentao Zhang. 2024. 
EVQAScore: Efficient Video Question Answering Data Evaluation. arXiv preprint arXiv:2411.06908 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Mm-embed: Universal multimodal retrieval with multimodal llms. arXiv preprint arXiv:2411.02571","author":"Lin Sheng-Chieh","year":"2024","unstructured":"Sheng-Chieh Lin, Chankyu Lee, Mohammad Shoeybi, Jimmy Lin, Bryan Catanzaro, and Wei Ping. 2024. Mm-embed: Universal multimodal retrieval with multimodal llms. arXiv preprint arXiv:2411.02571 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_24"},{"key":"e_1_3_2_1_17_1","unstructured":"Huaishao Luo Lei Ji Ming Zhong Yang Chen Wen Lei Nan Duan and Tianrui Li. 2021. CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval. arXiv:2104.08860 [cs.CV] https:\/\/arxiv.org\/abs\/2104.08860"},{"key":"e_1_3_2_1_18_1","volume-title":"Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension. arXiv preprint arXiv:2411.13093","author":"Luo Yongdong","year":"2024","unstructured":"Yongdong Luo, Xiawu Zheng, Xiao Yang, Guilin Li, Haojia Lin, Jinfa Huang, Jiayi Ji, Fei Chao, Jiebo Luo, and Rongrong Ji. 2024. Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension. arXiv preprint arXiv:2411.13093 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-10073-7"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_21_1","unstructured":"PySceneDetect. 2021. PySceneDetect. https:\/\/www.scenedetect.com\/"},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_23_1","volume-title":"International Journal of Computer Vision","author":"Rohrbach Anna","year":"2017","unstructured":"Anna Rohrbach, Atousa Torabi, Marcus Rohrbach, Niket Tandon, Chris Pal, Hugo Larochelle, Aaron Courville, and Bernt Schiele. 2017. Movie Description. International Journal of Computer Vision (2017)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_2_1_25_1","volume-title":"Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, et al.","author":"Tschannen Michael","year":"2025","unstructured":"Michael Tschannen, Alexey Gritsenko, Xiao Wang, Muhammad Ferjad Naeem, Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, et al., 2025. Siglip 2: Multilingual vision-language encoders with improved semantic understanding, localization, and dense features. arXiv preprint arXiv:2502.14786 (2025)."},{"key":"e_1_3_2_1_26_1","volume-title":"VideoCLIP-XL: Advancing Long Description Understanding for Video CLIP Models. arXiv preprint arXiv:2410.00741","author":"Wang Jiapeng","year":"2024","unstructured":"Jiapeng Wang, Chengyu Wang, Kunzhe Huang, Jun Huang, and Lianwen Jin. 2024b. VideoCLIP-XL: Advancing Long Description Understanding for Video CLIP Models. arXiv preprint arXiv:2410.00741 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Tarsier: Recipes for training and evaluating large video description models. 
arXiv preprint arXiv:2407.00634","author":"Wang Jiawei","year":"2024","unstructured":"Jiawei Wang, Liping Yuan, Yuchen Zhang, and Haomiao Sun. 2024c. Tarsier: Recipes for training and evaluating large video description models. arXiv preprint arXiv:2407.00634 (2024)."},{"key":"e_1_3_2_1_28_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_29_1","first-page":"28828","article-title":"Longvideobench: A benchmark for long-context interleaved video-language understanding","volume":"37","author":"Wu Haoning","year":"2024","unstructured":"Haoning Wu, Dongxu Li, Bei Chen, and Junnan Li. 2024. Longvideobench: A benchmark for long-context interleaved video-language understanding. Advances in Neural Information Processing Systems, Vol. 37 (2024), 28828-28857.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","volume-title":"Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer, and Christoph Feichtenhofer.","author":"Xu Hu","year":"2023","unstructured":"Hu Xu, Saining Xie, Xiaoqing Ellen Tan, Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer, and Christoph Feichtenhofer. 2023. Demystifying clip data. arXiv preprint arXiv:2309.16671 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_32_1","volume-title":"Fine-grained Video-Text Retrieval: A New Benchmark and Method. arXiv preprint arXiv:2501.00513","author":"Xu Yifan","year":"2024","unstructured":"Yifan Xu, Xinhao Li, Yichun Yang, Rui Huang, and Limin Wang. 2024. Fine-grained Video-Text Retrieval: A New Benchmark and Method. arXiv preprint arXiv:2501.00513 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_36_1","volume-title":"GME: Improving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint arXiv:2412.16855","author":"Zhang Xin","year":"2024","unstructured":"Xin Zhang, Yanzhao Zhang, Wen Xie, Mingxin Li, Ziqi Dai, Dingkun Long, Pengjun Xie, Meishan Zhang, Wenjie Li, and Min Zhang. 2024b. GME: Improving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint arXiv:2412.16855 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Mmicl: Empowering vision-language model with multi-modal in-context learning. arXiv preprint arXiv:2309.07915","author":"Zhao Haozhe","year":"2023","unstructured":"Haozhe Zhao, Zefan Cai, Shuzheng Si, Xiaojian Ma, Kaikai An, Liang Chen, Zixuan Liu, Sheng Wang, Wenjuan Han, and Baobao Chang. 2023. Mmicl: Empowering vision-language model with multi-modal in-context learning. arXiv preprint arXiv:2309.07915 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Video Editing for Video Retrieval. arXiv preprint arXiv:2402.02335","author":"Zhu Bin","year":"2024","unstructured":"Bin Zhu, Kevin Flanagan, Adriano Fragomeni, Michael Wray, and Dima Damen. 2024. Video Editing for Video Retrieval. 
arXiv preprint arXiv:2402.02335 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852","author":"Zhu Bin","year":"2023","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, et al., 2023c. Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-023-00267-8"},{"key":"e_1_3_2_1_41_1","unstructured":"Cunjuan Zhu Qi Jia Wei Chen Yanming Guo and Yu Liu. 2023b. Deep Learning for Video-Text Retrieval: a Review. arXiv:2302.12552 [cs.CV] https:\/\/arxiv.org\/abs\/2302.12552"}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3774904.3792099","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T13:33:26Z","timestamp":1777296806000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792099"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":41,"alternative-id":["10.1145\/3774904.3792099","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792099","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
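
The record above is a Crossref REST API work record (message-type "work"). As a minimal sketch of how such a record can be retrieved and read, the following Python assumes the third-party `requests` package is installed; the endpoint path follows the public Crossref convention `https://api.crossref.org/works/{DOI}`, and every field accessed below appears in the record itself.

import requests

# DOI of the work described above.
DOI = "10.1145/3774904.3792099"

# Crossref wraps the work record in a top-level "message" object.
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
msg = resp.json()["message"]

print(msg["title"][0])            # "LoVR: A Benchmark for Long Video Retrieval ..."
print(msg["container-title"][0])  # "Proceedings of the ACM Web Conference 2026"
print(msg["DOI"], msg["page"])    # 10.1145/3774904.3792099 1911-1922

# Authors are a list of objects with "given"/"family" names and optional ORCID iDs.
for a in msg.get("author", []):
    print(f'{a.get("given", "")} {a.get("family", "")}'.strip())

# The bibliography sits under "reference"; "references-count" (41 here)
# should match its length.
print(len(msg.get("reference", [])), msg.get("references-count"))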