{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T04:06:59Z","timestamp":1778126819930,"version":"3.51.4"},"reference-count":50,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,7,21]],"date-time":"2025-07-21T00:00:00Z","timestamp":1753056000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100004826","name":"Natural Science Foundation of Beijing Municipality","doi-asserted-by":"publisher","award":["L25700"],"award-info":[{"award-number":["L25700"]}],"id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["E2ET1104"],"award-info":[{"award-number":["E2ET1104"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272438"],"award-info":[{"award-number":["62272438"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.patcog.2025.112170","type":"journal-article","created":{"date-parts":[[2025,7,21]],"date-time":"2025-07-21T16:43:40Z","timestamp":1753116220000},"page":"112170","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":5,"special_numbering":"PA","title":["RETTA: Retrieval-enhanced test-time adaptation for zero-shot video captioning"],"prefix":"10.1016","volume":"171","author":[{"given":"Yunchuan","family":"Ma","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9923-5034","authenticated-orcid":false,"given":"Laiyun","family":"Qing","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3954-2387","authenticated-orcid":false,"given":"Guorong","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yuankai","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Amin","family":"Beheshti","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3326-4147","authenticated-orcid":false,"given":"Quan Z.","family":"Sheng","sequence":"additional","affiliation":[]},{"given":"Qingming","family":"Huang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2025.112170_bib0001","series-title":"KDD","first-page":"2744","article-title":"Comprehensive information integration modeling framework for video titling","author":"Zhang","year":"2020"},{"key":"10.1016\/j.patcog.2025.112170_bib0002","article-title":"Wenlan: bridging vision and language by large-Scale multi-Modal pre-Training","volume":"abs\/2103.06561","author":"Huo","year":"2021","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2025.112170_bib0003","series-title":"ACM MM","first-page":"3234","article-title":"Search-oriented micro-video captioning","author":"Nie","year":"2022"},{"key":"10.1016\/j.patcog.2025.112170_bib0004","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111138","article-title":"Fully exploring object relation interaction and hidden state attention for video captioning","volume":"159","author":"Yuan","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109906","article-title":"Global semantic enhancement network for video captioning","volume":"145","author":"Luo","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0006","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109202","article-title":"A multi-layer memory sharing network for video captioning","volume":"136","author":"Niu","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0007","series-title":"ACL","first-page":"11908","article-title":"MultiCapCLIP: auto-encoding prompts for zero-shot multilingual visual captioning","author":"Yang","year":"2023"},{"key":"10.1016\/j.patcog.2025.112170_bib0008","series-title":"ICLR","article-title":"DeCap: decoding CLIP latents for zero-shot captioning via text-only training","author":"Li","year":"2023"},{"key":"10.1016\/j.patcog.2025.112170_bib0009","series-title":"CVPR","first-page":"14100","article-title":"MeaCap: memory-augmented zero-shot image captioning","author":"Zeng","year":"2024"},{"key":"10.1016\/j.patcog.2025.112170_bib0010","series-title":"AAAI","first-page":"22308","article-title":"Zero-shot image captioning with multi-type entity representations","author":"Zeng","year":"2025"},{"key":"10.1016\/j.patcog.2025.112170_bib0011","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.126906","article-title":"EIKA: Explicit & implicit knowledge-augmented network for entity-aware sports video captioning","volume":"274","author":"Xi","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.patcog.2025.112170_bib0012","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.129177","article-title":"A simple yet effective knowledge guided method for entity-aware video captioning on a basketball benchmark","volume":"619","author":"Xi","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patcog.2025.112170_bib0013","doi-asserted-by":"crossref","first-page":"1316","DOI":"10.1162\/tacl_a_00605","article-title":"In-context retrieval-augmented language models","volume":"11","author":"Ram","year":"2023","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"10.1016\/j.patcog.2025.112170_bib0014","series-title":"ACL","first-page":"9147","article-title":"Few-shot transfer learning for knowledge base question answering: fusing supervised models with in-context learning","author":"Patidar","year":"2024"},{"key":"10.1016\/j.patcog.2025.112170_bib0015","series-title":"CVPR","first-page":"27134","article-title":"MV-adapter: multimodal video transfer learning for video text retrieval","author":"Jin","year":"2024"},{"key":"10.1016\/j.patcog.2025.112170_bib0016","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111144","article-title":"Cross-modal adapter for vision\u2013language retrieval","volume":"159","author":"Jiang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0017","series-title":"ACL","first-page":"8086","article-title":"Fantastically ordered prompts and where to find them: overcoming few-shot prompt order sensitivity","author":"Lu","year":"2022"},{"key":"10.1016\/j.patcog.2025.112170_bib0018","series-title":"NAACL","first-page":"2300","article-title":"Do prompt-based models really understand the meaning of their prompts?","author":"Webson","year":"2022"},{"key":"10.1016\/j.patcog.2025.112170_bib0019","series-title":"NeurIPS","article-title":"Test-time prompt tuning for zero-shot generalization in vision-language models","author":"Shu","year":"2022"},{"key":"10.1016\/j.patcog.2025.112170_bib0020","series-title":"ACM MM","first-page":"5796","article-title":"VPA: fully test-time visual prompt adaptation","author":"Sun","year":"2023"},{"key":"10.1016\/j.patcog.2025.112170_bib0021","series-title":"CVPR","first-page":"9837","article-title":"Open-book video captioning with retrieve-copy-generate network","author":"Zhang","year":"2021"},{"key":"10.1016\/j.patcog.2025.112170_bib0022","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111176","article-title":"Pseudo-labeling with keyword refining for few-supervised video captioning","volume":"159","author":"Li","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0023","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107702","article-title":"Enhancing the alignment between target words and corresponding frames for video captioning","volume":"111","author":"Tu","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0024","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109204","article-title":"Relation-aware attention for video captioning via graph learning","volume":"136","author":"Tu","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0025","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022","first-page":"17897","article-title":"ZeroCap: zero-shot image-to-text generation for visual-semantic arithmetic","author":"Tewel","year":"2022"},{"key":"10.1016\/j.patcog.2025.112170_bib0026","article-title":"Zero-shot video captioning with evolving pseudo-tokens","volume":"abs\/2207.11100","author":"Tewel","year":"2022","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2025.112170_bib0027","article-title":"Language models can see: plugging visual controls in text generation","volume":"abs\/2205.02655","author":"Su","year":"2022","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2025.112170_bib0028","series-title":"ICLR","article-title":"When and why vision-language models behave like bags-Of-words, and what to do about it?","author":"Y\u00fcksekg\u00f6n\u00fcl","year":"2023"},{"key":"10.1016\/j.patcog.2025.112170_bib0029","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111099","article-title":"Text\u2013video retrieval re-ranking via multi-grained cross attention and frozen image encoders","volume":"159","author":"Dai","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112170_bib0030","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","article-title":"CLIP4Clip: an empirical study of CLIP for end to end video clip retrieval and captioning","volume":"508","author":"Luo","year":"2022","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patcog.2025.112170_bib0031","series-title":"ECCV","first-page":"1","article-title":"Expanding language-image pretrained models for general video recognition","volume":"13664","author":"Ni","year":"2022"},{"key":"10.1016\/j.patcog.2025.112170_bib0032","article-title":"REALM: retrieval-augmented language model pre-training","volume":"abs\/2002.08909","author":"Guu","year":"2020","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2025.112170_bib0033","series-title":"ACL","first-page":"6086","article-title":"Latent retrieval for weakly supervised open domain question answering","author":"Lee","year":"2019"},{"key":"10.1016\/j.patcog.2025.112170_bib0034","series-title":"EMNLP","first-page":"6769","article-title":"Dense passage retrieval for open-domain question answering","author":"Karpukhin","year":"2020"},{"key":"10.1016\/j.patcog.2025.112170_bib0035","article-title":"RECOMP: improving retrieval-augmented LMs with compression and selective augmentation","volume":"abs\/2310.04408","author":"Xu","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2025.112170_bib0036","series-title":"ICML","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"issue":"8","key":"10.1016\/j.patcog.2025.112170_bib0037","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"10.1016\/j.patcog.2025.112170_bib0038","series-title":"ACL","first-page":"1825","article-title":"AoE: angle-optimized embeddings for semantic textual similarity","author":"Li","year":"2024"},{"key":"10.1016\/j.patcog.2025.112170_bib0039","article-title":"Zero-shot dense video captioning by jointly optimizing text and moment","volume":"abs\/2307.02682","author":"Jo","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2025.112170_bib0040","series-title":"EMNLP","first-page":"1823","article-title":"CommonGen: a constrained text generation challenge for generative commonsense reasoning","volume":"EMNLP 2020","author":"Lin","year":"2020"},{"key":"10.1016\/j.patcog.2025.112170_bib0041","series-title":"CVPR","first-page":"5288","article-title":"MSR-VTT: a large video description dataset for bridging video and language","author":"Xu","year":"2016"},{"key":"10.1016\/j.patcog.2025.112170_bib0042","series-title":"ACL","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"Chen","year":"2011"},{"key":"10.1016\/j.patcog.2025.112170_bib0043","series-title":"CVPR","first-page":"4581","article-title":"Vatex: a large-scale, high-quality multilingual dataset for video-and-language research","author":"Wang","year":"2019"},{"key":"10.1016\/j.patcog.2025.112170_bib0044","series-title":"ECAI","first-page":"1079","article-title":"Delving deeper into the decoder for video captioning","volume":"325","author":"Chen","year":"2020"},{"key":"10.1016\/j.patcog.2025.112170_bib0045","series-title":"WACV","first-page":"3038","article-title":"Improving video captioning with temporal composition of a visual-syntactic embedding*","author":"Perez-Martin","year":"2021"},{"key":"10.1016\/j.patcog.2025.112170_bib0046","series-title":"CVPR","first-page":"17918","article-title":"Hierarchical modular network for video captioning","author":"Ye","year":"2022"},{"key":"10.1016\/j.patcog.2025.112170_bib0047","series-title":"EMNLP","first-page":"543","article-title":"Video-LLaMA: an instruction-tuned audio-visual language model for video understanding","author":"Zhang","year":"2023"},{"issue":"2","key":"10.1016\/j.patcog.2025.112170_bib0048","doi-asserted-by":"crossref","first-page":"63:1","DOI":"10.1145\/3546828","article-title":"Learning video-text aligned representations for video captioning","volume":"19","author":"Shi","year":"2023","journal-title":"TOMM"},{"key":"10.1016\/j.patcog.2025.112170_bib0049","article-title":"Microsoft COCO captions: data collection and evaluation server","volume":"abs\/1504.00325","author":"Chen","year":"2015","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2025.112170_bib0050","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.126296","article-title":"Crocaps: a CLIP-assisted cross-domain video captioner","volume":"268","author":"Xu","year":"2025","journal-title":"Expert Syst. Appl."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325008313?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325008313?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T04:28:28Z","timestamp":1777264108000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320325008313"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":50,"alternative-id":["S0031320325008313"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2025.112170","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"RETTA: Retrieval-enhanced test-time adaptation for zero-shot video captioning","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2025.112170","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 The Author(s). Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"112170"}}