{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T11:32:34Z","timestamp":1774351954262,"version":"3.50.1"},"reference-count":51,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,9,25]],"date-time":"2025-09-25T00:00:00Z","timestamp":1758758400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.patcog.2025.112462","type":"journal-article","created":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T16:12:07Z","timestamp":1759162327000},"page":"112462","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":5,"special_numbering":"PB","title":["Parameter-efficient action planning with large language models for vision-and-language navigation"],"prefix":"10.1016","volume":"172","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3500-0898","authenticated-orcid":false,"given":"Bahram","family":"Mohammadi","sequence":"first","affiliation":[]},{"given":"Ehsan","family":"Abbasnejad","sequence":"additional","affiliation":[]},{"given":"Yuankai","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3027-8364","authenticated-orcid":false,"given":"Anton","family":"Van Den Hengel","sequence":"additional","affiliation":[]},{"given":"Javen Qinfeng","family":"Shi","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2025.112462_bib0001","series-title":"CVPR","article-title":"Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments","author":"Anderson","year":"2018"},{"key":"10.1016\/j.patcog.2025.112462_bib0002","series-title":"NeurIPS","first-page":"108208","article-title":"Vision-language navigation with energy-based policy","author":"Liu","year":"2025"},{"key":"10.1016\/j.patcog.2025.112462_bib0003","series-title":"ECCV","first-page":"260","article-title":"NavGPT-2: unleashing navigational reasoning capability for Large vision-language models","author":"Zhou","year":"2025"},{"key":"10.1016\/j.patcog.2025.112462_bib0004","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110511","article-title":"Memory-Adaptive vision-and-language navigation","author":"He","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112462_bib0005","series-title":"EMNLP","first-page":"4392","article-title":"Room-Across-Room: multilingual vision-and-language navigation with dense spatiotemporal grounding","author":"Ku","year":"2020"},{"key":"10.1016\/j.patcog.2025.112462_bib0006","series-title":"CVPR","article-title":"REVERIE: remote embodied visual referring expression in real indoor environments","author":"Qi","year":"2020"},{"key":"10.1016\/j.patcog.2025.112462_bib0007","series-title":"CVPR","first-page":"12689","article-title":"SOON: scenario oriented object navigation with graph-based exploration","author":"Zhu","year":"2021"},{"key":"10.1016\/j.patcog.2025.112462_bib0008","doi-asserted-by":"crossref","first-page":"7641","DOI":"10.1609\/aaai.v38i7.28597","article-title":"NavGPT: explicit reasoning in vision-and-language navigation with large language models","author":"Zhou","year":"2024","journal-title":"AAAI"},{"key":"10.1016\/j.patcog.2025.112462_bib0009","series-title":"ACL","first-page":"9796","article-title":"MapGPT: map-guided prompting with adaptive path planning for vision-and-language navigation","author":"Chen","year":"2024"},{"key":"10.1016\/j.patcog.2025.112462_bib0010","series-title":"TPAMI","first-page":"5945","article-title":"Navcot: boosting LLM-based vision-and-language navigation via learning disentangled reasoning","author":"Lin","year":"2025"},{"key":"10.1016\/j.patcog.2025.112462_bib0011","series-title":"NAACL","first-page":"950","article-title":"LangNav: language as a perceptual representation for navigation","author":"Pan","year":"2024"},{"key":"10.1016\/j.patcog.2025.112462_bib0012","series-title":"ICCV","first-page":"15758","article-title":"March in chat: interactive prompting for remote embodied referring expression","author":"Qiao","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0013","series-title":"NeurIPS","first-page":"1877","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"10.1016\/j.patcog.2025.112462_bib0014","series-title":"ECCV","first-page":"638","article-title":"Learning from unlabeled 3D environments for vision-and-language navigation","author":"Chen","year":"2022"},{"key":"10.1016\/j.patcog.2025.112462_bib0015","unstructured":"X.L. Li, P. Liang, Prefix-tuning: optimizing continuous prompts for generation, arXiv preprint arXiv: 2101.00190 (2021)."},{"key":"10.1016\/j.patcog.2025.112462_bib0016","series-title":"ICLR","article-title":"LoRA: low-rank adaptation of large language models","author":"Hu","year":"2022"},{"key":"10.1016\/j.patcog.2025.112462_bib0017","unstructured":"R. Rafailov, A. Sharma, E. Mitchell, C.D. Manning, S. Ermon, C. Finn, Direct preference optimization: your language model is secretly a reward model, NeurIPS (2024)."},{"key":"10.1016\/j.patcog.2025.112462_bib0018","series-title":"EMNLP","first-page":"3360","article-title":"Sub-instruction aware vision-and-language navigation","author":"Hong","year":"2020"},{"key":"10.1016\/j.patcog.2025.112462_bib0019","unstructured":"S. Chen, P.-L. Guhur, C. Schmid, I. Laptev, History aware multimodal transformer for vision-and-language navigation, NeurIPS (2021) 5834\u20135847."},{"key":"10.1016\/j.patcog.2025.112462_bib0020","series-title":"ICCV","first-page":"10873","article-title":"DREAMWALKER: mental planning for continuous vision-language navigation","author":"Wang","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0021","unstructured":"L. Zhong, C. Gao, Z. Ding, Y. Liao, H. Ma, S. Zhang, X. Zhou, S. Liu, Topv-nav: unlocking the top-view spatial reasoning potential of mllm for zero-shot object navigation, arXiv preprint arXiv: 2411.16425 (2024)."},{"key":"10.1016\/j.patcog.2025.112462_bib0022","unstructured":"M. Ahn, A. Brohan, N. Brown, Y. Chebotar, O. Cortes, B. David, C. Finn, C. Fu, K. Gopalakrishnan, K. Hausman, et al., Do as i can, not as i say: grounding language in robotic affordances, arXiv preprint arXiv: 2204.01691 (2022)."},{"key":"10.1016\/j.patcog.2025.112462_bib0023","series-title":"ICML","first-page":"9118","article-title":"Language models as zero-shot planners: extracting actionable knowledge for embodied agents","author":"Huang","year":"2022"},{"key":"10.1016\/j.patcog.2025.112462_bib0024","series-title":"CoRL","first-page":"1769","article-title":"Inner monologue: embodied reasoning through planning with language models","author":"Huang","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0025","series-title":"ICRA","first-page":"17380","article-title":"Discuss before moving: visual language navigation via multi-expert discussions","author":"Long","year":"2024"},{"key":"10.1016\/j.patcog.2025.112462_bib0026","article-title":"MA-FSAR: multimodal adaptation of CLIP for few-shot action recognition","author":"Xing","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112462_bib0027","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.112224","article-title":"IDEA: image description enhanced CLIP-adapter for image classification","author":"Ye","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112462_bib0028","article-title":"Layerlink: bridging remote sensing object detection and large vision models with efficient fine-tuning","author":"Zhu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.112462_bib0029","series-title":"CVPR","first-page":"16537","article-title":"Think global, act local: dual-scale graph transformer for vision-and-language navigation","author":"Chen","year":"2022"},{"key":"10.1016\/j.patcog.2025.112462_bib0030","series-title":"EMNLP","first-page":"5103","article-title":"Lxmert: learning cross-modality encoder representations from transformers","author":"Tan","year":"2019"},{"key":"10.1016\/j.patcog.2025.112462_bib0031","series-title":"CVPR","first-page":"7036","article-title":"Scene-intuitive agent for remote embodied visual grounding","author":"Lin","year":"2021"},{"key":"10.1016\/j.patcog.2025.112462_bib0032","series-title":"NAACL","first-page":"2","article-title":"Bert: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.patcog.2025.112462_bib0033","unstructured":"J. Lu, D. Batra, D. Parikh, S. Lee, Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks, NeurIPS (2019)."},{"key":"10.1016\/j.patcog.2025.112462_bib0034","series-title":"JMLR","first-page":"627","article-title":"A reduction of imitation learning and structured prediction to no-regret online learning","author":"Ross","year":"2011"},{"key":"10.1016\/j.patcog.2025.112462_bib0035","unstructured":"J. Wei, X. Wang, D. Schuurmans, M. Bosma, F. Xia, E. Chi, Q.V. Le, D. Zhou, et al., Chain-of-thought prompting elicits reasoning in large language models, NeurIPS (2022) 24824\u201324837."},{"key":"10.1016\/j.patcog.2025.112462_bib0036","series-title":"CVPR","first-page":"2955","article-title":"Open-vocabulary panoptic segmentation with text-to-image diffusion models","author":"Xu","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0037","series-title":"ICML","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume":"139","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2025.112462_bib0038","unstructured":"H. Touvron, L. Martin, K. Stone, P. Albert, A. Almahairi, Y. Babaei, N. Bashlykov, S. Batra, P. Bhargava, S. Bhosale, et al., Llama 2: open foundation and fine-tuned chat models, arXiv preprint arXiv: 2307.09288 (2023)."},{"key":"10.1016\/j.patcog.2025.112462_bib0039","series-title":"ICLR","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2018"},{"key":"10.1016\/j.patcog.2025.112462_bib0040","series-title":"ICLR","article-title":"An image is worth 16x16 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.patcog.2025.112462_bib0041","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"Imagenet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"IJCV"},{"key":"10.1016\/j.patcog.2025.112462_bib0042","series-title":"CVPR","first-page":"15418","article-title":"HOP: history-and-order aware pre-training for vision-and-language navigation","author":"Qiao","year":"2022"},{"key":"10.1016\/j.patcog.2025.112462_bib0043","series-title":"Proceedings of the 30th ACM International Conference on Multimedia","first-page":"4194","article-title":"Target-driven structured transformer planner for vision-language navigation","author":"Zhao","year":"2022"},{"key":"10.1016\/j.patcog.2025.112462_bib0044","series-title":"ICCV","first-page":"8317","article-title":"Learning vision-and-language navigation from YouTube videos","author":"Lin","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0045","series-title":"ICCV","first-page":"2737","article-title":"Bevbert: multimodal map pre-training for language-guided navigation","author":"An","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0046","series-title":"ICCV","first-page":"10968","article-title":"Bird\u2019s-eye-view scene graph for vision-language navigation","author":"Liu","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0047","series-title":"CVPR","first-page":"14911","article-title":"Adaptive zone-aware hierarchical planner for vision-language navigation","author":"Gao","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0048","series-title":"AAAI","first-page":"4269","article-title":"Augmented commonsense knowledge for remote object grounding","author":"Mohammadi","year":"2024"},{"key":"10.1016\/j.patcog.2025.112462_bib0049","series-title":"CVPR","first-page":"16317","article-title":"Volumetric environment representation for vision-language navigation","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2025.112462_bib0050","series-title":"CVPR","first-page":"2583","article-title":"KERM: knowledge enhanced reasoning for vision-and-language navigation","author":"Li","year":"2023"},{"key":"10.1016\/j.patcog.2025.112462_bib0051","series-title":"ICLR","article-title":"Bootstrapping language-guided navigation learning with self-refining data flywheel","author":"Wang","year":"2025"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325011252?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325011252?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T18:34:53Z","timestamp":1765218893000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320325011252"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":51,"alternative-id":["S0031320325011252"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2025.112462","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Parameter-efficient action planning with large language models for vision-and-language navigation","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2025.112462","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 The Author(s). Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"112462"}}