{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T11:34:09Z","timestamp":1774179249629,"version":"3.50.1"},"reference-count":40,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100013072","name":"Major Science and Technology Project of Hainan Province","doi-asserted-by":"publisher","award":["2021HZ022007"],"award-info":[{"award-number":["2021HZ022007"]}],"id":[{"id":"10.13039\/501100013072","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFB3600503"],"award-info":[{"award-number":["2021YFB3600503"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003392","name":"Natural Science Foundation of Fujian Province","doi-asserted-by":"publisher","award":["2020J01494"],"award-info":[{"award-number":["2020J01494"]}],"id":[{"id":"10.13039\/501100003392","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003392","name":"Natural Science Foundation of Fujian Province","doi-asserted-by":"publisher","award":["2021J01612"],"award-info":[{"award-number":["2021J01612"]}],"id":[{"id":"10.13039\/501100003392","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61972097"],"award-info":[{"award-number":["61972097"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21A20472"],"award-info":[{"award-number":["U21A20472"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1016\/j.patcog.2025.111663","type":"journal-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T11:54:21Z","timestamp":1744199661000},"page":"111663","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":5,"special_numbering":"C","title":["Language\u2013Image Consistency Augmentation and Distillation Network for visual grounding"],"prefix":"10.1016","volume":"166","author":[{"given":"Xiao","family":"Ke","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9500-9499","authenticated-orcid":false,"given":"Peirong","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Wenzhong","family":"Guo","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2025.111663_b1","article-title":"UniRTL: A universal RGBT and low-light benchmark for object tracking","volume":"158","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.111663_b2","doi-asserted-by":"crossref","first-page":"3937","DOI":"10.1109\/TMM.2023.3318289","article-title":"Inexactly matched referring expression comprehension with rationale","volume":"26","author":"Li","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2025.111663_b3","doi-asserted-by":"crossref","unstructured":"J. Yang, J. Duan, S. Tran, Y. Xu, S. Chanda, L. Chen, B. Zeng, T. Chilimbi, J. Huang, Vision-language pre-training with triple contrastive learning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 15671\u201315680.","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"10.1016\/j.patcog.2025.111663_b4","doi-asserted-by":"crossref","unstructured":"H. Tan, M. Bansal, LXMERT: Learning Cross-Modality Encoder Representations from Transformers, in: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), 2019, pp. 5100\u20135111.","DOI":"10.18653\/v1\/D19-1514"},{"key":"10.1016\/j.patcog.2025.111663_b5","series-title":"European Conference on Computer Vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.patcog.2025.111663_b6","series-title":"International Conference on Machine Learning","first-page":"23318","article-title":"Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","author":"Wang","year":"2022"},{"key":"10.1016\/j.patcog.2025.111663_b7","series-title":"International Conference on Machine Learning","first-page":"38728","article-title":"Mplug-2: A modularized multi-modal foundation model across text, image and video","author":"Xu","year":"2023"},{"key":"10.1016\/j.patcog.2025.111663_b8","first-page":"32942","article-title":"Coarse-to-fine vision-language pre-training with fusion in the backbone","volume":"35","author":"Dou","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2025.111663_b9","doi-asserted-by":"crossref","unstructured":"F. Liu, S. Ge, X. Wu, Competence-based Multimodal Curriculum Learning for Medical Report Generation, in: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), 2021, pp. 3001\u20133012.","DOI":"10.18653\/v1\/2021.acl-long.234"},{"key":"10.1016\/j.patcog.2025.111663_b10","article-title":"GADNet: Improving image\u2013text matching via graph-based aggregation and disentanglement","volume":"157","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.111663_b11","article-title":"VLCDoC: Vision-language contrastive pre-training model for cross-Modal document classification","volume":"139","year":"2023","journal-title":"Pattern Recognit."},{"issue":"8","key":"10.1016\/j.patcog.2025.111663_b12","doi-asserted-by":"crossref","first-page":"2765","DOI":"10.1109\/TPAMI.2020.2973983","article-title":"Relationship-embedded representation learning for grounding referring expressions","volume":"43","author":"Yang","year":"2020","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2025.111663_b13","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2025.111663_b14","article-title":"Vlt: Vision-language transformer and query generation for referring segmentation","author":"Ding","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2025.111663_b15","doi-asserted-by":"crossref","first-page":"4334","DOI":"10.1109\/TMM.2023.3321501","article-title":"CLIP-VG: Self-paced curriculum adapting of CLIP for visual grounding","volume":"26","author":"Xiao","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2025.111663_b16","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110719","article-title":"Token-word mixer meets object-aware transformer for referring image segmentation","volume":"155","author":"Zhang","year":"2024","journal-title":"Pattern Recognit."},{"issue":"11","key":"10.1016\/j.patcog.2025.111663_b17","doi-asserted-by":"crossref","first-page":"13636","DOI":"10.1109\/TPAMI.2023.3296823","article-title":"TransVG++: End-to-end visual grounding with language conditioned vision transformer","volume":"45","author":"Deng","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2025.111663_b18","series-title":"European Conference on Computer Vision","first-page":"3","article-title":"Yoro-lightweight end to end visual grounding","author":"Ho","year":"2022"},{"key":"10.1016\/j.patcog.2025.111663_b19","doi-asserted-by":"crossref","unstructured":"S. Liu, S. Huang, F. Li, H. Zhang, Y. Liang, H. Su, J. Zhu, L. Zhang, Dq-detr: Dual query detection transformer for phrase extraction and grounding, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 37, 2023, pp. 1728\u20131736.","DOI":"10.1609\/aaai.v37i2.25261"},{"key":"10.1016\/j.patcog.2025.111663_b20","doi-asserted-by":"crossref","unstructured":"A. Kamath, M. Singh, Y. LeCun, G. Synnaeve, I. Misra, N. Carion, Mdetr-modulated detection for end-to-end multi-modal understanding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 1780\u20131790.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"10.1016\/j.patcog.2025.111663_b21","doi-asserted-by":"crossref","unstructured":"Z.-Y. Dou, Y. Xu, Z. Gan, J. Wang, S. Wang, L. Wang, C. Zhu, P. Zhang, L. Yuan, N. Peng, et al., An empirical study of training end-to-end vision-and-language transformers, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 18166\u201318176.","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"10.1016\/j.patcog.2025.111663_b22","doi-asserted-by":"crossref","unstructured":"J. Liu, H. Ding, Z. Cai, Y. Zhang, R.K. Satzoda, V. Mahadevan, R. Manmatha, PolyFormer: Referring image segmentation as sequential polygon generation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 18653\u201318663.","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"10.1016\/j.patcog.2025.111663_b23","first-page":"19652","article-title":"Referring transformer: A one-step approach to multi-task visual grounding","volume":"34","author":"Li","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2025.111663_b24","first-page":"121475","article-title":"Cogvlm: Visual expert for pretrained language models","volume":"37","author":"Wang","year":"2025","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2025.111663_b25","doi-asserted-by":"crossref","unstructured":"A. Shrivastava, A. Gupta, R. Girshick, Training region-based object detectors with online hard example mining, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 761\u2013769.","DOI":"10.1109\/CVPR.2016.89"},{"key":"10.1016\/j.patcog.2025.111663_b26","series-title":"International Conference on Machine Learning","first-page":"794","article-title":"Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks","author":"Chen","year":"2018"},{"key":"10.1016\/j.patcog.2025.111663_b27","first-page":"1","article-title":"Penalizing the hard example but not too much: A strong baseline for fine-grained visual classification","author":"Liang","year":"2022","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.patcog.2025.111663_b28","doi-asserted-by":"crossref","unstructured":"P. Bhat, E. Arani, B. Zonooz, Distill on the go: Online knowledge distillation in self-supervised learning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 2678\u20132687.","DOI":"10.1109\/CVPRW53098.2021.00301"},{"key":"10.1016\/j.patcog.2025.111663_b29","doi-asserted-by":"crossref","unstructured":"L. Zhang, J. Song, A. Gao, J. Chen, C. Bao, K. Ma, Be your own teacher: Improve the performance of convolutional neural networks via self distillation, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 3713\u20133722.","DOI":"10.1109\/ICCV.2019.00381"},{"key":"10.1016\/j.patcog.2025.111663_b30","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110754","article-title":"Text-guided distillation learning to diversify video embeddings for text-video retrieval","volume":"156","author":"Lee","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.111663_b31","doi-asserted-by":"crossref","unstructured":"S.T. Wasim, M. Naseer, S. Khan, F.S. Khan, M. Shah, Vita-CLIP: Video and text adaptive CLIP via Multimodal Prompting, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 23034\u201323044.","DOI":"10.1109\/CVPR52729.2023.02206"},{"key":"10.1016\/j.patcog.2025.111663_b32","series-title":"International Conference on Machine Learning","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.patcog.2025.111663_b33","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, the Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14","first-page":"792","article-title":"Modeling context between objects for referring expression understanding","author":"Nagaraja","year":"2016"},{"key":"10.1016\/j.patcog.2025.111663_b34","doi-asserted-by":"crossref","unstructured":"X. Liu, Z. Wang, J. Shao, X. Wang, H. Li, Improving referring expression grounding with cross-modal attention-guided erasing, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 1950\u20131959.","DOI":"10.1109\/CVPR.2019.00205"},{"issue":"2","key":"10.1016\/j.patcog.2025.111663_b35","doi-asserted-by":"crossref","first-page":"1523","DOI":"10.1109\/TNNLS.2022.3183827","article-title":"Word2pix: Word to pixel cross-attention transformer in visual grounding","volume":"35","author":"Zhao","year":"2022","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.patcog.2025.111663_b36","doi-asserted-by":"crossref","first-page":"8805","DOI":"10.1109\/TMM.2023.3241802","article-title":"Multiple relational learning network for joint referring expression comprehension and segmentation","volume":"25","author":"Hua","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2025.111663_b37","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110084","article-title":"MPCCT: Multimodal vision-language learning paradigm with context-based compact transformer","volume":"147","author":"Chen","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2025.111663_b38","doi-asserted-by":"crossref","unstructured":"Y. Han, Y. Hu, X. Song, H. Tang, M. Xu, L. Nie, Exploiting the Social-Like Prior in Transformer for Visual Reasoning, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38, 2024, pp. 2058\u20132066.","DOI":"10.1609\/aaai.v38i3.27977"},{"key":"10.1016\/j.patcog.2025.111663_b39","article-title":"LGR-NET: Language guided reasoning network for referring expression comprehension","author":"Lu","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2025.111663_b40","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123794","article-title":"SINet: Improving relational features in two-stage referring expression comprehension","volume":"251","author":"Guo","year":"2024","journal-title":"Expert Syst. Appl."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325003231?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320325003231?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,11,4]],"date-time":"2025-11-04T00:38:53Z","timestamp":1762216733000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320325003231"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":40,"alternative-id":["S0031320325003231"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2025.111663","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2025,10]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Language\u2013Image Consistency Augmentation and Distillation Network for visual grounding","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2025.111663","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"111663"}}