{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:04:42Z","timestamp":1767985482976,"version":"3.49.0"},"reference-count":70,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1016\/j.knosys.2025.113531","type":"journal-article","created":{"date-parts":[[2025,5,16]],"date-time":"2025-05-16T02:30:26Z","timestamp":1747362626000},"page":"113531","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":5,"special_numbering":"C","title":["RGB-D visual object tracking with transformer-based multi-modal feature fusion"],"prefix":"10.1016","volume":"322","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1617-1325","authenticated-orcid":false,"given":"Long","family":"Gao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuze","family":"Ke","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wanlin","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yan","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gang","family":"He","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunsong","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2025.113531_b1","first-page":"16743","article-title":"Swintrack: A simple and strong baseline for transformer tracking","volume":"35","author":"Lin","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2025.113531_b2","doi-asserted-by":"crossref","unstructured":"Y. Cui, C. Jiang, L. Wang, G. Wu, Mixformer: End-to-end tracking with iterative mixed attention, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 13608\u201313618.","DOI":"10.1109\/CVPR52688.2022.01324"},{"issue":"5","key":"10.1016\/j.knosys.2025.113531_b3","doi-asserted-by":"crossref","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","article-title":"Got-10k: A large high-diversity benchmark for generic object tracking in the wild","volume":"43","author":"Huang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2025.113531_b4","doi-asserted-by":"crossref","unstructured":"M. Muller, A. Bibi, S. Giancola, S. Alsubaihi, B. Ghanem, Trackingnet: A large-scale dataset and benchmark for object tracking in the wild, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 300\u2013317.","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"10.1016\/j.knosys.2025.113531_b5","doi-asserted-by":"crossref","unstructured":"H. Fan, L. Lin, F. Yang, P. Chu, G. Deng, S. Yu, H. Bai, Y. Xu, C. Liao, H. Ling, Lasot: A high-quality benchmark for large-scale single object tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 5374\u20135383.","DOI":"10.1109\/CVPR.2019.00552"},{"key":"10.1016\/j.knosys.2025.113531_b6","doi-asserted-by":"crossref","unstructured":"J. Zhu, S. Lai, X. Chen, D. Wang, H. Lu, Visual prompt multi-modal tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 9516\u20139526.","DOI":"10.1109\/CVPR52729.2023.00918"},{"key":"10.1016\/j.knosys.2025.113531_b7","doi-asserted-by":"crossref","unstructured":"H. Zhao, D. Wang, H. Lu, Representation Learning for Visual Object Tracking by Masked Appearance Transfer, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 18696\u201318705.","DOI":"10.1109\/CVPR52729.2023.01793"},{"key":"10.1016\/j.knosys.2025.113531_b8","doi-asserted-by":"crossref","unstructured":"S. Yan, J. Yang, J. K\u00e4pyl\u00e4, F. Zheng, A. Leonardis, J.K. K\u00e4m\u00e4r\u00e4inen, Depthtrack: Unveiling the power of rgbd tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10725\u201310733.","DOI":"10.1109\/ICCV48922.2021.01055"},{"key":"10.1016\/j.knosys.2025.113531_b9","doi-asserted-by":"crossref","unstructured":"A. Lukezic, U. Kart, J. Kapyla, A. Durmush, J.K. Kamarainen, J. Matas, M. Kristan, Cdtb: A color and depth visual object tracking dataset and benchmark, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 10013\u201310022.","DOI":"10.1109\/ICCV.2019.01011"},{"key":"10.1016\/j.knosys.2025.113531_b10","series-title":"Computer Vision\u2013ECCV 2020 Workshops: Glasgow, UK, August 23\u201328, 2020, Proceedings, Part V 16","first-page":"547","article-title":"The eighth visual object tracking VOT2020 challenge results","author":"Kristan","year":"2020"},{"key":"10.1016\/j.knosys.2025.113531_b11","unstructured":"M. Kristan, J. Matas, A. Leonardis, M. Felsberg, R. Pflugfelder, J.K. K\u00e4m\u00e4r\u00e4inen, H.J. Chang, M. Danelljan, L. Cehovin, A. Luke\u017ei\u010d, et al., The ninth visual object tracking vot2021 challenge results, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 2711\u20132738."},{"key":"10.1016\/j.knosys.2025.113531_b12","series-title":"Rgbd object tracking: An in-depth review","author":"Yang","year":"2022"},{"key":"10.1016\/j.knosys.2025.113531_b13","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2020.106302","article-title":"Robust fusion for RGB-D tracking using CNN features","volume":"92","author":"Wang","year":"2020","journal-title":"Appl. Soft Comput."},{"issue":"8","key":"10.1016\/j.knosys.2025.113531_b14","doi-asserted-by":"crossref","first-page":"2485","DOI":"10.1109\/TCYB.2017.2740952","article-title":"Robust fusion of color and depth data for RGB-D target tracking using adaptive range-invariant depth models and spatio-temporal consistency constraints","volume":"48","author":"Xiao","year":"2017","journal-title":"IEEE Trans. Cybern."},{"key":"10.1016\/j.knosys.2025.113531_b15","doi-asserted-by":"crossref","first-page":"81","DOI":"10.1016\/j.cviu.2016.05.011","article-title":"An occlusion-aware particle filter tracker to handle complex and persistent occlusions","volume":"150","author":"Meshgi","year":"2016","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.knosys.2025.113531_b16","series-title":"European Conference on Computer Vision","first-page":"431","article-title":"The tenth visual object tracking vot2022 challenge results","author":"Kristan","year":"2022"},{"key":"10.1016\/j.knosys.2025.113531_b17","first-page":"3870","article-title":"RGBD1K: A large-scale dataset and benchmark for RGB-D object tracking","volume":"vol. 37, no. 3","author":"Zhu","year":"2023"},{"key":"10.1016\/j.knosys.2025.113531_b18","doi-asserted-by":"crossref","unstructured":"H. Wu, B. Xiao, N. Codella, M. Liu, X. Dai, L. Yuan, L. Zhang, Cvt: Introducing convolutions to vision transformers, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 22\u201331.","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"10.1016\/j.knosys.2025.113531_b19","doi-asserted-by":"crossref","unstructured":"Z. Liu, Y. Lin, Y. Cao, H. Hu, Y. Wei, Z. Zhang, S. Lin, B. Guo, Swin transformer: Hierarchical vision transformer using shifted windows, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.knosys.2025.113531_b20","series-title":"2021 IEEE International Conference on Image Processing","first-page":"1334","article-title":"Semantic role aware correlation transformer for text to video retrieval","author":"Satar","year":"2021"},{"key":"10.1016\/j.knosys.2025.113531_b21","series-title":"2019 IEEE International Conference on Image Processing","first-page":"1945","article-title":"Text recognition in images based on transformer with hierarchical attention","author":"Zhu","year":"2019"},{"key":"10.1016\/j.knosys.2025.113531_b22","doi-asserted-by":"crossref","unstructured":"K. He, X. Chen, S. Xie, Y. Li, P. Doll\u00e1r, R. Girshick, Masked autoencoders are scalable vision learners, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"10.1016\/j.knosys.2025.113531_b23","series-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection","author":"Zhang","year":"2022"},{"issue":"4","key":"10.1016\/j.knosys.2025.113531_b24","doi-asserted-by":"crossref","first-page":"1025","DOI":"10.5755\/j01.itc.52.4.33460","article-title":"Adaptive clustering object detection method for UAV images under long-tailed distributions","volume":"52","author":"Li","year":"2023","journal-title":"Inf. Technol. Control."},{"issue":"2","key":"10.1016\/j.knosys.2025.113531_b25","doi-asserted-by":"crossref","first-page":"294","DOI":"10.5755\/j01.itc.51.2.30667","article-title":"Lightweight deeplearning method for multi-vehicle object recognition","volume":"51","author":"Li","year":"2022","journal-title":"Inf. Technol. Control."},{"issue":"1","key":"10.1016\/j.knosys.2025.113531_b26","doi-asserted-by":"crossref","first-page":"13","DOI":"10.5755\/j01.itc.50.1.25094","article-title":"Multi-object recognition method based on improved yolov2 model","volume":"50","author":"Li","year":"2021","journal-title":"Inf. Technol. Control."},{"issue":"22","key":"10.1016\/j.knosys.2025.113531_b27","doi-asserted-by":"crossref","first-page":"4855","DOI":"10.3390\/s19224855","article-title":"Multi-level features extraction for discontinuous target tracking in remote sensing image monitoring","volume":"19","author":"Zhou","year":"2019","journal-title":"Sensors"},{"issue":"3","key":"10.1016\/j.knosys.2025.113531_b28","doi-asserted-by":"crossref","first-page":"1098","DOI":"10.3390\/s22031098","article-title":"Tracking of a fixed-shape moving object based on the gradient descent method","volume":"22","author":"Masood","year":"2022","journal-title":"Sensors"},{"key":"10.1016\/j.knosys.2025.113531_b29","series-title":"Computer Vision\u2013ECCV 2016 Workshops: Amsterdam, the Netherlands, October 8-10 and 15-16, 2016, Proceedings, Part II 14","first-page":"850","article-title":"Fully-convolutional siamese networks for object tracking","author":"Bertinetto","year":"2016"},{"key":"10.1016\/j.knosys.2025.113531_b30","doi-asserted-by":"crossref","unstructured":"B. Li, J. Yan, W. Wu, Z. Zhu, X. Hu, High performance visual tracking with siamese region proposal network, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 8971\u20138980.","DOI":"10.1109\/CVPR.2018.00935"},{"key":"10.1016\/j.knosys.2025.113531_b31","doi-asserted-by":"crossref","unstructured":"B. Li, W. Wu, Q. Wang, F. Zhang, J. Xing, J. Yan, Siamrpn++: Evolution of siamese visual tracking with very deep networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 4282\u20134291.","DOI":"10.1109\/CVPR.2019.00441"},{"key":"10.1016\/j.knosys.2025.113531_b32","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2020.2987601","article-title":"Siamon: Siamese occlusion-aware network for visual tracking","author":"Fan","year":"2021","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2025.113531_b33","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2022.3207202","article-title":"SiamTHN: Siamese target highlight network for visual tracking","author":"Bao","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2025.113531_b34","doi-asserted-by":"crossref","unstructured":"D. Guo, J. Wang, Y. Cui, Z. Wang, S. Chen, SiamCAR: Siamese fully convolutional classification and regression for visual tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 6269\u20136277.","DOI":"10.1109\/CVPR42600.2020.00630"},{"key":"10.1016\/j.knosys.2025.113531_b35","first-page":"12549","article-title":"Siamfc++: Towards robust and accurate visual tracking with target estimation guidelines","volume":"vol. 34, no. 07","author":"Xu","year":"2020"},{"issue":"2","key":"10.1016\/j.knosys.2025.113531_b36","doi-asserted-by":"crossref","first-page":"847","DOI":"10.1109\/TCSVT.2022.3207202","article-title":"Siamese-based twin attention network for visual tracking","volume":"33","author":"Bao","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"2","key":"10.1016\/j.knosys.2025.113531_b37","doi-asserted-by":"crossref","first-page":"674","DOI":"10.1109\/TCSVT.2021.3063001","article-title":"Feature aggregation networks based on dual attention capsules for visual object tracking","volume":"32","author":"Cao","year":"2021","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"7","key":"10.1016\/j.knosys.2025.113531_b38","doi-asserted-by":"crossref","first-page":"3068","DOI":"10.1109\/TCYB.2019.2936503","article-title":"Visual object tracking by hierarchical attention siamese network","volume":"50","author":"Shen","year":"2019","journal-title":"IEEE Trans. Cybern."},{"key":"10.1016\/j.knosys.2025.113531_b39","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2022.109666","article-title":"Visual object tracking via non-local correlation attention learning","volume":"254","author":"Gao","year":"2022","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2025.113531_b40","article-title":"Attention-driven memory network for online visual tracking","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.knosys.2025.113531_b41","doi-asserted-by":"crossref","unstructured":"X. Chen, B. Yan, J. Zhu, D. Wang, X. Yang, H. Lu, Transformer tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 8126\u20138135.","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"10.1016\/j.knosys.2025.113531_b42","doi-asserted-by":"crossref","unstructured":"N. Wang, W. Zhou, J. Wang, H. Li, Transformer meets tracker: Exploiting temporal context for robust visual tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 1571\u20131580.","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"10.1016\/j.knosys.2025.113531_b43","doi-asserted-by":"crossref","unstructured":"Z. Fu, Q. Liu, Z. Fu, Y. Wang, Stmtrack: Template-free visual tracking with space-time memory networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 13774\u201313783.","DOI":"10.1109\/CVPR46437.2021.01356"},{"key":"10.1016\/j.knosys.2025.113531_b44","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109964","article-title":"Transformer-based visual object tracking via fine\u2013coarse concatenated attention and cross concatenated MLP","volume":"146","author":"Gao","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2025.113531_b45","article-title":"Transformer tracking via frequency fusion","author":"Hu","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2025.113531_b46","article-title":"Robust tracking via fully exploring background prior knowledge","author":"Zhou","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"4","key":"10.1016\/j.knosys.2025.113531_b47","doi-asserted-by":"crossref","first-page":"1671","DOI":"10.1109\/TCSVT.2022.3212987","article-title":"Leveraging local and global cues for visual tracking via parallel interaction network","volume":"33","author":"Zheng","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2025.113531_b48","doi-asserted-by":"crossref","unstructured":"Q. Wu, T. Yang, Z. Liu, B. Wu, Y. Shan, A.B. Chan, DropMAE: Masked Autoencoders with Spatial-Attention Dropout for Tracking Tasks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14561\u201314571.","DOI":"10.1109\/CVPR52729.2023.01399"},{"key":"10.1016\/j.knosys.2025.113531_b49","doi-asserted-by":"crossref","unstructured":"X. Chen, H. Peng, D. Wang, H. Lu, H. Hu, SeqTrack: Sequence to Sequence Learning for Visual Object Tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14572\u201314581.","DOI":"10.1109\/CVPR52729.2023.01400"},{"key":"10.1016\/j.knosys.2025.113531_b50","series-title":"European Conference on Computer Vision","first-page":"341","article-title":"Joint feature learning and relation modeling for tracking: A one-stream framework","author":"Ye","year":"2022"},{"key":"10.1016\/j.knosys.2025.113531_b51","article-title":"Robust multi-modality anchor graph-based label prediction for RGB-infrared tracking","author":"Lan","year":"2019","journal-title":"IEEE Trans. Ind. Inform."},{"key":"10.1016\/j.knosys.2025.113531_b52","doi-asserted-by":"crossref","first-page":"3335","DOI":"10.1109\/TIP.2021.3060862","article-title":"Jointly modeling motion and appearance cues for robust RGB-T tracking","volume":"30","author":"Zhang","year":"2021","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2025.113531_b53","doi-asserted-by":"crossref","unstructured":"H. Chen, Q. Wu, Y. Liang, X. Gao, H. Wang, Asynchronous tracking-by-detection on adaptive time surfaces for event-based object tracking, in: Proceedings of the 27th ACM International Conference on Multimedia, 2019, pp. 473\u2013481.","DOI":"10.1145\/3343031.3350975"},{"key":"10.1016\/j.knosys.2025.113531_b54","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2022.3230378","article-title":"TFTN: A transformer-based fusion tracking framework of hyperspectral and RGB","volume":"60","author":"Zhao","year":"2022","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.knosys.2025.113531_b55","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1016\/j.neucom.2017.11.068","article-title":"Fusing two-stream convolutional neural networks for RGB-T object tracking","volume":"281","author":"Li","year":"2018","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2025.113531_b56","series-title":"European Conference on Computer Vision","first-page":"478","article-title":"Learning dual-fused modality-aware representations for RGBD tracking","author":"Gao","year":"2022"},{"key":"10.1016\/j.knosys.2025.113531_b57","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109516","article-title":"A uniform transformer-based structure for feature fusion and enhancement for RGB-D saliency detection","volume":"140","author":"Wang","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2025.113531_b58","first-page":"927","article-title":"Bi-directional adapter for multimodal tracking","volume":"vol. 38, no. 2","author":"Cao","year":"2024"},{"key":"10.1016\/j.knosys.2025.113531_b59","doi-asserted-by":"crossref","unstructured":"J. Yang, Z. Li, F. Zheng, A. Leonardis, J. Song, Prompting for multi-modal tracking, in: Proceedings of the 30th ACM International Conference on Multimedia, 2022, pp. 3492\u20133500.","DOI":"10.1145\/3503161.3547851"},{"key":"10.1016\/j.knosys.2025.113531_b60","doi-asserted-by":"crossref","unstructured":"S. Awwad, F. Hussein, M. Piccardi, Local depth patterns for tracking in depth videos, in: Proceedings of the 23rd ACM International Conference on Multimedia, 2015, pp. 1115\u20131118.","DOI":"10.1145\/2733373.2806295"},{"issue":"3","key":"10.1016\/j.knosys.2025.113531_b61","doi-asserted-by":"crossref","first-page":"664","DOI":"10.1109\/TMM.2018.2863604","article-title":"Context-aware three-dimensional mean-shift with occlusion handling for robust object tracking in RGB-D videos","volume":"21","author":"Liu","year":"2018","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2025.113531_b62","doi-asserted-by":"crossref","unstructured":"U. Kart, J.K. Kamarainen, J. Matas, How to make an rgbd tracker?, in: Proceedings of the European Conference on Computer Vision (ECCV) Workshops, 2018.","DOI":"10.1007\/978-3-030-11009-3_8"},{"key":"10.1016\/j.knosys.2025.113531_b63","doi-asserted-by":"crossref","unstructured":"U. Kart, A. Lukezic, M. Kristan, J.-K. Kamarainen, J. Matas, Object tracking by reconstruction with view-specific discriminative correlation filters, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 1339\u20131348.","DOI":"10.1109\/CVPR.2019.00143"},{"key":"10.1016\/j.knosys.2025.113531_b64","series-title":"2020 25th International Conference on Pattern Recognition","first-page":"7825","article-title":"DAL: A deep depth-aware long-term tracker","author":"Qian","year":"2021"},{"key":"10.1016\/j.knosys.2025.113531_b65","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2025.113531_b66","doi-asserted-by":"crossref","unstructured":"H. Rezatofighi, N. Tsoi, J. Gwak, A. Sadeghian, I. Reid, S. Savarese, Generalized intersection over union: A metric and a loss for bounding box regression, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 658\u2013666.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"10.1016\/j.knosys.2025.113531_b67","series-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"10.1016\/j.knosys.2025.113531_b68","series-title":"2020 25th International Conference on Pattern Recognition","first-page":"670","article-title":"TSDM: Tracking by SiamRPN++ with a depth-refiner and a mask-generator","author":"Zhao","year":"2021"},{"key":"10.1016\/j.knosys.2025.113531_b69","doi-asserted-by":"crossref","unstructured":"B. Yan, H. Peng, J. Fu, D. Wang, H. Lu, Learning spatio-temporal transformer for visual tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10448\u201310457.","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"10.1016\/j.knosys.2025.113531_b70","doi-asserted-by":"crossref","unstructured":"B. Yan, X. Zhang, D. Wang, H. Lu, X. Yang, Alpha-refine: Boosting tracking performance by precise bounding box estimation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 5289\u20135298.","DOI":"10.1109\/CVPR46437.2021.00525"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705125005775?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705125005775?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T09:42:04Z","timestamp":1762335724000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705125005775"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":70,"alternative-id":["S0950705125005775"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2025.113531","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2025,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"RGB-D visual object tracking with transformer-based multi-modal feature fusion","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2025.113531","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113531"}}