{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T02:10:11Z","timestamp":1755828611592,"version":"3.44.0"},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2027,8,5]],"date-time":"2027-08-05T00:00:00Z","timestamp":1817424000000},"content-version":"am","delay-in-days":703,"URL":"http:\/\/www.elsevier.com\/open-access\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001691","name":"Japan Society for the Promotion of Science","doi-asserted-by":"publisher","award":["22K12079"],"award-info":[{"award-number":["22K12079"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Visual Communication and Image Representation"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1016\/j.jvcir.2025.104554","type":"journal-article","created":{"date-parts":[[2025,7,30]],"date-time":"2025-07-30T15:14:44Z","timestamp":1753888484000},"page":"104554","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["EFTrack: Enhanced fusion for visual object tracking"],"prefix":"10.1016","volume":"111","author":[{"given":"Xu","family":"Guan","sequence":"first","affiliation":[]},{"given":"Chunyan","family":"Hu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1991-9143","authenticated-orcid":false,"given":"Lin","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Shuai","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Feifei","family":"Lee","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3079-9207","authenticated-orcid":false,"given":"Qiu","family":"Chen","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.jvcir.2025.104554_b0005","doi-asserted-by":"crossref","unstructured":"Bertinetto L, Valmadre J, Henriques J F, et al., Fully-convolutional Siamese networks for object tracking, in Proc. European Conference on Computer Vision Workshops, 2016, pp. 850-865.","DOI":"10.1007\/978-3-319-48881-3_56"},{"key":"10.1016\/j.jvcir.2025.104554_b0010","doi-asserted-by":"crossref","unstructured":"Nam H, Han B. Learning multi-domain convolutional neural networks for visual tracking, in Proc. IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 4293-4302.","DOI":"10.1109\/CVPR.2016.465"},{"key":"10.1016\/j.jvcir.2025.104554_b0015","doi-asserted-by":"crossref","unstructured":"Li B, Wu W, Wang Q, et al., SiamRPN++: Evolution of Siamese visual tracking with very deep networks, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 4282-4291.","DOI":"10.1109\/CVPR.2019.00441"},{"key":"10.1016\/j.jvcir.2025.104554_b0020","doi-asserted-by":"crossref","unstructured":"Bhat G, Danelljan M, Gool L V, et al., Learning discriminative model prediction for tracking, in Proc. IEEE\/CVF International Conference on Computer Vision, 2019, pp. 6182-6191.","DOI":"10.1109\/ICCV.2019.00628"},{"key":"10.1016\/j.jvcir.2025.104554_b0025","doi-asserted-by":"crossref","unstructured":"Voigtlaender P, Luiten J, Torr P H S, et al., Siam R-CNN: Visual tracking by re-detection, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 6578-6588.","DOI":"10.1109\/CVPR42600.2020.00661"},{"key":"10.1016\/j.jvcir.2025.104554_b0030","doi-asserted-by":"crossref","unstructured":"Zhang Z, Peng H, Fu J, et al., Ocean: Object-aware anchor-free tracking, in Proc. European Conference on Computer Vision, 2020, pp. 771-787.","DOI":"10.1007\/978-3-030-58589-1_46"},{"key":"10.1016\/j.jvcir.2025.104554_b0035","doi-asserted-by":"crossref","unstructured":"Wang N, Zhou W, Wang J, et al., Transformer meets tracker: Exploiting temporal context for robust visual tracking, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 1571-1580.","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"10.1016\/j.jvcir.2025.104554_b0040","doi-asserted-by":"crossref","unstructured":"Chen X, Yan B, Zhu J, et al., Transformer tracking, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 8126-8135.","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"10.1016\/j.jvcir.2025.104554_b0045","doi-asserted-by":"crossref","unstructured":"Yan B, Peng H, Fu J, et al., Learning spatio-temporal transformer for visual tracking, in Proc. IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10448-10457.","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"10.1016\/j.jvcir.2025.104554_b0050","doi-asserted-by":"crossref","unstructured":"Song Z, Yu J, Chen Y P P, et al., Transformer tracking with cyclic shifting window attention, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 8791-8800.","DOI":"10.1109\/CVPR52688.2022.00859"},{"key":"10.1016\/j.jvcir.2025.104554_b0055","first-page":"16743","article-title":"SwinTrack: a simple and strong baseline for transformer tracking","volume":"35","author":"Lin","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.jvcir.2025.104554_b0060","doi-asserted-by":"crossref","unstructured":"Gao S, Zhou C, Ma C, et al., AiATrack: Attention in attention for transformer visual tracking, in Proc. European Conference on Computer Vision, 2022, pp. 146-164.","DOI":"10.1007\/978-3-031-20047-2_9"},{"key":"10.1016\/j.jvcir.2025.104554_b0065","doi-asserted-by":"crossref","unstructured":"Chen B, Li P, Bai L, et al., Backbone is all your need: A simplified architecture for visual object tracking, in Proc. European Conference on Computer Vision, 2022, pp. 375-392.","DOI":"10.1007\/978-3-031-20047-2_22"},{"key":"10.1016\/j.jvcir.2025.104554_b0070","doi-asserted-by":"crossref","unstructured":"Ye B, Chang H, Ma B, et al., Joint feature learning and relation modeling for tracking: A one-stream framework, in Proc. European Conference on Computer Vision, 2022, pp. 341-357.","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"10.1016\/j.jvcir.2025.104554_b0075","doi-asserted-by":"crossref","unstructured":"Cui Y, Jiang C, Wang L, et al., MixFormer: End-to-end tracking with iterative mixed attention, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 13608-13618.","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"10.1016\/j.jvcir.2025.104554_b0080","doi-asserted-by":"crossref","unstructured":"Cui Y, Jiang C, Wu G, et al., MixFormer: End-to-end tracking with iterative mixed attention, arXiv preprint arXiv:2302.02814, 2023.","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"10.1016\/j.jvcir.2025.104554_b0085","doi-asserted-by":"crossref","unstructured":"He K, Zhang C, Xie S, et al., Target-aware tracking with long-term context attention, arXiv preprint arXiv:2302.13840, 2023.","DOI":"10.1609\/aaai.v37i1.25155"},{"key":"10.1016\/j.jvcir.2025.104554_b0090","doi-asserted-by":"crossref","unstructured":"Chen X, Peng H, Wang D, et al., SeqTrack: Sequence to sequence learning for visual object tracking, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14572-14581.","DOI":"10.1109\/CVPR52729.2023.01400"},{"key":"10.1016\/j.jvcir.2025.104554_b0095","doi-asserted-by":"crossref","unstructured":"Wu Q, Yang T, Liu Z, et al., DropMAE: Masked autoencoders with spatial-attention dropout for tracking tasks, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14561-14571.","DOI":"10.1109\/CVPR52729.2023.01399"},{"issue":"8","key":"10.1016\/j.jvcir.2025.104554_b0100","doi-asserted-by":"crossref","first-page":"1125","DOI":"10.1109\/TMM.2015.2440996","article-title":"Visual object tracking by structure complexity coefficients","volume":"17","author":"Yuan","year":"2015","journal-title":"IEEE Trans. Multimedia"},{"issue":"5","key":"10.1016\/j.jvcir.2025.104554_b0105","doi-asserted-by":"crossref","first-page":"1122","DOI":"10.1109\/TMM.2018.2872897","article-title":"Multi-correlation filters with triangle-structure constraints for object tracking","volume":"21","author":"Ruan","year":"2018","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.jvcir.2025.104554_b0110","doi-asserted-by":"crossref","first-page":"301","DOI":"10.1109\/TMM.2021.3050073","article-title":"Robust visual object tracking via adaptive attribute-aware discriminative correlation filters","volume":"24","author":"Zhu","year":"2021","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.jvcir.2025.104554_b0115","doi-asserted-by":"crossref","unstructured":"Hong L, Yan S, Zhang R, et al., Onetracker: Unifying visual object tracking with foundation models and efficient tuning, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 19079-19091.","DOI":"10.1109\/CVPR52733.2024.01805"},{"key":"10.1016\/j.jvcir.2025.104554_b0120","doi-asserted-by":"crossref","unstructured":"Xiao Y, Wang Q, Zhang S, et al., SpatialTracker: Tracking any 2D pixels in 3D space, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 20406-20417.","DOI":"10.1109\/CVPR52733.2024.01929"},{"key":"10.1016\/j.jvcir.2025.104554_b0125","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et al., An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929, 2020."},{"key":"10.1016\/j.jvcir.2025.104554_b0130","first-page":"980","article-title":"Global filter networks for image classification","volume":"34","author":"Rao","year":"2021","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"10.1016\/j.jvcir.2025.104554_b0135","unstructured":"Patro B N, Namboodiri V P, Agneeswaran V S. SpectFormer: Frequency and attention are what you need in a vision Transformer, arXiv preprint arXiv:2304.06446, 2023."},{"key":"10.1016\/j.jvcir.2025.104554_b0140","doi-asserted-by":"crossref","unstructured":"Lin T Y, Doll\u00e1r P, Girshick R, et al., Feature pyramid networks for object detection, in Proc. IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 2117-2125.","DOI":"10.1109\/CVPR.2017.106"},{"key":"10.1016\/j.jvcir.2025.104554_b0145","doi-asserted-by":"crossref","first-page":"127","DOI":"10.1016\/j.neucom.2022.02.016","article-title":"An improved feature pyramid network for object detection","volume":"483","author":"Zhu","year":"2022","journal-title":"Neurocomputing"},{"key":"10.1016\/j.jvcir.2025.104554_b0150","doi-asserted-by":"crossref","first-page":"179666","DOI":"10.1109\/ACCESS.2020.3027590","article-title":"Pedestrian as points: an improved anchor-free method for center-based pedestrian detection","volume":"8","author":"Cai","year":"2020","journal-title":"IEEE Access"},{"issue":"12","key":"10.1016\/j.jvcir.2025.104554_b0155","doi-asserted-by":"crossref","first-page":"6933","DOI":"10.3390\/app13126933","article-title":"DMA-Net: Decoupled multi-scale attention for few-shot object detection","volume":"13","author":"Xie","year":"2023","journal-title":"Appl. Sci."},{"key":"10.1016\/j.jvcir.2025.104554_b0160","doi-asserted-by":"crossref","unstructured":"Ronneberger O, Fischer P, Brox T. U-Net: Convolutional networks for biomedical image segmentation, in Proc. Medical Image Computing and Computer-Assisted Intervention, 2015, pp. 234-241.","DOI":"10.1007\/978-3-319-24574-4_28"},{"issue":"4","key":"10.1016\/j.jvcir.2025.104554_b0165","doi-asserted-by":"crossref","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","article-title":"DeepLab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs","volume":"40","author":"Chen","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.jvcir.2025.104554_b0170","unstructured":"Yang X, Wang X. Kolmogorov-arnold transformer, arXiv preprint arXiv:2409.10594, 2024."},{"key":"10.1016\/j.jvcir.2025.104554_b0175","doi-asserted-by":"crossref","unstructured":"Kuipers T P, Arya D, Gupta D K. Hard occlusions in visual object tracking, in Proc. European Conference on Computer Vision Workshops, 2020, pp. 299-314.","DOI":"10.1007\/978-3-030-68238-5_22"},{"key":"10.1016\/j.jvcir.2025.104554_b0180","first-page":"25739","article-title":"Deep model reassembly","volume":"35","author":"Yang","year":"2022","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"10.1016\/j.jvcir.2025.104554_b0185","doi-asserted-by":"crossref","unstructured":"Jing Y, Mao Y, Yang Y, et al., Learning graph neural networks for image style transfer, in Proc. European Conference on Computer Vision, 2022, pp. 111-128.","DOI":"10.1007\/978-3-031-20071-7_7"},{"key":"10.1016\/j.jvcir.2025.104554_b0190","doi-asserted-by":"crossref","unstructured":"Jing Y, Liu Y, Yang Y, et al., Stroke controllable fast style transfer with adaptive receptive fields, in Proc. European Conference on Computer Vision, 2018, pp. 238-254.","DOI":"10.1007\/978-3-030-01261-8_15"},{"key":"10.1016\/j.jvcir.2025.104554_b0195","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, et al., ImageNet: A large-scale hierarchical image database, in Proc. IEEE Conference on Computer Vision and Pattern Recognition, 2009, pp. 248-255.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"10.1016\/j.jvcir.2025.104554_b0200","first-page":"1100","article-title":"Dataset distillation via factorization","volume":"35","author":"Liu","year":"2022","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"10.1016\/j.jvcir.2025.104554_b0205","unstructured":"Carreira J, Noland E, Hillier C, et al., A short note on the kinetics-700 human action dataset, arXiv preprint arXiv:1907.06987, 2019."},{"key":"10.1016\/j.jvcir.2025.104554_b0210","doi-asserted-by":"crossref","unstructured":"Lin T Y, Goyal P, Girshick R, et al., Focal loss for dense object detection, in Proc. IEEE International Conference on Computer Vision, 2017, pp. 2980-2988.","DOI":"10.1109\/ICCV.2017.324"},{"key":"10.1016\/j.jvcir.2025.104554_b0215","doi-asserted-by":"crossref","unstructured":"Rezatofighi H, Tsoi N, Gwak J Y, et al., Generalized intersection over union: A metric and a loss for bounding box regression, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 658-666.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"10.1016\/j.jvcir.2025.104554_b0220","unstructured":"Loshchilov I, Hutter F. Decoupled weight decay regularization, arXiv preprint arXiv:1711.05101, 2017."},{"key":"10.1016\/j.jvcir.2025.104554_b0225","doi-asserted-by":"crossref","unstructured":"Muller M, Bibi A, Giancola S, et al., TrackingNet: A large-scale dataset and benchmark for object tracking in the wild, in Proc. European Conference on Computer Vision, 2018, pp. 300-317.","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"10.1016\/j.jvcir.2025.104554_b0230","doi-asserted-by":"crossref","unstructured":"Fan H, Lin L, Yang F, et al., LaSOT: A high-quality benchmark for large-scale single object tracking, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 5374-5383.","DOI":"10.1109\/CVPR.2019.00552"},{"issue":"5","key":"10.1016\/j.jvcir.2025.104554_b0235","doi-asserted-by":"crossref","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","article-title":"Got-10k: a large high-diversity benchmark for generic object tracking in the wild","volume":"43","author":"Huang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.jvcir.2025.104554_b0240","doi-asserted-by":"crossref","unstructured":"Lin T Y, Maire M, Belongie S, et al., Microsoft COCO: Common objects in context, in Proc. European Conference on Computer Vision, 2014, pp. 740-755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10.1016\/j.jvcir.2025.104554_b0245","doi-asserted-by":"crossref","first-page":"439","DOI":"10.1007\/s11263-020-01387-y","article-title":"LaSOT: a high-quality large-scale single object tracking benchmark","volume":"129","author":"Fan","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.jvcir.2025.104554_b0250","doi-asserted-by":"crossref","unstructured":"Wang X, Shu X, Zhang Z, et al., Towards more flexible and accurate object tracking with natural language: Algorithms and benchmark, in Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 13763-13773.","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"10.1016\/j.jvcir.2025.104554_b0255","doi-asserted-by":"crossref","unstructured":"Mueller M, Smith N, Ghanem B. A benchmark and simulator for UAV tracking, in Proc. European Conference on Computer Vision, 2016, pp. 445-461.","DOI":"10.1007\/978-3-319-46448-0_27"},{"key":"10.1016\/j.jvcir.2025.104554_b0260","doi-asserted-by":"crossref","unstructured":"Wu Y, Lim J, Yang M H. Online object tracking: A benchmark, in Proc. IEEE Conference on Computer Vision and Pattern Recognition, 2013, pp. 2411-2418.","DOI":"10.1109\/CVPR.2013.312"},{"key":"10.1016\/j.jvcir.2025.104554_b0265","first-page":"10353","article-title":"Hornet: Efficient high-order spatial interactions with recursive gated convolutions","volume":"35","author":"Rao","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."}],"container-title":["Journal of Visual Communication and Image Representation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1047320325001683?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1047320325001683?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T12:51:01Z","timestamp":1755780661000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1047320325001683"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":53,"alternative-id":["S1047320325001683"],"URL":"https:\/\/doi.org\/10.1016\/j.jvcir.2025.104554","relation":{},"ISSN":["1047-3203"],"issn-type":[{"type":"print","value":"1047-3203"}],"subject":[],"published":{"date-parts":[[2025,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"EFTrack: Enhanced fusion for visual object tracking","name":"articletitle","label":"Article Title"},{"value":"Journal of Visual Communication and Image Representation","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.jvcir.2025.104554","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Inc. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104554"}}