{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T18:11:30Z","timestamp":1767982290865,"version":"3.49.0"},"reference-count":42,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100003213","name":"Beijing Municipal Education Commission","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003213","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Vision and Image Understanding"],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1016\/j.cviu.2024.104183","type":"journal-article","created":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T01:13:45Z","timestamp":1726794825000},"page":"104183","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":2,"special_numbering":"C","title":["LCMA-Net: A light cross-modal attention network for streamer re-identification in live video"],"prefix":"10.1016","volume":"249","author":[{"given":"Jiacheng","family":"Yao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1290-0738","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Hui","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Li","family":"Zhuo","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"8","key":"10.1016\/j.cviu.2024.104183_b1","first-page":"4021","article-title":"Deep polynomial neural networks","volume":"44","author":"Chrysos","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2024.104183_b2","doi-asserted-by":"crossref","unstructured":"Chung,\u00a0J.S., Nagrani,\u00a0A., Zisserman,\u00a0A., 2018. VoxCeleb2: Deep speaker recognition. In: INTERSPEECH. pp. 1086\u20131090.","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"10.1016\/j.cviu.2024.104183_b3","series-title":"The 53th statistical report on internet development in China","author":"CNNIC","year":"2024"},{"key":"10.1016\/j.cviu.2024.104183_b4","doi-asserted-by":"crossref","unstructured":"Davila,\u00a0D., Du,\u00a0D., Lewis,\u00a0B., Funk,\u00a0C., Van\u00a0Pelt,\u00a0J., Collins,\u00a0R., Corona,\u00a0K., Brown,\u00a0M., McCloskey,\u00a0S., Hoogs,\u00a0A., Clipp,\u00a0B., 2023. MEVID: Multi-view extended videos with identities for video person re-identification. In: IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 1634\u20131643.","DOI":"10.1109\/WACV56688.2023.00168"},{"key":"10.1016\/j.cviu.2024.104183_b5","series-title":"Trillionpairs","author":"Deep Glint","year":"2020"},{"key":"10.1016\/j.cviu.2024.104183_b6","doi-asserted-by":"crossref","unstructured":"Deng,\u00a0J., Guo,\u00a0J., Ververas,\u00a0E., Kotsia,\u00a0I., Zafeiriou,\u00a0S., 2020. RetinaFace: Single-shot multi-level face localisation in the wild. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 5202\u20135211.","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"10.1016\/j.cviu.2024.104183_b7","doi-asserted-by":"crossref","unstructured":"Deng,\u00a0J., Guo,\u00a0J., Xue,\u00a0N., Zafeiriou,\u00a0S., 2019. ArcFace: Additive angular margin loss for deep face recognition. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 4685\u20134694.","DOI":"10.1109\/CVPR.2019.00482"},{"key":"10.1016\/j.cviu.2024.104183_b8","doi-asserted-by":"crossref","unstructured":"Duta,\u00a0I.C., Liu,\u00a0L., Zhu,\u00a0F., Shao,\u00a0L., 2021. Improved residual networks for image and video recognition. In: International Conference on Pattern Recognition. pp. 9415\u20139422.","DOI":"10.1109\/ICPR48806.2021.9412193"},{"key":"10.1016\/j.cviu.2024.104183_b9","doi-asserted-by":"crossref","unstructured":"Gao,\u00a0Y., Beijbom,\u00a0O., Zhang,\u00a0N., Darrell,\u00a0T., 2016. Compact bilinear pooling. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 317\u2013326.","DOI":"10.1109\/CVPR.2016.41"},{"issue":"2","key":"10.1016\/j.cviu.2024.104183_b10","doi-asserted-by":"crossref","first-page":"652","DOI":"10.1109\/TPAMI.2019.2938758","article-title":"Res2Net: a new multi-scale backbone architecture","volume":"43","author":"Gao","year":"2021","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2024.104183_b11","doi-asserted-by":"crossref","unstructured":"Gu,\u00a0X., Chang,\u00a0H., Ma,\u00a0B., Bai,\u00a0S., Shan,\u00a0S., Chen,\u00a0X., 2022. Clothes-changing person re-identification with RGB modality only. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1050\u20131059.","DOI":"10.1109\/CVPR52688.2022.00113"},{"issue":"1","key":"10.1016\/j.cviu.2024.104183_b12","doi-asserted-by":"crossref","first-page":"586","DOI":"10.1007\/s10489-022-03559-4","article-title":"Sparse co-attention visual question answering networks based on thresholds","volume":"53","author":"Guo","year":"2023","journal-title":"Appl. Intell."},{"key":"10.1016\/j.cviu.2024.104183_b13","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2021.103352","article-title":"Cross-modal distillation for RGB-depth person re-identification","volume":"216","author":"Hafner","year":"2022","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.cviu.2024.104183_b14","doi-asserted-by":"crossref","unstructured":"He,\u00a0K., Zhang,\u00a0X., Ren,\u00a0S., Sun,\u00a0J., 2016. Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"50","key":"10.1016\/j.cviu.2024.104183_b15","doi-asserted-by":"crossref","DOI":"10.21105\/joss.02154","article-title":"Spleeter: a fast and efficient music source separation tool with pre-trained models","volume":"5","author":"Hennequin","year":"2020","journal-title":"J. Open Source Softw."},{"issue":"11","key":"10.1016\/j.cviu.2024.104183_b16","doi-asserted-by":"crossref","first-page":"2548","DOI":"10.1049\/ipr2.12243","article-title":"SeqFace: Learning discriminative features by using face sequences","volume":"15","author":"Hu","year":"2021","journal-title":"IET Image Process."},{"key":"10.1016\/j.cviu.2024.104183_b17","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.108653","article-title":"Cross-modality person re-identification via multi-task learning","volume":"128","author":"Huang","year":"2022","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.cviu.2024.104183_b18","doi-asserted-by":"crossref","unstructured":"Josi,\u00a0A., Alehdaghi,\u00a0M., Cruz,\u00a0R.M.O., Granger,\u00a0E., 2023. Multimodal data augmentation for visual-infrared person ReID with corrupted data. In: IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 32\u201341.","DOI":"10.1109\/WACVW58289.2023.00008"},{"key":"10.1016\/j.cviu.2024.104183_b19","doi-asserted-by":"crossref","unstructured":"Jung,\u00a0J., Kim,\u00a0S., Shim,\u00a0H., Kim,\u00a0J., Yu,\u00a0H., 2020. Improved RawNet with feature map scaling for text-independent speaker verification using raw waveforms. In: INTERSPEECH. pp. 1496\u20131500.","DOI":"10.21437\/Interspeech.2020-1011"},{"key":"10.1016\/j.cviu.2024.104183_b20","unstructured":"Kim,\u00a0J.H., On,\u00a0K.W., Lim,\u00a0W., Kim,\u00a0J., Ha,\u00a0J.W., Zhang,\u00a0B.T., 2017. Hadamard product for low-rank bilinear pooling. In: International Conference on Learning Representations. pp. 1\u201314."},{"key":"10.1016\/j.cviu.2024.104183_b21","series-title":"CN-Celeb-AV: A multi-genre audio-visual dataset for person recognition","author":"Li","year":"2023"},{"key":"10.1016\/j.cviu.2024.104183_b22","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2019.107037","article-title":"Spatio-temporal deformable 3d convnets with attention for action recognition","volume":"98","author":"Li","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.cviu.2024.104183_b23","doi-asserted-by":"crossref","unstructured":"Li,\u00a0L., Nai,\u00a0R., Wang,\u00a0D., 2022. Real Additive Margin Softmax for Speaker Verification. In: IEEE International Conference on Acoustics, Speech and Signal Processing. pp. 7527\u20137531.","DOI":"10.1109\/ICASSP43922.2022.9747166"},{"key":"10.1016\/j.cviu.2024.104183_b24","first-page":"75","article-title":"Frame aggregation and multi-modal fusion framework for video-based person recognition","volume":"2021","author":"Li","year":"2021","journal-title":"MultiMedia Model."},{"key":"10.1016\/j.cviu.2024.104183_b25","doi-asserted-by":"crossref","first-page":"383","DOI":"10.1016\/j.neucom.2020.07.148","article-title":"Streamer action recognition in live video with spatial\u2013temporal attention and deep dictionary learning","volume":"453","author":"Li","year":"2021","journal-title":"Neurocomputing"},{"key":"10.1016\/j.cviu.2024.104183_b26","doi-asserted-by":"crossref","unstructured":"Lin,\u00a0T.Y., Goyal,\u00a0P., Girshick,\u00a0R., He,\u00a0K., Doll\u00e1r,\u00a0P., 2017. Focal loss for dense object detection. In: IEEE International Conference on Computer Vision. pp. 2999\u20133007.","DOI":"10.1109\/ICCV.2017.324"},{"key":"10.1016\/j.cviu.2024.104183_b27","doi-asserted-by":"crossref","unstructured":"Lin,\u00a0B., Zhang,\u00a0S., Bao,\u00a0F., 2020. Gait recognition with multiple-temporal-scale 3d convolutional neural network. In: ACM International Conference on Multimedia. pp. 3054\u20133062.","DOI":"10.1145\/3394171.3413861"},{"key":"10.1016\/j.cviu.2024.104183_b28","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2023.103634","article-title":"Siamese graph attention networks for robust visual object tracking","volume":"229","author":"Lu","year":"2023","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.cviu.2024.104183_b29","doi-asserted-by":"crossref","unstructured":"Lu,\u00a0Y., Wu,\u00a0Y., Liu,\u00a0B., Zhang,\u00a0T., Li,\u00a0B., Chu,\u00a0Q., Yu,\u00a0N., 2020. Cross-modality person re-identification with shared-specific feature transfer. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 13376\u201313386.","DOI":"10.1109\/CVPR42600.2020.01339"},{"issue":"4","key":"10.1016\/j.cviu.2024.104183_b30","doi-asserted-by":"crossref","first-page":"2970","DOI":"10.1109\/TAFFC.2023.3250460","article-title":"Driver emotion recognition with a hybrid attentional multimodal fusion framework","volume":"14","author":"Mou","year":"2023","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.cviu.2024.104183_b31","doi-asserted-by":"crossref","unstructured":"Wagner,\u00a0J., Schiller,\u00a0D., Seiderer,\u00a0A., Andr\u00e9,\u00a0E., 2018. Deep learning in paralinguistic recognition tasks: Are hand-crafted features still relevant?. In: INTERSPEECH. pp. 147\u2013151.","DOI":"10.21437\/Interspeech.2018-1238"},{"key":"10.1016\/j.cviu.2024.104183_b32","doi-asserted-by":"crossref","unstructured":"Wang,\u00a0H., Wang,\u00a0Y., Zhou,\u00a0Z., Ji,\u00a0X., Gong,\u00a0D., Zhou,\u00a0J., Li,\u00a0Z., Liu,\u00a0W., 2018. CosFace: Large margin cosine loss for deep face recognition. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 5265\u20135274.","DOI":"10.1109\/CVPR.2018.00552"},{"issue":"12","key":"10.1016\/j.cviu.2024.104183_b33","doi-asserted-by":"crossref","first-page":"4876","DOI":"10.1109\/TCSVT.2019.2958871","article-title":"Porn streamer recognition in live video streaming via attention-gated multimodal deep features","volume":"30","author":"Wang","year":"2020","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.cviu.2024.104183_b34","doi-asserted-by":"crossref","unstructured":"Wang,\u00a0X., Zhang,\u00a0S., Wang,\u00a0S., Fu,\u00a0T., Shi,\u00a0H., Mei,\u00a0T., 2020b. Mis-classified vector guided softmax loss for face recognition. In: AAAI Conference on Artificial Intelligence. pp. 12241\u201312248.","DOI":"10.1609\/aaai.v34i07.6906"},{"key":"10.1016\/j.cviu.2024.104183_b35","doi-asserted-by":"crossref","unstructured":"Wolf,\u00a0L., Hassner,\u00a0T., Maoz,\u00a0I., 2011. Face recognition in unconstrained videos with matched background similarity. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 529\u2013534.","DOI":"10.1109\/CVPR.2011.5995566"},{"key":"10.1016\/j.cviu.2024.104183_b36","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2024.104068","article-title":"Classroom teacher action recognition based on spatio-temporal dual-branch feature fusion","volume":"247","author":"Wu","year":"2024","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.cviu.2024.104183_b37","unstructured":"Wu,\u00a0G., Zhu,\u00a0X., Gong,\u00a0S., 2019. Spatio-temporal associative representation for video person re-identification. In: British Machine Vision Conference. Art. no. 278."},{"key":"10.1016\/j.cviu.2024.104183_b38","doi-asserted-by":"crossref","unstructured":"Xie,\u00a0S., Girshick,\u00a0R., Doll\u00e1r,\u00a0P., Tu,\u00a0Z., He,\u00a0K., 2017. Aggregated residual transformations for deep neural networks. In: IEEE Conference on Computer Vision and Pattern Recognition. pp. 5987\u20135995.","DOI":"10.1109\/CVPR.2017.634"},{"key":"10.1016\/j.cviu.2024.104183_b39","doi-asserted-by":"crossref","unstructured":"Yang,\u00a0Z., Fang,\u00a0Y., Zhu,\u00a0C., Pryzant,\u00a0R., Chen,\u00a0D., Shi,\u00a0Y., Xu,\u00a0Y., Qian,\u00a0Y., Gao,\u00a0M., Chen,\u00a0Y., Lu,\u00a0L., Xie,\u00a0Y., Gmyr,\u00a0R., Codella,\u00a0N., Kanda,\u00a0N., Xiao,\u00a0B., Yuan,\u00a0L., Yoshioka,\u00a0T., Zeng,\u00a0M., Huang,\u00a0X., 2023. i-Code: An integrative and composable multimodal learning framework. In: AAAI Conference on Artificial Intelligence. pp. 10880\u201310890.","DOI":"10.1609\/aaai.v37i9.26290"},{"key":"10.1016\/j.cviu.2024.104183_b40","doi-asserted-by":"crossref","DOI":"10.1186\/s13636-021-00234-3","article-title":"Anchor voiceprint recognition in live streaming via RawNet-SA and gated recurrent unit","volume":"2021","author":"Yao","year":"2021","journal-title":"EURASIP J. Audio Speech Music Process."},{"key":"10.1016\/j.cviu.2024.104183_b41","doi-asserted-by":"crossref","unstructured":"Yu,\u00a0H., Cheng,\u00a0X., Peng,\u00a0W., Liu,\u00a0W., Zhao,\u00a0G., 2023. Modality Unifying Network for Visible-Infrared Person Re-Identification. In: IEEE\/CVF International Conference on Computer Vision. pp. 11151\u201311161.","DOI":"10.1109\/ICCV51070.2023.01027"},{"key":"10.1016\/j.cviu.2024.104183_b42","doi-asserted-by":"crossref","unstructured":"Yuan,\u00a0Y., Yang,\u00a0K., Zhang,\u00a0C., 2017. Hard-aware deeply cascaded embedding. In: IEEE International Conference on Computer Vision. pp. 814\u2013823.","DOI":"10.1109\/ICCV.2017.94"}],"container-title":["Computer Vision and Image Understanding"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314224002649?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314224002649?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T11:44:49Z","timestamp":1732189489000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1077314224002649"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":42,"alternative-id":["S1077314224002649"],"URL":"https:\/\/doi.org\/10.1016\/j.cviu.2024.104183","relation":{},"ISSN":["1077-3142"],"issn-type":[{"value":"1077-3142","type":"print"}],"subject":[],"published":{"date-parts":[[2024,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"LCMA-Net: A light cross-modal attention network for streamer re-identification in live video","name":"articletitle","label":"Article Title"},{"value":"Computer Vision and Image Understanding","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.cviu.2024.104183","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier Inc. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104183"}}