{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:47:16Z","timestamp":1775069236063,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Chinese Knowledge Center of Engineering Science and Technology"},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2018AAA0101900"],"award-info":[{"award-number":["2018AAA0101900"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Alibaba-Zhejiang University Joint Research Institute of Frontier Technologies, Key Research and Development Program of Zhejiang Province, China","award":["No. 2021C01013"],"award-info":[{"award-number":["No. 2021C01013"]}]},{"name":"Zhejiang NSF","award":["LR21F020004"],"award-info":[{"award-number":["LR21F020004"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3547886","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:35Z","timestamp":1665416555000},"page":"5083-5092","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Dilated Context Integrated Network with Cross-Modal Consensus for Temporal Emotion Localization in Videos"],"prefix":"10.1145","author":[{"given":"Juncheng","family":"Li","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Junlin","family":"Xie","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Linchao","family":"Zhu","sequence":"additional","affiliation":[{"name":"University of Technology Sydney, Sydney, NSW, Australia"}]},{"given":"Long","family":"Qian","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Siliang","family":"Tang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Wenqiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"given":"Haochen","family":"Shi","sequence":"additional","affiliation":[{"name":"Universit\u00e9 de Montr\u00e9al, Montr\u00e9al, PQ, Canada"}]},{"given":"Shengyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Longhui","family":"Wei","sequence":"additional","affiliation":[{"name":"Huawei Cloud, Beijing, China"}]},{"given":"Qi","family":"Tian","sequence":"additional","affiliation":[{"name":"Huawei Cloud, Shenzhen, China"}]},{"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Youtube-8m: A large-scale video classification benchmark. 
arXiv preprint arXiv:1609.08675","author":"Abu-El-Haija Sami","year":"2016","unstructured":"Sami Abu-El-Haija , Nisarg Kothari , Joonseok Lee , Paul Natsev , George Toderici , Balakrishnan Varadarajan , and Sudheendra Vijayanarasimhan . 2016. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675 ( 2016 ). Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. 2016. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675 (2016)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2017.81"},{"key":"e_1_3_2_2_3_1","volume-title":"Distributed neural system for emotional intelligence revealed by lesion mapping. Social cognitive and affective neuroscience","author":"Barbey Aron K","year":"2014","unstructured":"Aron K Barbey , Roberto Colom , and Jordan Grafman . 2014. Distributed neural system for emotional intelligence revealed by lesion mapping. Social cognitive and affective neuroscience , Vol. 9 , 3 ( 2014 ), 265--272. Aron K Barbey, Roberto Colom, and Jordan Grafman. 2014. Distributed neural system for emotional intelligence revealed by lesion mapping. Social cognitive and affective neuroscience, Vol. 9, 3 (2014), 265--272."},{"key":"e_1_3_2_2_4_1","volume-title":"Procedings of the British Machine Vision Conference","author":"Buch Shyamal","year":"2019","unstructured":"Shyamal Buch , Victor Escorcia , Bernard Ghanem , Li Fei-Fei , and Juan Carlos Niebles . 2019 . End-to-end, single-stream temporal action detection in untrimmed videos . In Procedings of the British Machine Vision Conference 2017. British Machine Vision Association. Shyamal Buch, Victor Escorcia, Bernard Ghanem, Li Fei-Fei, and Juan Carlos Niebles. 2019. End-to-end, single-stream temporal action detection in untrimmed videos. In Procedings of the British Machine Vision Conference 2017. British Machine Vision Association."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.675"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00124"},{"key":"e_1_3_2_2_9_1","volume-title":"Look closer to ground better: Weakly-supervised temporal grounding of sentence in video. arXiv preprint arXiv:2001.09308","author":"Chen Zhenfang","year":"2020","unstructured":"Zhenfang Chen , Lin Ma , Wenhan Luo , Peng Tang , and Kwan-Yee K Wong . 2020. Look closer to ground better: Weakly-supervised temporal grounding of sentence in video. arXiv preprint arXiv:2001.09308 ( 2020 ). Zhenfang Chen, Lin Ma, Wenhan Luo, Peng Tang, and Kwan-Yee K Wong. 2020. Look closer to ground better: Weakly-supervised temporal grounding of sentence in video. arXiv preprint arXiv:2001.09308 (2020)."},{"key":"e_1_3_2_2_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2018 . Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018). Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_11_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Duan Xuguang","year":"2018","unstructured":"Xuguang Duan , Wenbing Huang , Chuang Gan , Jingdong Wang , Wenwu Zhu , and Junzhou Huang . 2018 . Weakly supervised dense event captioning in videos . Advances in Neural Information Processing Systems , Vol. 31 (2018). Xuguang Duan, Wenbing Huang, Chuang Gan, Jingdong Wang, Wenwu Zhu, and Junzhou Huang. 2018. Weakly supervised dense event captioning in videos. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_2_12_1","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi John","year":"2011","unstructured":"John Duchi , Elad Hazan , and Yoram Singer . 2011 . Adaptive subgradient methods for online learning and stochastic optimization . Journal of machine learning research , Vol. 12 , 7 (2011). John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods for online learning and stochastic optimization. Journal of machine learning research, Vol. 12, 7 (2011).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_47"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.600"},{"key":"e_1_3_2_2_15_1","volume-title":"Mar'ia Pilar Jim\u00e9nez S\u00e1nchez, Mar'ia Dolores Mart'in D'iaz, and Francisco Javier Dom'inguez S\u00e1nchez.","author":"Fern\u00e1ndez-Abascal Enrique Garc'ia","year":"2010","unstructured":"Enrique Garc'ia Fern\u00e1ndez-Abascal , Beatriz Garc'ia Rodr'iguez , Mar'ia Pilar Jim\u00e9nez S\u00e1nchez, Mar'ia Dolores Mart'in D'iaz, and Francisco Javier Dom'inguez S\u00e1nchez. 2010 . Psicolog'ia de la emoci\u00f3n. Editorial Universitaria Ram\u00f3n Areces . Enrique Garc'ia Fern\u00e1ndez-Abascal, Beatriz Garc'ia Rodr'iguez, Mar'ia Pilar Jim\u00e9nez S\u00e1nchez, Mar'ia Dolores Mart'in D'iaz, and Francisco Javier Dom'inguez S\u00e1nchez. 2010. Psicolog'ia de la emoci\u00f3n. Editorial Universitaria Ram\u00f3n Areces."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_2_17_1","unstructured":"A. Gorban H. Idrees Y.-G. Jiang A. Roshan Zamir I. Laptev M. Shah and R. Sukthankar. 2015. THUMOS Challenge: Action Recognition with a Large Number of Classes. http:\/\/www.thumos.info\/.  A. Gorban H. Idrees Y.-G. Jiang A. Roshan Zamir I. Laptev M. Shah and R. Sukthankar. 2015. THUMOS Challenge: Action Recognition with a Large Number of Classes. http:\/\/www.thumos.info\/."},{"key":"e_1_3_2_2_18_1","volume-title":"Collaborative Intelligence Orchestration: Inconsistency-Based Fusion of Semi-Supervised Learning and Active Learning. arXiv preprint arXiv:2206.03288","author":"Guo Jiannan","year":"2022","unstructured":"Jiannan Guo , Yangyang Kang , Yu Duan , Xiaozhong Liu , Siliang Tang , Wenqiao Zhang , Kun Kuang , Changlong Sun , and Fei Wu. 2022. Collaborative Intelligence Orchestration: Inconsistency-Based Fusion of Semi-Supervised Learning and Active Learning. arXiv preprint arXiv:2206.03288 ( 2022 ). Jiannan Guo, Yangyang Kang, Yu Duan, Xiaozhong Liu, Siliang Tang, Wenqiao Zhang, Kun Kuang, Changlong Sun, and Fei Wu. 2022. Collaborative Intelligence Orchestration: Inconsistency-Based Fusion of Semi-Supervised Learning and Active Learning. 
arXiv preprint arXiv:2206.03288 (2022)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00289"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-21380-4_20"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.211"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1093\/brain\/awt370"},{"key":"e_1_3_2_2_24_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber . 1997. Long short-term memory. Neural computation , Vol. 9 , 8 ( 1997 ), 1735--1780. Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_2_25_1","volume-title":"Fusionnet: Fusing via fully-aware attention with application to machine comprehension. arXiv preprint arXiv:1711.07341","author":"Huang Hsin-Yuan","year":"2017","unstructured":"Hsin-Yuan Huang , Chenguang Zhu , Yelong Shen , and Weizhu Chen . 2017 . Fusionnet: Fusing via fully-aware attention with application to machine comprehension. arXiv preprint arXiv:1711.07341 (2017). Hsin-Yuan Huang, Chenguang Zhu, Yelong Shen, and Weizhu Chen. 2017. Fusionnet: Fusing via fully-aware attention with application to machine comprehension. arXiv preprint arXiv:1711.07341 (2017)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Ziqi Jiang Shengyu Zhang Siyuan Yao Wenqiao Zhang Sihan Zhang Juncheng Li Zhou Zhao and Fei Wu. 2022. Weakly-supervised Disentanglement Network for Video Fingerspelling Detection. In ACM MM.  Ziqi Jiang Shengyu Zhang Siyuan Yao Wenqiao Zhang Sihan Zhang Juncheng Li Zhou Zhao and Fei Wu. 2022. Weakly-supervised Disentanglement Network for Video Fingerspelling Detection. In ACM MM.","DOI":"10.1145\/3503161.3548213"},{"key":"e_1_3_2_2_27_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev etal 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017).  Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_2_28_1","volume-title":"Attribute-aware interpretation learning for thyroid ultrasound diagnosis. Artificial Intelligence in Medicine","author":"Kong Ming","year":"2022","unstructured":"Ming Kong , Qing Guo , Shuowen Zhou , Mengze Li , Kun Kuang , Zhengxing Huang , Fei Wu , Xiaohong Chen , and Qiang Zhu . 2022. Attribute-aware interpretation learning for thyroid ultrasound diagnosis. Artificial Intelligence in Medicine ( 2022 ), 102344. Ming Kong, Qing Guo, Shuowen Zhou, Mengze Li, Kun Kuang, Zhengxing Huang, Fei Wu, Xiaohong Chen, and Qiang Zhu. 2022. Attribute-aware interpretation learning for thyroid ultrasound diagnosis. Artificial Intelligence in Medicine (2022), 102344."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.285"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00987"},{"key":"e_1_3_2_2_32_1","volume-title":"Deep learning. 
nature","author":"LeCun Yann","year":"2015","unstructured":"Yann LeCun , Yoshua Bengio , and Geoffrey Hinton . 2015. Deep learning. nature , Vol. 521 , 7553 ( 2015 ), 436--444. Yann LeCun, Yoshua Bengio, and Geoffrey Hinton. 2015. Deep learning. nature, Vol. 521, 7553 (2015), 436--444."},{"key":"e_1_3_2_2_33_1","volume-title":"Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696","author":"Lei Jie","year":"2018","unstructured":"Jie Lei , Licheng Yu , Mohit Bansal , and Tamara L Berg . 2018 . Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696 (2018). Jie Lei, Licheng Yu, Mohit Bansal, and Tamara L Berg. 2018. Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696 (2018)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"e_1_3_2_2_35_1","volume-title":"Fine-Grained Semantically Aligned Vision-Language Pre-Training. arXiv preprint arXiv:2208.02515","author":"Li Juncheng","year":"2022","unstructured":"Juncheng Li , Xin He , Longhui Wei , Long Qian , Linchao Zhu , Lingxi Xie , Yueting Zhuang , Qi Tian , and Siliang Tang . 2022a. Fine-Grained Semantically Aligned Vision-Language Pre-Training. arXiv preprint arXiv:2208.02515 ( 2022 ). Juncheng Li, Xin He, Longhui Wei, Long Qian, Linchao Zhu, Lingxi Xie, Yueting Zhuang, Qi Tian, and Siliang Tang. 2022a. Fine-Grained Semantically Aligned Vision-Language Pre-Training. arXiv preprint arXiv:2208.02515 (2022)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413886"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351017"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00188"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01214"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"e_1_3_2_2_41_1","volume-title":"Optoelectronic Imaging and Multimedia Technology VII","author":"Li Mengze","unstructured":"Mengze Li , Ming Kong , Kun Kuang , Qiang Zhu , and Fei Wu. 2020a. Multi-task attribute-fusion model for fine-grained image recognition . In Optoelectronic Imaging and Multimedia Technology VII , Vol. 11550 . SPIE , 114--123. Mengze Li, Ming Kong, Kun Kuang, Qiang Zhu, and Fei Wu. 2020a. Multi-task attribute-fusion model for fine-grained image recognition. In Optoelectronic Imaging and Multimedia Technology VII, Vol. 11550. SPIE, 114--123."},{"key":"e_1_3_2_2_42_1","unstructured":"Mengze Li Kun Kuang Qiang Zhu Xiaohong Chen Qing Guo and Fei Wu. 2020b. IB-M: A Flexible Framework to Align an Interpretable Model and a Black-box Model. In BIBM.  Mengze Li Kun Kuang Qiang Zhu Xiaohong Chen Qing Guo and Fei Wu. 2020b. IB-M: A Flexible Framework to Align an Interpretable Model and a Black-box Model. In BIBM."},{"key":"e_1_3_2_2_43_1","unstructured":"Mengze Li Tianbao Wang Haoyu Zhang Shengyu Zhang Zhou Zhao Jiaxu Miao Wenqiao Zhang Wenming Tan Jin Wang Peng Wang etal 2022b. End-to-End Modeling via Information Tree for One-Shot Natural Language Spatial Video Grounding. arXiv preprint arXiv:2203.08013 (2022).  Mengze Li Tianbao Wang Haoyu Zhang Shengyu Zhang Zhou Zhao Jiaxu Miao Wenqiao Zhang Wenming Tan Jin Wang Peng Wang et al. 2022b. End-to-End Modeling via Information Tree for One-Shot Natural Language Spatial Video Grounding. 
arXiv preprint arXiv:2203.08013 (2022)."},{"key":"e_1_3_2_2_44_1","volume-title":"HERO: HiErarchical spatio-tempoRal reasOning with Contrastive Action Correspondence for End-to-End Video Object Grounding. In ACM MM.","author":"Li Mengze","year":"2022","unstructured":"Mengze Li , Tianbao Wang , Haoyu Zhang , Shengyu Zhang , Zhou Zhao , Wenqiao Zhang , Jiaxu Miao , Shiliang Pu , and Fei Wu . 2022 c. HERO: HiErarchical spatio-tempoRal reasOning with Contrastive Action Correspondence for End-to-End Video Object Grounding. In ACM MM. Mengze Li, Tianbao Wang, Haoyu Zhang, Shengyu Zhang, Zhou Zhao, Wenqiao Zhang, Jiaxu Miao, Shiliang Pu, and Fei Wu. 2022c. HERO: HiErarchical spatio-tempoRal reasOning with Contrastive Action Correspondence for End-to-End Video Object Grounding. In ACM MM."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00372"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00043"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00750"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2014.2384198"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00561"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00877"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00706"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00212"},{"key":"e_1_3_2_2_56_1","volume-title":"Assessing the effectiveness of a large database of emotion-eliciting films: A new tool for emotion researchers. Cognition and emotion","author":"Schaefer Alexandre","year":"2010","unstructured":"Alexandre Schaefer , Fr\u00e9d\u00e9ric Nils , Xavier Sanchez , and Pierre Philippot . 2010. Assessing the effectiveness of a large database of emotion-eliciting films: A new tool for emotion researchers. Cognition and emotion , Vol. 24 , 7 ( 2010 ), 1153--1172. Alexandre Schaefer, Fr\u00e9d\u00e9ric Nils, Xavier Sanchez, and Pierre Philippot. 2010. Assessing the effectiveness of a large database of emotion-eliciting films: A new tool for emotion researchers. Cognition and emotion, Vol. 24, 7 (2010), 1153--1172."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_59_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008.  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 
5998--6008."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00895"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.678"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01312"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00719"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_3"},{"key":"e_1_3_2_2_65_1","volume-title":"DeVLBert: Learning Deconfounded Visio-Linguistic Representations. In MM '20: The 28th ACM International Conference on Multimedia. ACM, 4373--4382","author":"Zhang Shengyu","year":"2020","unstructured":"Shengyu Zhang , Tan Jiang , Tan Wang , Kun Kuang , Zhou Zhao , Jianke Zhu , Jin Yu , Hongxia Yang , and Fei Wu . 2020 a. DeVLBert: Learning Deconfounded Visio-Linguistic Representations. In MM '20: The 28th ACM International Conference on Multimedia. ACM, 4373--4382 . Shengyu Zhang, Tan Jiang, Tan Wang, Kun Kuang, Zhou Zhao, Jianke Zhu, Jin Yu, Hongxia Yang, and Fei Wu. 2020a. DeVLBert: Learning Deconfounded Visio-Linguistic Representations. In MM '20: The 28th ACM International Conference on Multimedia. ACM, 4373--4382."},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_2_67_1","volume-title":"Poet: Product-oriented Video Captioner for E-commerce. In MM '20: The 28th ACM International Conference on Multimedia. ACM, 1292--1301","author":"Zhang Shengyu","year":"2020","unstructured":"Shengyu Zhang , Ziqi Tan , Jin Yu , Zhou Zhao , Kun Kuang , Jie Liu , Jingren Zhou , Hongxia Yang , and Fei Wu . 2020 c. Poet: Product-oriented Video Captioner for E-commerce. In MM '20: The 28th ACM International Conference on Multimedia. ACM, 1292--1301 . Shengyu Zhang, Ziqi Tan, Jin Yu, Zhou Zhao, Kun Kuang, Jie Liu, Jingren Zhou, Hongxia Yang, and Fei Wu. 2020c. Poet: Product-oriented Video Captioner for E-commerce. In MM '20: The 28th ACM International Conference on Multimedia. ACM, 1292--1301."},{"key":"e_1_3_2_2_68_1","volume-title":"Comprehensive Information Integration Modeling Framework for Video Titling. In KDD '20: The 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. ACM, 2744--2754","author":"Zhang Shengyu","year":"2020","unstructured":"Shengyu Zhang , Ziqi Tan , Zhou Zhao , Jin Yu , Kun Kuang , Tan Jiang , Jingren Zhou , Hongxia Yang , and Fei Wu . 2020 d. Comprehensive Information Integration Modeling Framework for Video Titling. In KDD '20: The 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. ACM, 2744--2754 . Shengyu Zhang, Ziqi Tan, Zhou Zhao, Jin Yu, Kun Kuang, Tan Jiang, Jingren Zhou, Hongxia Yang, and Fei Wu. 2020d. Comprehensive Information Integration Modeling Framework for Video Titling. In KDD '20: The 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. ACM, 2744--2754."},{"key":"e_1_3_2_2_69_1","volume-title":"BOSS: Bottom-up Cross-modal Semantic Composition with Hybrid Counterfactual Training for Robust Content-based Image Retrieval. https:\/\/doi.org\/10.48550\/ARXIV.2207.04211","author":"Zhang Wenqiao","year":"2022","unstructured":"Wenqiao Zhang , Jiannan Guo , Mengze Li , Haochen Shi , Shengyu Zhang , Juncheng Li , Siliang Tang , Wu Fei , Tat-Seng Chua , and Yueting Zhuang . 2022 a. BOSS: Bottom-up Cross-modal Semantic Composition with Hybrid Counterfactual Training for Robust Content-based Image Retrieval. 
https:\/\/doi.org\/10.48550\/ARXIV.2207.04211 Wenqiao Zhang, Jiannan Guo, Mengze Li, Haochen Shi, Shengyu Zhang, Juncheng Li, Siliang Tang, Wu Fei, Tat-Seng Chua, and Yueting Zhuang. 2022a. BOSS: Bottom-up Cross-modal Semantic Composition with Hybrid Counterfactual Training for Robust Content-based Image Retrieval. https:\/\/doi.org\/10.48550\/ARXIV.2207.04211"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20243"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16452"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2935678"},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413745"},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413746"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02001"},{"key":"e_1_3_2_2_76_1","first-page":"18123","article-title":"g. Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang , Zhou Zhao , Zhijie Lin , Xiuqiang He , 2020 g. Counterfactual contrastive learning for weakly-supervised vision-language grounding . Advances in Neural Information Processing Systems , Vol. 33 (2020), 18123 -- 18134 . Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al. 2020 g. Counterfactual contrastive learning for weakly-supervised vision-language grounding. Advances in Neural Information Processing Systems, Vol. 33 (2020), 18123--18134.","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal","acronym":"MM '22","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3547886","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3547886","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:30Z","timestamp":1750186830000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3547886"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":76,"alternative-id":["10.1145\/3503161.3547886","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3547886","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
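The record above is the JSON "work" object that the Crossref REST API returns for this paper's DOI (10.1145/3503161.3547886). A minimal sketch of fetching and reading such a record follows; it assumes Python with the third-party requests package, and the contact address in the User-Agent header is a placeholder used for Crossref's polite-pool convention. The field names are taken from the record itself.

# Minimal sketch: fetch a Crossref work record and read a few of its fields.
import requests

DOI = "10.1145/3503161.3547886"  # DOI of the record shown above
url = f"https://api.crossref.org/works/{DOI}"
# Placeholder mailto address; Crossref recommends identifying yourself in the User-Agent.
resp = requests.get(url, headers={"User-Agent": "metadata-check/0.1 (mailto:you@example.org)"}, timeout=30)
resp.raise_for_status()

work = resp.json()["message"]  # the bibliographic payload lives under "message"
print(work["title"][0])
print(", ".join(f"{a.get('given', '')} {a['family']}".strip() for a in work.get("author", [])))
print("DOI:", work["DOI"])
print("Cited by (Crossref):", work.get("is-referenced-by-count", 0))
print("Deposited references:", len(work.get("reference", [])))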