{"id":"https://openalex.org/W4392903653","doi":"https://doi.org/10.1109/icassp48485.2024.10446092","title":"CM-PIE: Cross-Modal Perception for Interactive-Enhanced Audio-Visual Video Parsing","display_name":"CM-PIE: Cross-Modal Perception for Interactive-Enhanced Audio-Visual Video Parsing","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903653","doi":"https://doi.org/10.1109/icassp48485.2024.10446092"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446092","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446092","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101653430","display_name":"Yaru Chen","orcid":"https://orcid.org/0000-0002-1917-8752"},"institutions":[{"id":"https://openalex.org/I4210121626","display_name":"Signal Processing (United States)","ror":"https://ror.org/021gzyw51","country_code":"US","type":"company","lineage":["https://openalex.org/I4210121626"]},{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]},{"id":"https://openalex.org/I52158045","display_name":"China Agricultural University","ror":"https://ror.org/04v3ywz14","country_code":"CN","type":"education","lineage":["https://openalex.org/I52158045"]}],"countries":["CN","GB","US"],"is_corresponding":true,"raw_author_name":"Yaru Chen","raw_affiliation_strings":["University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom","College of Information and Electrical Engineering, China Agricultural University, China"],"affiliations":[{"raw_affiliation_string":"University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom","institution_ids":["https://openalex.org/I28290843","https://openalex.org/I4210121626"]},{"raw_affiliation_string":"College of Information and Electrical Engineering, China Agricultural University, China","institution_ids":["https://openalex.org/I52158045"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088611866","display_name":"Ruohao Guo","orcid":"https://orcid.org/0000-0002-1091-272X"},"institutions":[{"id":"https://openalex.org/I4210113068","display_name":"Peer Intelligence Technology (China)","ror":"https://ror.org/0252z0n87","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210113068"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruohao Guo","raw_affiliation_strings":["Peking University,School of Intelligence Science and Technology,China","School of Intelligence Science and Technology, Peking University, China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Intelligence Science and Technology,China","institution_ids":["https://openalex.org/I20231570","https://openalex.org/I4210113068"]},{"raw_affiliation_string":"School of Intelligence Science and Technology, Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080214850","display_name":"Xubo Liu","orcid":"https://orcid.org/0000-0002-2558-0959"},"institutions":[{"id":"https://openalex.org/I4210121626","display_name":"Signal Processing (United States)","ror":"https://ror.org/021gzyw51","country_code":"US","type":"company","lineage":["https://openalex.org/I4210121626"]},{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Xubo Liu","raw_affiliation_strings":["University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom"],"affiliations":[{"raw_affiliation_string":"University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom","institution_ids":["https://openalex.org/I28290843","https://openalex.org/I4210121626"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050479982","display_name":"Peipei Wu","orcid":"https://orcid.org/0009-0006-4551-3223"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]},{"id":"https://openalex.org/I4210121626","display_name":"Signal Processing (United States)","ror":"https://ror.org/021gzyw51","country_code":"US","type":"company","lineage":["https://openalex.org/I4210121626"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Peipei Wu","raw_affiliation_strings":["University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom"],"affiliations":[{"raw_affiliation_string":"University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom","institution_ids":["https://openalex.org/I28290843","https://openalex.org/I4210121626"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100338853","display_name":"Guangyao Li","orcid":"https://orcid.org/0000-0002-2179-8555"},"institutions":[{"id":"https://openalex.org/I78988378","display_name":"Renmin University of China","ror":"https://ror.org/041pakw92","country_code":"CN","type":"education","lineage":["https://openalex.org/I78988378"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangyao Li","raw_affiliation_strings":["Renmin University of China,Gaoling School of Artificial Intelligence,China","Gaoling School of Artificial Intelligence, Renmin University of China, China"],"affiliations":[{"raw_affiliation_string":"Renmin University of China,Gaoling School of Artificial Intelligence,China","institution_ids":["https://openalex.org/I78988378"]},{"raw_affiliation_string":"Gaoling School of Artificial Intelligence, Renmin University of China, China","institution_ids":["https://openalex.org/I78988378"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049883225","display_name":"Zhenbo Li","orcid":"https://orcid.org/0000-0003-2914-1192"},"institutions":[{"id":"https://openalex.org/I52158045","display_name":"China Agricultural University","ror":"https://ror.org/04v3ywz14","country_code":"CN","type":"education","lineage":["https://openalex.org/I52158045"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenbo Li","raw_affiliation_strings":["China Agricultural University,College of Information and Electrical Engineering,China","College of Information and Electrical Engineering, China Agricultural University, China"],"affiliations":[{"raw_affiliation_string":"China Agricultural University,College of Information and Electrical Engineering,China","institution_ids":["https://openalex.org/I52158045"]},{"raw_affiliation_string":"College of Information and Electrical Engineering, China Agricultural University, China","institution_ids":["https://openalex.org/I52158045"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]},{"id":"https://openalex.org/I4210121626","display_name":"Signal Processing (United States)","ror":"https://ror.org/021gzyw51","country_code":"US","type":"company","lineage":["https://openalex.org/I4210121626"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom"],"affiliations":[{"raw_affiliation_string":"University of Surrey,Centre for Vision Speech and Signal Processing (CVSSP),United Kindom","institution_ids":["https://openalex.org/I28290843"]},{"raw_affiliation_string":"Centre for Vision Speech and Signal Processing (CVSSP), University of Surrey, United Kindom","institution_ids":["https://openalex.org/I28290843","https://openalex.org/I4210121626"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101653430"],"corresponding_institution_ids":["https://openalex.org/I28290843","https://openalex.org/I4210121626","https://openalex.org/I52158045"],"apc_list":null,"apc_paid":null,"fwci":1.8677,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.84954757,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"8421","last_page":"8425"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8580441474914551},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.8089243769645691},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6300351619720459},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.551638126373291},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5508395433425903},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5093547105789185},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4926000237464905},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4879853129386902},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.4844503402709961},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4825391173362732},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.46194013953208923},{"id":"https://openalex.org/keywords/dependency-grammar","display_name":"Dependency grammar","score":0.4493390619754791},{"id":"https://openalex.org/keywords/crossmodal","display_name":"Crossmodal","score":0.43897074460983276},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.41236746311187744},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.34505224227905273},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.1764695644378662}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8580441474914551},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.8089243769645691},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6300351619720459},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.551638126373291},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5508395433425903},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5093547105789185},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4926000237464905},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4879853129386902},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.4844503402709961},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4825391173362732},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.46194013953208923},{"id":"https://openalex.org/C164883195","wikidata":"https://www.wikidata.org/wiki/Q674834","display_name":"Dependency grammar","level":3,"score":0.4493390619754791},{"id":"https://openalex.org/C60115397","wikidata":"https://www.wikidata.org/wiki/Q5188732","display_name":"Crossmodal","level":4,"score":0.43897074460983276},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41236746311187744},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.34505224227905273},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.1764695644378662},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446092","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446092","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320322725","display_name":"China Scholarship Council","ror":"https://ror.org/04atp4p48"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W2191798142","https://openalex.org/W2194775991","https://openalex.org/W2526050071","https://openalex.org/W2752782242","https://openalex.org/W2931433835","https://openalex.org/W2963155035","https://openalex.org/W2964109005","https://openalex.org/W2988200020","https://openalex.org/W2990113535","https://openalex.org/W3100732527","https://openalex.org/W3118120400","https://openalex.org/W4226206782","https://openalex.org/W4292958273","https://openalex.org/W4312380001","https://openalex.org/W4372260201"],"related_works":["https://openalex.org/W2251084681","https://openalex.org/W287510790","https://openalex.org/W2098784136","https://openalex.org/W3117798239","https://openalex.org/W2968543375","https://openalex.org/W2571817549","https://openalex.org/W1970099335","https://openalex.org/W1541975828","https://openalex.org/W2888625260","https://openalex.org/W2159336305"],"abstract_inverted_index":{"Audio-visual":[0],"video":[1,9,40],"parsing":[2,124],"is":[3,99],"the":[4,14,29,34,38,42,50,104,127],"task":[5],"of":[6,52,107],"categorizing":[7],"a":[8,59,64,75,89,95],"with":[10],"weak":[11],"labels":[12],"at":[13],"segment":[15],"level,":[16],"and":[17,55,109,130],"predicting":[18],"them":[19],"as":[20],"audible":[21],"or":[22],"visible":[23],"events.":[24],"Recent":[25],"methods":[26],"have":[27],"leveraged":[28],"attention":[30,91],"mechanism":[31],"to":[32,101,135],"capture":[33],"semantic":[35,105],"correlations":[36],"among":[37],"whole":[39],"across":[41],"audio-visual":[43],"modalities.":[44],"However,":[45],"these":[46],"approaches":[47],"may":[48],"overlook":[49],"importance":[51],"individual":[53],"segments":[54],"their":[56],"interrelations":[57],"within":[58],"video,":[60],"typically":[61],"relying":[62],"on":[63,126],"single":[65],"modality":[66],"when":[67],"learning":[68],"features.":[69],"In":[70,93],"this":[71],"paper,":[72],"we":[73],"propose":[74],"novel":[76],"interactive-enhanced":[77],"cross-modal":[78,96],"perception":[79],"method":[80],"(CM-PIE),":[81],"which":[82],"can":[83],"learn":[84],"fine-grained":[85],"features":[86],"by":[87,112],"applying":[88],"segment-based":[90],"module.":[92],"addition,":[94],"aggregation":[97],"block":[98],"introduced":[100],"jointly":[102],"optimize":[103],"representation":[106],"audio":[108],"visual":[110],"signals":[111],"enhancing":[113],"inter-modal":[114],"interactions.":[115],"Experimental":[116],"results":[117],"show":[118],"that":[119],"our":[120],"model":[121],"offers":[122],"improved":[123],"performance":[125],"Look,":[128],"Listen,":[129],"Parse":[131],"(LLP)":[132],"dataset":[133],"compared":[134],"other":[136],"methods.":[137]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
