{"id":"https://openalex.org/W4390214375","doi":"https://doi.org/10.48550/arxiv.2312.14378","title":"Multimodal Attention Merging for Improved Speech Recognition and Audio Event Classification","display_name":"Multimodal Attention Merging for Improved Speech Recognition and Audio Event Classification","publication_year":2023,"publication_date":"2023-12-22","ids":{"openalex":"https://openalex.org/W4390214375","doi":"https://doi.org/10.48550/arxiv.2312.14378"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2312.14378","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.14378","pdf_url":"https://arxiv.org/pdf/2312.14378","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2312.14378","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013573557","display_name":"Anirudh S. Sundar","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sundar, Anirudh S.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020376803","display_name":"Chao-Han Huck Yang","orcid":"https://orcid.org/0000-0003-2879-8811"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Chao-Han Huck","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104260047","display_name":"David M. Chan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chan, David M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102095407","display_name":"Shalini Ghosh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghosh, Shalini","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060593241","display_name":"Venkatesh Ravichandran","orcid":"https://orcid.org/0009-0001-7214-2919"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ravichandran, Venkatesh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5005758272","display_name":"Phani Sankar Nidadavolu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nidadavolu, Phani Sankar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5013573557"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9887999892234802,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7672328352928162},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.7494716644287109},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.588409960269928},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5700576305389404},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5393038392066956},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4536036252975464},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.41689378023147583},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.41291365027427673},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3493642807006836},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.34521716833114624},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33008643984794617},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08291992545127869}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7672328352928162},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.7494716644287109},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.588409960269928},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5700576305389404},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5393038392066956},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4536036252975464},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.41689378023147583},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.41291365027427673},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3493642807006836},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34521716833114624},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33008643984794617},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08291992545127869},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2312.14378","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.14378","pdf_url":"https://arxiv.org/pdf/2312.14378","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2312.14378","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2312.14378","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2312.14378","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.14378","pdf_url":"https://arxiv.org/pdf/2312.14378","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"score":0.6100000143051147,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4390214375.pdf","grobid_xml":"https://content.openalex.org/works/W4390214375.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2532234348","https://openalex.org/W1569283511","https://openalex.org/W108084911","https://openalex.org/W2393440248","https://openalex.org/W4236193183","https://openalex.org/W1569386110","https://openalex.org/W2053866214","https://openalex.org/W2944691285","https://openalex.org/W3045896262","https://openalex.org/W4386301987"],"abstract_inverted_index":{"Training":[0],"large":[1],"foundation":[2],"models":[3,59],"using":[4],"self-supervised":[5],"objectives":[6],"on":[7,13],"unlabeled":[8],"data,":[9],"followed":[10],"by":[11,31,95,110],"fine-tuning":[12,34],"downstream":[14,40],"tasks,":[15],"has":[16],"emerged":[17],"as":[18],"a":[19,77,122,131],"standard":[20],"procedure.":[21],"Unfortunately,":[22],"the":[23,82],"efficacy":[24],"of":[25,58,88,103],"this":[26],"approach":[27,124],"is":[28,117],"often":[29],"constrained":[30],"both":[32],"limited":[33],"compute":[35],"and":[36,66,74,99,140],"scarcity":[37],"in":[38,61,70,130,136,144],"labeled":[39],"data.":[41],"We":[42],"introduce":[43],"Multimodal":[44],"Attention":[45],"Merging":[46],"(MAM),":[47],"an":[48,89,104],"attempt":[49],"that":[50],"facilitates":[51],"direct":[52],"knowledge":[53],"transfer":[54],"from":[55],"attention":[56,127],"matrices":[57],"rooted":[60],"high":[62],"resource":[63],"modalities,":[64],"text":[65],"images,":[67],"to":[68,97,125,147],"those":[69],"resource-constrained":[71],"domains,":[72],"speech":[73],"audio,":[75],"employing":[76],"zero-shot":[78],"paradigm.":[79],"MAM":[80],"reduces":[81],"relative":[83,100,134,142],"Word":[84],"Error":[85],"Rate":[86],"(WER)":[87],"Automatic":[90],"Speech":[91],"Recognition":[92],"(ASR)":[93],"model":[94,109],"up":[96],"6.70%,":[98],"classification":[101],"error":[102],"Audio":[105],"Event":[106],"Classification":[107],"(AEC)":[108],"10.63%.":[111],"In":[112],"cases":[113],"where":[114],"some":[115],"data/compute":[116],"available,":[118],"we":[119],"present":[120],"Learnable-MAM,":[121],"data-driven":[123],"merging":[126],"matrices,":[128],"resulting":[129],"further":[132],"2.90%":[133],"reduction":[135,143],"WER":[137],"for":[138],"ASR":[139],"18.42%":[141],"AEC":[145],"compared":[146],"fine-tuning.":[148]},"counts_by_year":[],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2023-12-26T00:00:00"}
