{"id":"https://openalex.org/W4405035829","doi":"https://doi.org/10.48550/arxiv.2412.02611","title":"AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand Audio-Visual Information?","display_name":"AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand Audio-Visual Information?","publication_year":2024,"publication_date":"2024-12-03","ids":{"openalex":"https://openalex.org/W4405035829","doi":"https://doi.org/10.48550/arxiv.2412.02611"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.02611","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02611","pdf_url":"https://arxiv.org/pdf/2412.02611","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.02611","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5115004435","display_name":"Kaixiong Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gong, Kaixiong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Feng, Kaituo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Kaituo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114211815","display_name":"Bohao Li","orcid":"https://orcid.org/0009-0005-4965-2901"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Bohao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100664217","display_name":"Yibing Wang","orcid":"https://orcid.org/0000-0002-3508-1491"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yibing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cheng, Mofan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Mofan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Shijia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Shijia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021002113","display_name":"Jihye Han","orcid":"https://orcid.org/0009-0005-6369-4091"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Jiaming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057282504","display_name":"Benyou Wang","orcid":"https://orcid.org/0000-0002-1501-9914"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Benyou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102019858","display_name":"Yutong Bai","orcid":"https://orcid.org/0000-0002-6210-7757"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yutong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Zhuoran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhuoran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5078165161","display_name":"Xiangyu Yue","orcid":"https://orcid.org/0000-0002-6887-2046"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yue, Xiangyu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5115004435"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9729999899864197,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9729999899864197,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9466999769210815,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9329000115394592,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.7723751068115234},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.35533589124679565},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.3312395215034485}],"concepts":[{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.7723751068115234},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.35533589124679565},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3312395215034485}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.02611","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02611","pdf_url":"https://arxiv.org/pdf/2412.02611","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.02611","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.02611","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.02611","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02611","pdf_url":"https://arxiv.org/pdf/2412.02611","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4405035829.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128"],"abstract_inverted_index":{"Recently,":[0],"multimodal":[1],"large":[2],"language":[3],"models":[4,27,113,157],"(MLLMs),":[5],"such":[6],"as":[7,138],"GPT-4o,":[8],"Gemini":[9],"1.5":[10],"Pro,":[11],"and":[12,22,60,106,121,127,155,158,179],"Reka":[13],"Core,":[14],"have":[15,134],"expanded":[16],"their":[17],"capabilities":[18],"to":[19,84,171],"include":[20],"vision":[21],"audio":[23,107,122],"modalities.":[24],"While":[25],"these":[26,73],"demonstrate":[28],"impressive":[29],"performance":[30],"across":[31],"a":[32,68,79,151],"wide":[33],"range":[34],"of":[35,55,64,130,153,166],"audio-visual":[36,81,93],"applications,":[37],"our":[38],"proposed":[39],"DeafTest":[40],"reveals":[41],"that":[42],"MLLMs":[43,88],"often":[44],"struggle":[45],"with":[46],"simple":[47],"tasks":[48],"humans":[49],"find":[50],"trivial:":[51],"1)":[52],"determining":[53,62],"which":[54,63],"two":[56,65],"sounds":[57,66],"is":[58],"louder,":[59],"2)":[61],"has":[67],"higher":[69],"pitch.":[70],"Motivated":[71],"by":[72],"observations,":[74],"we":[75,133,169],"introduce":[76],"AV-Odyssey":[77],"Bench,":[78],"comprehensive":[80],"benchmark":[82,96,150],"designed":[83],"assess":[85],"whether":[86],"those":[87],"can":[89],"truly":[90],"understand":[91],"the":[92,136,141,160,164],"information.":[94],"This":[95],"encompasses":[97],"4,555":[98],"carefully":[99],"crafted":[100],"problems,":[101],"each":[102],"incorporating":[103],"text,":[104],"visual,":[105],"components.":[108],"To":[109,124],"successfully":[110],"infer":[111],"answers,":[112],"must":[114],"effectively":[115],"leverage":[116],"clues":[117],"from":[118],"both":[119],"visual":[120],"inputs.":[123],"ensure":[125],"precise":[126],"objective":[128],"evaluation":[129,145],"MLLM":[131],"responses,":[132],"structured":[135],"questions":[137],"multiple-choice,":[139],"eliminating":[140],"need":[142],"for":[143,175],"human":[144],"or":[146],"LLM-assisted":[147],"assessment.":[148],"We":[149],"series":[152],"closed-source":[154],"open-source":[156],"summarize":[159],"observations.":[161],"By":[162],"revealing":[163],"limitations":[165],"current":[167],"models,":[168],"aim":[170],"provide":[172],"useful":[173],"insight":[174],"future":[176],"dataset":[177],"collection":[178],"model":[180],"development.":[181]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2024-12-06T00:00:00"}
