{"id":"https://openalex.org/W4417092509","doi":"https://doi.org/10.48550/arxiv.2505.01481","title":"VideoHallu: Evaluating and Mitigating Multi-modal Hallucinations on Synthetic Video Understanding","display_name":"VideoHallu: Evaluating and Mitigating Multi-modal Hallucinations on Synthetic Video Understanding","publication_year":2025,"publication_date":"2025-05-02","ids":{"openalex":"https://openalex.org/W4417092509","doi":"https://doi.org/10.48550/arxiv.2505.01481"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2505.01481","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.01481","pdf_url":"https://arxiv.org/pdf/2505.01481","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2505.01481","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053817812","display_name":"Zongxia Li","orcid":"https://orcid.org/0009-0001-1437-5132"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Zongxia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078545561","display_name":"Xiyang Wu","orcid":"https://orcid.org/0000-0001-8538-8267"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Xiyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088872677","display_name":"Guangyao Shi","orcid":"https://orcid.org/0000-0002-1164-5969"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Guangyao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103130333","display_name":"Qin Ye","orcid":"https://orcid.org/0000-0002-2191-3568"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Yubin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068782412","display_name":"Hongyang Du","orcid":"https://orcid.org/0000-0002-8220-6525"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Hongyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027505093","display_name":"Fuxiao Liu","orcid":"https://orcid.org/0000-0002-3078-0613"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Fuxiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039076312","display_name":"Tianyi Zhou","orcid":"https://orcid.org/0000-0001-5348-0632"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Tianyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004194238","display_name":"Dinesh Manocha","orcid":"https://orcid.org/0000-0001-7047-9801"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Manocha, Dinesh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5092513269","display_name":"Jordan Lee Boyd-Graber","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Boyd-Graber, Jordan Lee","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5053817812"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3833000063896179,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3833000063896179,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.2563999891281128,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0869000032544136,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6919000148773193},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.527999997138977},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.47290000319480896},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4417000114917755},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.39969998598098755},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.33230000734329224},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3278999924659729}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6919000148773193},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6628000140190125},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6247000098228455},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.527999997138977},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.49939998984336853},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.47290000319480896},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4417000114917755},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.39969998598098755},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.33230000734329224},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3278999924659729},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3176000118255615},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2890999913215637},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2766999900341034},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.25049999356269836},{"id":"https://openalex.org/C121687571","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Activity recognition","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2505.01481","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.01481","pdf_url":"https://arxiv.org/pdf/2505.01481","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2505.01481","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2505.01481","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2505.01481","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.01481","pdf_url":"https://arxiv.org/pdf/2505.01481","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"have":[3],"achieved":[4],"strong":[5,124],"results":[6,125],"in":[7,140],"video":[8],"understanding,":[9,32],"yet":[10],"a":[11,90],"key":[12],"question":[13],"remains:":[14],"do":[15],"they":[16,133],"truly":[17],"comprehend":[18],"visual":[19,31,141],"content":[20],"or":[21,83],"only":[22],"learn":[23],"shallow":[24],"correlations":[25],"between":[26],"vision":[27],"and":[28,36,95,102,131],"language?":[29],"Real":[30],"especially":[33],"of":[34,93,112,115,150],"physics":[35],"common":[37],"sense,":[38],"is":[39,160],"essential":[40],"for":[41],"AI":[42],"systems":[43],"that":[44,79],"interact":[45],"with":[46,99],"the":[47],"physical":[48],"world.":[49],"Current":[50],"evaluations":[51],"mostly":[52],"use":[53],"real-world":[54],"videos":[55,78],"similar":[56],"to":[57],"training":[58],"data,":[59],"so":[60],"high":[61],"benchmark":[62,156],"scores":[63],"may":[64],"not":[65],"reflect":[66],"real":[67],"reasoning":[68],"ability.":[69],"To":[70],"address":[71],"this,":[72],"we":[73],"propose":[74],"negative-control":[75],"tests":[76],"using":[77],"depict":[80],"physically":[81],"impossible":[82],"logically":[84],"inconsistent":[85],"events.":[86],"We":[87],"introduce":[88],"VideoHallu,":[89],"synthetic":[91],"dataset":[92],"physics-":[94],"commonsense-violating":[96],"scenes":[97],"generated":[98],"Veo2,":[100],"Sora,":[101],"Kling.":[103],"It":[104],"includes":[105],"expert-annotated":[106],"question-answer":[107],"pairs":[108],"across":[109],"four":[110],"categories":[111],"violations.":[113],"Tests":[114],"leading":[116],"VLMs":[117],"(Qwen-2.5-VL,":[118],"Video-R1,":[119],"VideoChat-R1)":[120],"show":[121],"that,":[122],"despite":[123],"on":[126,146],"benchmarks":[127],"such":[128,151],"as":[129],"MVBench":[130],"MMVU,":[132],"often":[134],"miss":[135],"these":[136],"violations,":[137],"exposing":[138],"gaps":[139],"reasoning.":[142],"Reinforcement":[143],"learning":[144],"fine-tuning":[145],"VideoHallu":[147],"improves":[148],"recognition":[149],"violations":[152],"without":[153],"reducing":[154],"standard":[155],"performance.":[157],"Our":[158],"data":[159],"available":[161],"at":[162],"https://github.com/zli12321/VideoHallu.git.":[163]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
