{"id":"https://openalex.org/W4402386435","doi":"https://doi.org/10.48550/arxiv.2408.05211","title":"VITA: Towards Open-Source Interactive Omni Multimodal LLM","display_name":"VITA: Towards Open-Source Interactive Omni Multimodal LLM","publication_year":2024,"publication_date":"2024-08-09","ids":{"openalex":"https://openalex.org/W4402386435","doi":"https://doi.org/10.48550/arxiv.2408.05211"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2408.05211","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.05211","pdf_url":"https://arxiv.org/pdf/2408.05211","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.05211","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014172220","display_name":"Chaoyou Fu","orcid":"https://orcid.org/0000-0002-0079-7668"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fu, Chaoyou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065463441","display_name":"Haojia Lin","orcid":"https://orcid.org/0000-0001-6392-7338"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Haojia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108982768","display_name":"Zuwei Long","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long, Zuwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039883116","display_name":"Yunhang Shen","orcid":"https://orcid.org/0000-0002-3970-7519"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yunhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Dai, Yuhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Yuhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059144360","display_name":"Zhao Meng","orcid":"https://orcid.org/0000-0002-6993-3189"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Meng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Yi-Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yi-Fan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Dong, Shaoqi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Shaoqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Yangze","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yangze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058617609","display_name":"Xiong Wang","orcid":"https://orcid.org/0000-0002-4493-2155"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cao, Haoyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Haoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100758011","display_name":"Di Yin","orcid":"https://orcid.org/0000-0002-3272-9643"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Di","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048142743","display_name":"Long Ma","orcid":"https://orcid.org/0009-0004-5695-8274"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Long","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054226277","display_name":"Xiawu Zheng","orcid":"https://orcid.org/0000-0002-6855-5403"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Xiawu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016080094","display_name":"Rongrong Ji","orcid":"https://orcid.org/0000-0001-9163-2932"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Rongrong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091425569","display_name":"Yunsheng Wu","orcid":"https://orcid.org/0000-0001-7462-1414"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yunsheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109636783","display_name":"He Ran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Ran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055478558","display_name":"Caifeng Shan","orcid":"https://orcid.org/0000-0002-2131-1671"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Caifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5004402130","display_name":"Xing Sun","orcid":"https://orcid.org/0000-0001-8132-9083"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Xing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":19,"corresponding_author_ids":["https://openalex.org/A5014172220"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9742000102996826,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9595000147819519,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/open-source","display_name":"Open source","score":0.7561114430427551},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.48680493235588074},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4535828232765198},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.06474295258522034},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.05289134383201599}],"concepts":[{"id":"https://openalex.org/C3018397939","wikidata":"https://www.wikidata.org/wiki/Q3644502","display_name":"Open source","level":3,"score":0.7561114430427551},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.48680493235588074},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4535828232765198},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.06474295258522034},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.05289134383201599}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2408.05211","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.05211","pdf_url":"https://arxiv.org/pdf/2408.05211","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2408.05211","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2408.05211","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.05211","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.05211","pdf_url":"https://arxiv.org/pdf/2408.05211","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"The":[0],"remarkable":[1],"multimodal":[2,55,92,121,135,154],"capabilities":[3,86,101],"and":[4,41,47,50,84,94,105,120,156],"interactive":[5,56],"experience":[6],"of":[7,43,91,102,117,153,163],"GPT-4o":[8],"underscore":[9],"their":[10],"necessity":[11],"in":[12,20,131],"practical":[13],"applications,":[14],"yet":[15],"open-source":[16,31,146],"models":[17],"rarely":[18],"excel":[19],"both":[21,118],"areas.":[22],"In":[23],"this":[24],"paper,":[25],"we":[26,66,126,176],"introduce":[27],"VITA,":[28],"the":[29,79,133,141,145,150],"first-ever":[30],"Multimodal":[32],"Large":[33],"Language":[34],"Model":[35],"(MLLM)":[36],"adept":[37],"at":[38],"simultaneous":[39],"processing":[40],"analysis":[42],"Video,":[44],"Image,":[45],"Text,":[46],"Audio":[48],"modalities,":[49],"meanwhile":[51],"has":[52],"an":[53],"advanced":[54],"experience.":[57,138],"Starting":[58],"from":[59],"Mixtral":[60],"8x7B":[61],"as":[62,108,181,186],"a":[63,115,182,187],"language":[64,80],"foundation,":[65],"expand":[67],"its":[68,111,179],"Chinese":[69],"vocabulary":[70],"followed":[71],"by":[72,110],"bilingual":[73],"instruction":[74,95],"tuning.":[75,96],"We":[76],"further":[77],"endow":[78],"model":[81],"with":[82],"visual":[83],"audio":[85,106],"through":[87],"two-stage":[88],"multi-task":[89],"learning":[90],"alignment":[93],"VITA":[97,139,169],"demonstrates":[98],"robust":[99],"foundational":[100,124],"multilingual,":[103],"vision,":[104],"understanding,":[107],"evidenced":[109],"strong":[112],"performance":[113],"across":[114],"range":[116],"unimodal":[119],"benchmarks.":[122],"Beyond":[123],"capabilities,":[125],"have":[127],"made":[128],"considerable":[129],"progress":[130],"enhancing":[132],"natural":[134],"human-computer":[136],"interaction":[137],"is":[140,160],"first":[142],"step":[143],"for":[144,189],"community":[147],"to":[148,165,170,173],"explore":[149],"seamless":[151],"integration":[152],"understanding":[155],"interaction.":[157],"While":[158],"there":[159],"still":[161],"lots":[162],"work":[164],"be":[166],"done":[167],"on":[168],"get":[171],"close":[172],"close-source":[174],"counterparts,":[175],"hope":[177],"that":[178],"role":[180],"pioneer":[183],"can":[184],"serve":[185],"cornerstone":[188],"subsequent":[190],"research.":[191],"Project":[192],"Page:":[193],"https://vita-home.github.io.":[194]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
