{"id":"https://openalex.org/W4387156669","doi":"https://doi.org/10.48550/arxiv.2309.15112","title":"InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition","display_name":"InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition","publication_year":2023,"publication_date":"2023-09-26","ids":{"openalex":"https://openalex.org/W4387156669","doi":"https://doi.org/10.48550/arxiv.2309.15112"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2309.15112","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.15112","pdf_url":"https://arxiv.org/pdf/2309.15112","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2309.15112","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101249981","display_name":"Pan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Pan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100373003","display_name":"Xiaoyi Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Xiaoyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104189020","display_name":"Bin Wang","orcid":"https://orcid.org/0009-0007-3155-3790"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Bin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010183483","display_name":"Yuhang Cao","orcid":"https://orcid.org/0009-0008-3627-590X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112920389","display_name":"Chao Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104670548","display_name":"Linke Ouyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Linke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100575091","display_name":"Zhiyuan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Zhiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028468431","display_name":"Haodong Duan","orcid":"https://orcid.org/0000-0002-3052-4177"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duan, Haodong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100747794","display_name":"Songyang Zhang","orcid":"https://orcid.org/0000-0003-4316-3320"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Songyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013238521","display_name":"Shuangrui Ding","orcid":"https://orcid.org/0000-0001-7033-774X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Shuangrui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Wenwei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wenwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100567689","display_name":"Hang Yan","orcid":"https://orcid.org/0009-0004-5675-0406"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114889320","display_name":"Xinyue Zhang","orcid":"https://orcid.org/0000-0001-9068-7472"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xinyue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100317994","display_name":"Wei Li","orcid":"https://orcid.org/0000-0001-7015-7335"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100397274","display_name":"Jingwen Li","orcid":"https://orcid.org/0000-0001-8299-7390"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jingwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438001","display_name":"Kai Chen","orcid":"https://orcid.org/0000-0003-2587-6028"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101615091","display_name":"Conghui He","orcid":"https://orcid.org/0000-0001-8697-695X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Conghui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102773472","display_name":"Xingcheng Zhang","orcid":"https://orcid.org/0009-0006-8525-0608"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xingcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100748135","display_name":"Yu Qiao","orcid":"https://orcid.org/0000-0002-1889-2567"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010087030","display_name":"Dahua Lin","orcid":"https://orcid.org/0000-0002-8865-7896"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Dahua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100365340","display_name":"Jiaqi Wang","orcid":"https://orcid.org/0000-0001-6877-5353"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jiaqi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":21,"corresponding_author_ids":["https://openalex.org/A5101249981"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":31,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7593358755111694},{"id":"https://openalex.org/keywords/composition","display_name":"Composition (language)","score":0.730122447013855},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6822082996368408},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.5542181134223938},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.48945772647857666},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4786582887172699},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4550893306732178},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.4489155411720276},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.13725700974464417},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.07853904366493225}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7593358755111694},{"id":"https://openalex.org/C40231798","wikidata":"https://www.wikidata.org/wiki/Q1333743","display_name":"Composition (language)","level":2,"score":0.730122447013855},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6822082996368408},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.5542181134223938},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.48945772647857666},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4786582887172699},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4550893306732178},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4489155411720276},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.13725700974464417},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.07853904366493225},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2309.15112","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.15112","pdf_url":"https://arxiv.org/pdf/2309.15112","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2309.15112","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2309.15112","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2309.15112","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.15112","pdf_url":"https://arxiv.org/pdf/2309.15112","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8999999761581421,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4387156669.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2378211422","https://openalex.org/W2745001401","https://openalex.org/W4321353415","https://openalex.org/W2130974462","https://openalex.org/W2028665553","https://openalex.org/W2086519370","https://openalex.org/W972276598","https://openalex.org/W2087343574","https://openalex.org/W4246352526","https://openalex.org/W2121910908"],"abstract_inverted_index":{"We":[0],"propose":[1],"InternLM-XComposer,":[2],"a":[3,43,52,111,163],"vision-language":[4,131,203],"large":[5],"model":[6,19,121,213],"that":[7,38,167],"enables":[8],"advanced":[9,197],"image-text":[10],"comprehension":[11,94,199],"and":[12,35,46,55,78,145,171,191,200,205,209],"composition.":[13],"The":[14,92,211],"innovative":[15],"nature":[16],"of":[17,114,152],"our":[18,56,178],"is":[20,95],"highlighted":[21],"by":[22,97],"three":[23],"appealing":[24],"properties:":[25],"1)":[26],"Interleaved":[27],"Text-Image":[28],"Composition:":[29],"InternLM-XComposer":[30,179,194,212],"can":[31,64],"effortlessly":[32],"generate":[33,59],"coherent":[34],"contextual":[36],"articles":[37],"seamlessly":[39,195],"integrate":[40],"images,":[41],"providing":[42],"more":[44],"engaging":[45],"immersive":[47],"reading":[48],"experience.":[49],"Simply":[50],"provide":[51],"writing":[53],"instruction,":[54],"system":[57],"will":[58],"the":[60,67,70,76,81,150],"corresponding":[61],"manuscript.":[62],"It":[63],"intelligently":[65],"identify":[66],"areas":[68],"in":[69,110],"text":[71],"where":[72],"images":[73],"would":[74],"enhance":[75],"content":[77],"automatically":[79],"insert":[80],"most":[82],"appropriate":[83],"visual":[84,115],"candidates.":[85],"2)":[86],"Comprehension":[87],"with":[88,105],"Rich":[89],"Multilingual":[90],"Knowledge:":[91],"text-image":[93,158,182,198],"empowered":[96],"training":[98],"on":[99],"an":[100],"extensive":[101],"multi-modal":[102],"multilingual":[103],"database":[104],"carefully":[106],"crafted":[107],"strategies,":[108],"resulting":[109],"deep":[112],"understanding":[113],"content.":[116],"3)":[117],"State-of-the-art":[118],"Performance:":[119],"Our":[120],"consistently":[122],"achieves":[123,180],"state-of-the-art":[124],"results":[125],"across":[126],"various":[127],"mainstream":[128],"benchmarks":[129],"for":[130,155],"foundational":[132],"models,":[133],"including":[134,189],"MME":[135],"Benchmark,":[136],"MMBench,":[137],"MMBench-CN,":[138],"Seed-Bench,":[139],"CCBench":[140],"(Chinese":[141],"Cultural":[142],"Benchmark),":[143],"QBench":[144],"Tiny":[146],"LVLM.":[147],"Owing":[148],"to":[149,174,186],"absence":[151],"established":[153],"metrics":[154],"quantitatively":[156],"assessing":[157],"composition,":[159,201],"we":[160],"have":[161],"devised":[162],"robust":[164],"evaluation":[165],"procedure":[166],"comprises":[168],"both":[169],"human":[170],"GPT4-Vision":[172],"(GPT4-V)":[173],"ensure":[175],"reliability.":[176],"Notably,":[177],"competitive":[181],"composition":[183],"scores":[184],"compared":[185],"public":[187],"solutions,":[188],"GPT4-V":[190],"GPT3.5.":[192],"Collectively,":[193],"blends":[196],"revolutionizing":[202],"interaction":[204],"offering":[206],"new":[207],"insights":[208],"opportunities.":[210],"series":[214],"are":[215],"publicly":[216],"available":[217],"at":[218],"https://github.com/InternLM/InternLM-XComposer.":[219]},"counts_by_year":[{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":20}],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2025-10-10T00:00:00"}
