{"id":"https://openalex.org/W4384112212","doi":"https://doi.org/10.48550/arxiv.2307.05222","title":"Emu: Generative Pretraining in Multimodality","display_name":"Emu: Generative Pretraining in Multimodality","publication_year":2023,"publication_date":"2023-07-11","ids":{"openalex":"https://openalex.org/W4384112212","doi":"https://doi.org/10.48550/arxiv.2307.05222"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2307.05222","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2307.05222","pdf_url":"https://arxiv.org/pdf/2307.05222","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2307.05222","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100520355","display_name":"Quan Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sun, Quan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067981474","display_name":"Qiying Yu","orcid":"https://orcid.org/0009-0008-8245-3914"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Qiying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052226444","display_name":"Yufeng Cui","orcid":"https://orcid.org/0000-0002-0031-7871"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cui, Yufeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025758534","display_name":"Fan Zhang","orcid":"https://orcid.org/0000-0002-2863-6938"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100780271","display_name":"Xiaosong Zhang","orcid":"https://orcid.org/0000-0002-9430-3525"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiaosong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038675825","display_name":"Yueze Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yueze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103206314","display_name":"Hongcheng Gao","orcid":"https://orcid.org/0000-0001-9327-2213"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Hongcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100442542","display_name":"Jingjing Liu","orcid":"https://orcid.org/0009-0002-6277-5816"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jingjing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058066577","display_name":"Tiejun Huang","orcid":"https://orcid.org/0000-0002-4234-6099"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Tiejun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100653846","display_name":"Xinlong Wang","orcid":"https://orcid.org/0000-0002-8137-1692"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xinlong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5100520355"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":29,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9635999798774719,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7773959040641785},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6473966836929321},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.546053409576416},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5171141624450684},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.48603397607803345},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4678516983985901},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.434908926486969},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.423565149307251},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.41915401816368103},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.3843877911567688},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32212311029434204},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.09533348679542542}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7773959040641785},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6473966836929321},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.546053409576416},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5171141624450684},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.48603397607803345},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4678516983985901},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.434908926486969},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.423565149307251},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.41915401816368103},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3843877911567688},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32212311029434204},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.09533348679542542},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2307.05222","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2307.05222","pdf_url":"https://arxiv.org/pdf/2307.05222","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2307.05222","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2307.05222","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2307.05222","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2307.05222","pdf_url":"https://arxiv.org/pdf/2307.05222","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"score":0.6299999952316284,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4384112212.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4310447809","https://openalex.org/W4200243030","https://openalex.org/W2800782462","https://openalex.org/W3209117276","https://openalex.org/W4388184981","https://openalex.org/W4323777661","https://openalex.org/W3016124757"],"abstract_inverted_index":{"We":[0],"present":[1],"Emu,":[2],"a":[3,38,66,125,143],"Transformer-based":[4],"multimodal":[5,16,27,83,127,169,175],"foundation":[6],"model,":[7],"which":[8],"can":[9,21,122],"seamlessly":[10],"generate":[11],"images":[12,109],"and":[13,35,50,104,110,118,132,135,139,158],"texts":[14],"in":[15,23,81],"context.":[17],"This":[18,85],"omnivore":[19],"model":[20],"take":[22],"any":[24],"single-modality":[25],"or":[26,75],"data":[28,94],"input":[29,58],"indiscriminately":[30],"(e.g.,":[31],"interleaved":[32,57,102,108],"image,":[33],"text":[34,53,73,140],"video)":[36],"through":[37],"one-model-for-all":[39],"autoregressive":[40],"training":[41],"process.":[42],"First,":[43],"visual":[44,79,152],"signals":[45],"are":[46,180],"encoded":[47],"into":[48],"embeddings,":[49],"together":[51],"with":[52,65,101,107,183],"tokens":[54],"form":[55],"an":[56],"sequence.":[59,84],"Emu":[60,121,161],"is":[61],"then":[62],"end-to-end":[63],"trained":[64],"unified":[67],"objective":[68],"of":[69,91,146],"classifying":[70],"the":[71,77,82,89],"next":[72,78],"token":[74],"regressing":[76],"embedding":[80],"versatile":[86],"multimodality":[87],"empowers":[88],"exploration":[90],"diverse":[92],"pretraining":[93],"sources":[95],"at":[96],"scale,":[97],"such":[98,173],"as":[99,112,114,124,174],"videos":[100],"frames":[103],"text,":[105,111],"webpages":[106],"well":[113],"web-scale":[115],"image-text":[116],"pairs":[117],"video-text":[119],"pairs.":[120],"serve":[123],"generalist":[126],"interface":[128],"for":[129],"both":[130],"image-to-text":[131],"text-to-image":[133,159],"tasks,":[134],"supports":[136],"in-context":[137],"image":[138,150],"generation.":[141],"Across":[142],"broad":[144],"range":[145],"zero-shot/few-shot":[147],"tasks":[148],"including":[149],"captioning,":[151],"question":[153,156],"answering,":[154],"video":[155],"answering":[157],"generation,":[160],"demonstrates":[162],"superb":[163],"performance":[164],"compared":[165],"to":[166],"state-of-the-art":[167],"large":[168],"models.":[170],"Extended":[171],"capabilities":[172],"assistants":[176],"via":[177],"instruction":[178],"tuning":[179],"also":[181],"demonstrated":[182],"impressive":[184],"performance.":[185]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":18},{"year":2023,"cited_by_count":7}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2023-07-13T00:00:00"}
