{"id":"https://openalex.org/W4394906559","doi":"https://doi.org/10.48550/arxiv.2404.10667","title":"VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time","display_name":"VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time","publication_year":2024,"publication_date":"2024-04-16","ids":{"openalex":"https://openalex.org/W4394906559","doi":"https://doi.org/10.48550/arxiv.2404.10667"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2404.10667","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.10667","pdf_url":"https://arxiv.org/pdf/2404.10667","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2404.10667","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5077900121","display_name":"Sicheng Xu","orcid":"https://orcid.org/0000-0002-7903-3934"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xu, Sicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100623859","display_name":"Guojun Chen","orcid":"https://orcid.org/0000-0002-5896-2865"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Guojun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100787136","display_name":"Yuxiao Guo","orcid":"https://orcid.org/0009-0007-7082-5664"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Yu-Xiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076804411","display_name":"Jiaolong Yang","orcid":"https://orcid.org/0000-0002-7314-6567"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jiaolong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100424964","display_name":"Chong Li","orcid":"https://orcid.org/0000-0003-4549-4692"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058247066","display_name":"Zhenyu Zang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zang, Zhenyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101743497","display_name":"Yizhong Zhang","orcid":"https://orcid.org/0000-0002-4312-0040"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yizhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100784734","display_name":"Xin Tong","orcid":"https://orcid.org/0000-0001-8788-2453"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tong, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101666011","display_name":"Baining Guo","orcid":"https://orcid.org/0000-0001-8349-8868"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Baining","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5077900121"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9314000010490417,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9314000010490417,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9093999862670898,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9049000144004822,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5355268716812134},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4246358871459961},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.374449759721756},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3428995907306671},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3399474620819092}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5355268716812134},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4246358871459961},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.374449759721756},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3428995907306671},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3399474620819092}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2404.10667","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.10667","pdf_url":"https://arxiv.org/pdf/2404.10667","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2404.10667","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2404.10667","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2404.10667","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.10667","pdf_url":"https://arxiv.org/pdf/2404.10667","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"We":[0],"introduce":[1],"VASA,":[2],"a":[3,17,22,48,71,83,107],"framework":[4],"for":[5,161],"generating":[6,35],"lifelike":[7,165],"talking":[8],"faces":[9],"with":[10,42,133,153,164],"appealing":[11],"visual":[12],"affective":[13],"skills":[14],"(VAS)":[15],"given":[16],"single":[18],"static":[19],"image":[20],"and":[21,54,65,75,87,94,136],"speech":[23],"audio":[24],"clip.":[25],"Our":[26,125],"premiere":[27],"model,":[28],"VASA-1,":[29],"is":[30],"capable":[31],"of":[32,51,63,90,109,145],"not":[33,127],"only":[34,128],"lip":[36],"movements":[37],"that":[38,58,80,114,167],"are":[39],"exquisitely":[40],"synchronized":[41],"the":[43,61,88,142,159],"audio,":[44],"but":[45,139],"also":[46,140],"producing":[47],"large":[49],"spectrum":[50],"facial":[52,73,135],"nuances":[53],"natural":[55],"head":[56,76,137],"motions":[57],"contribute":[59],"to":[60,150],"perception":[62],"authenticity":[64],"liveliness.":[66],"The":[67],"core":[68],"innovations":[69],"include":[70],"holistic":[72],"dynamics":[74,138],"movement":[77],"generation":[78,144],"model":[79],"works":[81],"in":[82],"face":[84,96],"latent":[85,97],"space,":[86],"development":[89],"such":[91],"an":[92],"expressive":[93],"disentangled":[95],"space":[98],"using":[99],"videos.":[100],"Through":[101],"extensive":[102],"experiments":[103],"including":[104],"evaluation":[105],"on":[106],"set":[108],"new":[110],"metrics,":[111],"we":[112],"show":[113],"our":[115],"method":[116,126],"significantly":[117],"outperforms":[118],"previous":[119],"methods":[120],"along":[121],"various":[122],"dimensions":[123],"comprehensively.":[124],"delivers":[129],"high":[130],"video":[131],"quality":[132],"realistic":[134],"supports":[141],"online":[143],"512x512":[146],"videos":[147],"at":[148],"up":[149],"40":[151],"FPS":[152],"negligible":[154],"starting":[155],"latency.":[156],"It":[157],"paves":[158],"way":[160],"real-time":[162],"engagements":[163],"avatars":[166],"emulate":[168],"human":[169],"conversational":[170],"behaviors.":[171]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":6}],"updated_date":"2026-04-28T06:04:28.489925","created_date":"2025-10-10T00:00:00"}
