{"id":"https://openalex.org/W4280561221","doi":"https://doi.org/10.48550/arxiv.2205.04421","title":"NaturalSpeech: End-to-End Text to Speech Synthesis with Human-Level Quality","display_name":"NaturalSpeech: End-to-End Text to Speech Synthesis with Human-Level Quality","publication_year":2022,"publication_date":"2022-05-09","ids":{"openalex":"https://openalex.org/W4280561221","doi":"https://doi.org/10.48550/arxiv.2205.04421"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2205.04421","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.04421","pdf_url":"https://arxiv.org/pdf/2205.04421","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2205.04421","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100385893","display_name":"Xu Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tan, Xu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100362824","display_name":"Jiawei Chen","orcid":"https://orcid.org/0000-0002-9759-9747"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiawei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089924305","display_name":"Haohe Liu","orcid":"https://orcid.org/0000-0003-1036-7888"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Haohe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110372538","display_name":"Jian Cong","orcid":"https://orcid.org/0000-0003-3775-6883"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cong, Jian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120779464","display_name":"Chen Zhang","orcid":"https://orcid.org/0009-0004-3596-4683"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100360935","display_name":"Yanqing Liu","orcid":"https://orcid.org/0000-0003-0412-8805"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yanqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100442237","display_name":"Xi Wang","orcid":"https://orcid.org/0000-0002-0434-7939"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053168506","display_name":"Yichong Leng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leng, Yichong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113099688","display_name":"Yuanhao Yi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi, Yuanhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114859990","display_name":"Lei He","orcid":"https://orcid.org/0009-0001-7753-4700"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065394791","display_name":"Frank K. Soong","orcid":"https://orcid.org/0000-0002-9088-3577"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Soong, Frank","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020025718","display_name":"Tao Qin","orcid":"https://orcid.org/0000-0002-9095-0776"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100329353","display_name":"Sheng Zhao","orcid":"https://orcid.org/0000-0002-9624-5381"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Sheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101884287","display_name":"Tie\u2010Yan Liu","orcid":"https://orcid.org/0000-0002-0476-8020"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Tie-Yan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5100385893"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":35,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9588000178337097,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9538000226020813,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.8031991124153137},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7092844247817993},{"id":"https://openalex.org/keywords/wilcoxon-signed-rank-test","display_name":"Wilcoxon signed-rank test","score":0.6634155511856079},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5390225648880005},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5333836674690247},{"id":"https://openalex.org/keywords/quality-score","display_name":"Quality Score","score":0.5104817152023315},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5067275166511536},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.5050714612007141},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4721567630767822},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.46601182222366333},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4305807948112488},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.36967146396636963},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.15011322498321533},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1243928074836731},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08156415820121765}],"concepts":[{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.8031991124153137},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7092844247817993},{"id":"https://openalex.org/C206041023","wikidata":"https://www.wikidata.org/wiki/Q1751970","display_name":"Wilcoxon signed-rank test","level":3,"score":0.6634155511856079},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5390225648880005},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5333836674690247},{"id":"https://openalex.org/C2779346075","wikidata":"https://www.wikidata.org/wiki/Q7268763","display_name":"Quality Score","level":3,"score":0.5104817152023315},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5067275166511536},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.5050714612007141},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4721567630767822},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.46601182222366333},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4305807948112488},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36967146396636963},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.15011322498321533},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1243928074836731},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08156415820121765},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C12868164","wikidata":"https://www.wikidata.org/wiki/Q1424533","display_name":"Mann\u2013Whitney U test","level":2,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2205.04421","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.04421","pdf_url":"https://arxiv.org/pdf/2205.04421","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2205.04421","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2205.04421","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2205.04421","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.04421","pdf_url":"https://arxiv.org/pdf/2205.04421","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.44999998807907104,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G8857457899","display_name":null,"funder_award_id":"EP/T019751/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2033549879","https://openalex.org/W2013629269","https://openalex.org/W2096935541","https://openalex.org/W4221158116","https://openalex.org/W2943685190","https://openalex.org/W4391020683","https://openalex.org/W3027116454","https://openalex.org/W2986204340","https://openalex.org/W3047188278","https://openalex.org/W4304098873"],"abstract_inverted_index":{"Text":[0],"to":[1,30,36,64,93,100,151],"speech":[2],"(TTS)":[3],"has":[4],"made":[5],"rapid":[6],"progress":[7],"in":[8,13,131],"both":[9],"academia":[10],"and":[11,34,60,67,109,127],"industry":[12],"recent":[14],"years.":[15],"Some":[16],"questions":[17,45],"naturally":[18],"arise":[19],"that":[20,32,75,140],"whether":[21],"a":[22,70,80,86,128],"TTS":[23,71],"system":[24,72],"can":[25],"achieve":[26,37],"human-level":[27,50,77],"quality,":[28],"how":[29,35],"define/judge":[31],"quality":[33,51,78],"it.":[38],"In":[39],"this":[40,182],"paper,":[41],"we":[42,84],"answer":[43],"these":[44],"by":[46],"first":[47,179],"defining":[48],"the":[49,54,102,105,111,114,155,178],"based":[52],"on":[53,79,135,181],"statistical":[55],"significance":[56],"of":[57,104,113],"subjective":[58],"measure":[59],"introducing":[61],"appropriate":[62],"guidelines":[63],"judge":[65],"it,":[66],"then":[68],"developing":[69],"called":[73],"NaturalSpeech":[74,143],"achieves":[76,144],"benchmark":[81],"dataset.":[82,183],"Specifically,":[83],"leverage":[85],"variational":[87],"autoencoder":[88],"(VAE)":[89],"for":[90,177],"end-to-end":[91],"text":[92,108],"waveform":[94],"generation,":[95],"with":[96,158],"several":[97],"key":[98],"modules":[99],"enhance":[101],"capacity":[103],"prior":[106],"from":[107,116,174],"reduce":[110],"complexity":[112],"posterior":[115],"speech,":[117],"including":[118],"phoneme":[119],"pre-training,":[120],"differentiable":[121],"duration":[122],"modeling,":[123,126],"bidirectional":[124],"prior/posterior":[125],"memory":[129],"mechanism":[130],"VAE.":[132],"Experiment":[133],"evaluations":[134],"popular":[136],"LJSpeech":[137],"dataset":[138],"show":[139],"our":[141],"proposed":[142],"-0.01":[145],"CMOS":[146],"(comparative":[147],"mean":[148],"opinion":[149],"score)":[150],"human":[152,175],"recordings":[153,176],"at":[154,163],"sentence":[156],"level,":[157],"Wilcoxon":[159],"signed":[160],"rank":[161],"test":[162],"p-level":[164],"p":[165],"&gt;&gt;":[166],"0.05,":[167],"which":[168],"demonstrates":[169],"no":[170],"statistically":[171],"significant":[172],"difference":[173],"time":[180]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":10},{"year":2023,"cited_by_count":21}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2022-05-22T00:00:00"}
