{"id":"https://openalex.org/W7117120441","doi":"https://doi.org/10.48550/arxiv.2512.18706","title":"X-Talk: On the Underestimated Potential of Modular Speech-to-Speech Dialogue System","display_name":"X-Talk: On the Underestimated Potential of Modular Speech-to-Speech Dialogue System","publication_year":2025,"publication_date":"2025-12-21","ids":{"openalex":"https://openalex.org/W7117120441","doi":"https://doi.org/10.48550/arxiv.2512.18706"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2512.18706","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.18706","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2512.18706","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121131055","display_name":"Zhanxun Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Zhanxun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121176022","display_name":"Yifan Duan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duan, Yifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121205237","display_name":"Mengmeng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Mengmeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121209910","display_name":"Pengchao Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Pengchao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121238769","display_name":"Haotian Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Haotian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088133604","display_name":"Xiaoyu Xing","orcid":"https://orcid.org/0000-0003-0539-0936"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Xiaoyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111330192","display_name":"Yijia Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Yijia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066440895","display_name":"Haina Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Haina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109714651","display_name":"Yuhang Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050188515","display_name":"Chaochao Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Chaochao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000302538","display_name":"Xipeng QIu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu, Xipeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121230082","display_name":"Lei Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121131709","display_name":"Lan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Lan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121222474","display_name":"Nan Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Nan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121148448","display_name":"Zilong Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zilong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121128678","display_name":"Ziyang Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Ziyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121170737","display_name":"Kai Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5121218473","display_name":"Xie Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":18,"corresponding_author_ids":["https://openalex.org/A5121131055"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.5317999720573425,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.5317999720573425,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.13860000669956207,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.12919999659061432,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.8802000284194946},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5573999881744385},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3456999957561493},{"id":"https://openalex.org/keywords/modular-construction","display_name":"Modular construction","score":0.32659998536109924},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.3046000003814697}],"concepts":[{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.8802000284194946},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.680899977684021},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5573999881744385},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.36149999499320984},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3456999957561493},{"id":"https://openalex.org/C2777614354","wikidata":"https://www.wikidata.org/wiki/Q55648394","display_name":"Modular construction","level":3,"score":0.32659998536109924},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.31380000710487366},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C88482812","wikidata":"https://www.wikidata.org/wiki/Q6453666","display_name":"Modular programming","level":2,"score":0.2809999883174896},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2784999907016754},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2660999894142151},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2651999890804291}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2512.18706","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.18706","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2512.18706","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.18706","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"X-Talk,":[2],"an":[3],"open-source":[4],"framework":[5,67],"that":[6,52],"champions":[7],"a":[8,43,53,114],"decoupled,":[9],"modular":[10,64,109],"design":[11],"for":[12,117],"LLM-driven":[13],"speech-to-speech":[14],"(S2S)":[15],"systems.":[16],"While":[17],"the":[18,35,100,105],"dominant":[19],"trend":[20],"favors":[21],"end-to-end":[22],"(E2E)":[23],"modeling":[24],"to":[25,33],"optimize":[26],"information":[27],"flow,":[28],"these":[29],"\"omni-models\"":[30],"often":[31],"struggle":[32],"balance":[34],"competing":[36],"objectives":[37],"of":[38,108],"complex":[39],"speech":[40,75],"tasks":[41],"within":[42],"single":[44],"network.":[45],"X-Talk":[46,103],"challenges":[47],"this":[48],"paradigm":[49],"by":[50],"demonstrating":[51],"systematically":[54],"optimized":[55],"cascaded":[56,101],"pipeline":[57],"can":[58],"achieve":[59],"sub-second":[60],"latency":[61],"without":[62],"sacrificing":[63],"flexibility.":[65],"Our":[66],"seamlessly":[68],"integrates":[69],"specialized":[70],"front-end":[71],"components":[72],"(e.g.,":[73,81],"VAD,":[74],"enhancement)":[76],"and":[77,84,95,112,120],"diverse":[78],"understanding":[79],"models":[80],"ASR,":[82],"emotion,":[83],"environmental":[85],"sound":[86],"analysis)":[87],"with":[88],"LLM":[89],"capabilities":[90],"like":[91],"retrieval-augmented":[92],"generation":[93],"(RAG)":[94],"tool":[96],"use.":[97],"By":[98],"revitalizing":[99],"approach,":[102],"highlights":[104],"underestimated":[106],"potential":[107],"S2S":[110],"systems":[111],"provides":[113],"robust":[115],"foundation":[116],"future":[118],"research":[119],"applications.":[121]},"counts_by_year":[],"updated_date":"2025-12-24T23:14:05.333182","created_date":"2025-12-24T00:00:00"}
