{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T17:59:02Z","timestamp":1777399142776,"version":"3.51.4"},"reference-count":85,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100003995","name":"Anhui Provincial Natural Science Foundation","doi-asserted-by":"publisher","award":["2308085QF200"],"award-info":[{"award-number":["2308085QF200"]}],"id":[{"id":"10.13039\/501100003995","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62301521"],"award-info":[{"award-number":["62301521"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23B2053"],"award-info":[{"award-number":["U23B2053"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1016\/j.neunet.2025.107562","type":"journal-article","created":{"date-parts":[[2025,5,12]],"date-time":"2025-05-12T15:37:57Z","timestamp":1747064277000},"page":"107562","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":30,"special_numbering":"C","title":["Explicit estimation of magnitude and phase spectra in parallel for high-quality speech 
enhancement"],"prefix":"10.1016","volume":"189","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8026-0702","authenticated-orcid":false,"given":"Ye-Xin","family":"Lu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6668-022X","authenticated-orcid":false,"given":"Yang","family":"Ai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7853-5273","authenticated-orcid":false,"given":"Zhen-Hua","family":"Ling","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2025.107562_b1","doi-asserted-by":"crossref","first-page":"2477","DOI":"10.1109\/TASLP.2024.3393718","article-title":"CMGAN: Conformer-based metric-GAN for monaural speech enhancement","volume":"32","author":"Abdulatif","year":"2024","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b2","doi-asserted-by":"crossref","unstructured":"Abel, J., Strake, M., & Fingscheidt, T. (2018). A simple cepstral domain DNN approach to artificial speech bandwidth extension. In Proc. ICASSP (pp. 5469\u20135473).","DOI":"10.1109\/ICASSP.2018.8462362"},{"key":"10.1016\/j.neunet.2025.107562_b3","doi-asserted-by":"crossref","unstructured":"Ai, Y., & Ling, Z.-H. (2023). Neural Speech Phase Prediction based on Parallel Estimation Architecture and Anti-Wrapping Losses. In Proc. ICASSP (pp. 1\u20135).","DOI":"10.1109\/ICASSP49357.2023.10096553"},{"key":"10.1016\/j.neunet.2025.107562_b4","doi-asserted-by":"crossref","unstructured":"Ai, Y., Zhang, J.-X., Chen, L., & Ling, Z.-H. (2019). DNN-based spectral enhancement for neural waveform generators with low-bit quantization. In Proc. ICASSP (pp. 7025\u20137029).","DOI":"10.1109\/ICASSP.2019.8683016"},{"key":"10.1016\/j.neunet.2025.107562_b5","series-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"10.1016\/j.neunet.2025.107562_b6","series-title":"Speech enhancement","author":"Benesty","year":"2006"},{"key":"10.1016\/j.neunet.2025.107562_b7","doi-asserted-by":"crossref","unstructured":"Berouti, M., Schwartz, R., & Makhoul, J. (1979). Enhancement of speech corrupted by acoustic noise. Vol. 4, In Proc. ICASSP (pp. 208\u2013211).","DOI":"10.1109\/ICASSP.1979.1170788"},{"key":"10.1016\/j.neunet.2025.107562_b8","doi-asserted-by":"crossref","unstructured":"Chinen, M., Lim, F. S., Skoglund, J., Gureev, N., O\u2019Gorman, F., & Hines, A. (2020). ViSQOL v3: An open source production ready objective speech and audio metric. In Proc. qoMEX (pp. 1\u20136).","DOI":"10.1109\/QoMEX48832.2020.9123150"},{"key":"10.1016\/j.neunet.2025.107562_b9","unstructured":"Choi, H.-S., Kim, J.-H., Huh, J., Kim, A., Ha, J.-W., & Lee, K. (2019). Phase-aware speech enhancement with deep complex U-Net. In Proc. ICLR."},{"key":"10.1016\/j.neunet.2025.107562_b10","doi-asserted-by":"crossref","unstructured":"Dang, F., Chen, H., & Zhang, P. (2022). DPT-FSNet: Dual-path transformer based full-band and sub-band fusion network for speech enhancement. In Proc. ICASSP (pp. 6857\u20136861).","DOI":"10.1109\/ICASSP43922.2022.9746171"},{"key":"10.1016\/j.neunet.2025.107562_b11","doi-asserted-by":"crossref","unstructured":"D\u00e9fossez, A., Synnaeve, G., & Adi, Y. (2020). Real Time Speech Enhancement in the Waveform Domain. In Proc. Interspeech (pp. 
3291\u20133295).","DOI":"10.21437\/Interspeech.2020-2409"},{"issue":"1","key":"10.1016\/j.neunet.2025.107562_b12","doi-asserted-by":"crossref","first-page":"45","DOI":"10.1016\/0167-6393(91)90027-Q","article-title":"Speech enhancement from noise: A regenerative approach","volume":"10","author":"Dendrinos","year":"1991","journal-title":"Speech Communication"},{"issue":"6","key":"10.1016\/j.neunet.2025.107562_b13","doi-asserted-by":"crossref","first-page":"600","DOI":"10.1097\/AUD.0000000000000028","article-title":"The effect of hearing aid noise reduction on listening effort in hearing-impaired adults","volume":"35","author":"Desjardins","year":"2014","journal-title":"Ear and Hearing"},{"issue":"10","key":"10.1016\/j.neunet.2025.107562_b14","doi-asserted-by":"crossref","first-page":"1526","DOI":"10.1109\/5.168664","article-title":"Statistical-model-based speech enhancement systems","volume":"80","author":"Ephraim","year":"1992","journal-title":"Proceedings of the IEEE"},{"issue":"4","key":"10.1016\/j.neunet.2025.107562_b15","doi-asserted-by":"crossref","first-page":"251","DOI":"10.1109\/89.397090","article-title":"A signal subspace approach for speech enhancement","volume":"3","author":"Ephraim","year":"1995","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"key":"10.1016\/j.neunet.2025.107562_b16","doi-asserted-by":"crossref","unstructured":"Erdogan, H., Hershey, J. R., Watanabe, S., & Le Roux, J. (2015). Phase-sensitive and recognition-boosted speech separation using deep recurrent neural networks. In Proc. ICASSP (pp. 708\u2013712).","DOI":"10.1109\/ICASSP.2015.7178061"},{"key":"10.1016\/j.neunet.2025.107562_b17","doi-asserted-by":"crossref","unstructured":"Ernst, O., Chazan, S. E., Gannot, S., & Goldberger, J. (2018). Speech dereverberation using fully convolutional networks. In Proc. EUSiPCO (pp. 390\u2013394).","DOI":"10.23919\/EUSIPCO.2018.8553141"},{"key":"10.1016\/j.neunet.2025.107562_b18","unstructured":"Fu, S.-W., Liao, C.-F., Tsao, Y., & Lin, S.-D. (2019). MetricGAN: Generative adversarial networks based black-box metric scores optimization for speech enhancement. In Proc. ICML (pp. 2031\u20132041)."},{"key":"10.1016\/j.neunet.2025.107562_b19","doi-asserted-by":"crossref","unstructured":"Fu, Y., Liu, Y., Li, J., Luo, D., Lv, S., Jv, Y., & Xie, L. (2022). Uformer: A unet based dilated complex & real dual-path conformer network for simultaneous speech enhancement and dereverberation. In Proc. ICASSP (pp. 7417\u20137421).","DOI":"10.1109\/ICASSP43922.2022.9746020"},{"key":"10.1016\/j.neunet.2025.107562_b20","doi-asserted-by":"crossref","unstructured":"Fu, S.-W., Yu, C., Hsieh, T.-A., Plantinga, P., Ravanelli, M., Lu, X., & Tsao, Y. (2021). MetricGAN+: An improved version of MetricGAN for speech enhancement. In Proc. Interspeech (pp. 201\u2013205).","DOI":"10.21437\/Interspeech.2021-599"},{"key":"10.1016\/j.neunet.2025.107562_b21","unstructured":"Glorot, X., Bordes, A., & Bengio, Y. (2011). Deep sparse rectifier neural networks. In Proc. AISTATS (pp. 315\u2013323)."},{"issue":"5","key":"10.1016\/j.neunet.2025.107562_b22","doi-asserted-by":"crossref","first-page":"380","DOI":"10.1109\/TASSP.1976.1162849","article-title":"Distance measures for speech processing","volume":"24","author":"Gray","year":"1976","journal-title":"IEEE Transactions on Acoustics, Speech and Signal Processing"},{"key":"10.1016\/j.neunet.2025.107562_b23","unstructured":"Gritsenko, A., Salimans, T., van den Berg, R., Snoek, J., & Kalchbrenner, N. (2020). 
A spectral energy distance for parallel speech synthesis. Vol. 33, In Proc. NeurIPS (pp. 13062\u201313072)."},{"key":"10.1016\/j.neunet.2025.107562_b24","doi-asserted-by":"crossref","unstructured":"Hao, X., Su, X., Horaud, R., & Li, X. (2021). FullSubNet: A full-band and sub-band fusion model for real-time single-channel speech enhancement. In Proc. ICASSP (pp. 6633\u20136637).","DOI":"10.1109\/ICASSP39728.2021.9414177"},{"key":"10.1016\/j.neunet.2025.107562_b25","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015). Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. In Proc. ICCV (pp. 1026\u20131034).","DOI":"10.1109\/ICCV.2015.123"},{"issue":"1","key":"10.1016\/j.neunet.2025.107562_b26","doi-asserted-by":"crossref","first-page":"229","DOI":"10.1109\/TASL.2007.911054","article-title":"Evaluation of objective quality measures for speech enhancement","volume":"16","author":"Hu","year":"2007","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b27","unstructured":"Hu, G., & Wang, D. (2001). Speech segregation based on pitch tracking and amplitude modulation. In Proc. WASPAA (pp. 79\u201382)."},{"key":"10.1016\/j.neunet.2025.107562_b28","doi-asserted-by":"crossref","unstructured":"Kim, E., & Seo, H. (2021). SE-Conformer: Time-Domain Speech Enhancement Using Conformer. In Proc. Interspeech (pp. 2736\u20132740).","DOI":"10.21437\/Interspeech.2021-2207"},{"key":"10.1016\/j.neunet.2025.107562_b29","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s13634-016-0306-6","article-title":"A summary of the REVERB challenge: state-of-the-art and remaining challenges in reverberant speech processing research","volume":"2016","author":"Kinoshita","year":"2016","journal-title":"EURASIP Journal on Advances in Signal Processing"},{"key":"10.1016\/j.neunet.2025.107562_b30","doi-asserted-by":"crossref","first-page":"1600","DOI":"10.1109\/TASLP.2022.3155286","article-title":"SkipConvGAN: Monaural speech dereverberation using generative adversarial networks via complex time-frequency masking","volume":"30","author":"Kothapally","year":"2022","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b31","doi-asserted-by":"crossref","first-page":"1712","DOI":"10.1109\/TASLP.2024.3358720","article-title":"Monaural speech dereverberation using deformable convolutional networks","volume":"32","author":"Kothapally","year":"2024","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b32","unstructured":"Kuleshov, V., Enam, S. Z., & Ermon, S. (2017). Audio super-resolution using neural nets. In Proc. ICLR (workshop track)."},{"key":"10.1016\/j.neunet.2025.107562_b33","unstructured":"Le Roux, J., Kameoka, H., Ono, N., & Sagayama, S. (2010). Fast signal reconstruction from magnitude STFT spectrogram based on spectrogram consistency. Vol. 10, In Proc. dAFx (pp. 397\u2013403)."},{"key":"10.1016\/j.neunet.2025.107562_b34","unstructured":"Le Roux, J., Ono, N., & Sagayama, S. (2008). Explicit consistency constraints for STFT spectrograms and their application to phase reconstruction.. In Proc. SAPA (pp. 
23\u201328)."},{"issue":"2","key":"10.1016\/j.neunet.2025.107562_b35","doi-asserted-by":"crossref","first-page":"370","DOI":"10.1109\/JSTSP.2019.2904183","article-title":"Phasebook and friends: Leveraging discrete representations for source separation","volume":"13","author":"Le Roux","year":"2019","journal-title":"IEEE Journal of Selected Topics in Signal Processing"},{"key":"10.1016\/j.neunet.2025.107562_b36","doi-asserted-by":"crossref","unstructured":"Le Roux, J., Wisdom, S., Erdogan, H., & Hershey, J. R. (2019). SDR\u2013half-baked or well done?. In Proc. ICASSP (pp. 626\u2013630).","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"10.1016\/j.neunet.2025.107562_b37","doi-asserted-by":"crossref","unstructured":"Li, K., & Lee, C.-H. (2015). A deep neural network approach to speech bandwidth expansion. In Proc. ICASSP (pp. 4395\u20134399).","DOI":"10.1109\/ICASSP.2015.7178801"},{"key":"10.1016\/j.neunet.2025.107562_b38","doi-asserted-by":"crossref","unstructured":"Li, X., Li, J., & Yan, Y. (2017). Ideal Ratio Mask Estimation Using Deep Neural Networks for Monaural Speech Segregation in Noisy Reverberant Conditions.. In Proc. Interspeech (pp. 1203\u20131207).","DOI":"10.21437\/Interspeech.2017-549"},{"key":"10.1016\/j.neunet.2025.107562_b39","doi-asserted-by":"crossref","first-page":"1829","DOI":"10.1109\/TASLP.2021.3079813","article-title":"Two heads are better than one: A two-stage complex spectral mapping approach for monaural speech enhancement","volume":"29","author":"Li","year":"2021","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b40","doi-asserted-by":"crossref","unstructured":"Li, A., You, S., Yu, G., Zheng, C., & Li, X. (2022). Taylor, can you hear me now? a taylor-unfolding framework for monaural speech enhancement. In Proc. IJCAI (pp. 4193\u20134200).","DOI":"10.24963\/ijcai.2022\/582"},{"key":"10.1016\/j.neunet.2025.107562_b41","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106683","article-title":"A dual-region speech enhancement method based on voiceprint segmentation","volume":"180","author":"Li","year":"2024","journal-title":"Neural Networks"},{"issue":"3","key":"10.1016\/j.neunet.2025.107562_b42","doi-asserted-by":"crossref","first-page":"197","DOI":"10.1109\/TASSP.1978.1163086","article-title":"All-pole modeling of degraded speech","volume":"26","author":"Lim","year":"1978","journal-title":"IEEE Transactions on Acoustics, Speech and Signal Processing"},{"key":"10.1016\/j.neunet.2025.107562_b43","doi-asserted-by":"crossref","unstructured":"Lim, T. Y., Yeh, R. A., Xu, Y., Do, M. N., & Hasegawa-Johnson, M. (2018). Time-frequency networks for audio super-resolution. In Proc. ICASSP (pp. 646\u2013650).","DOI":"10.1109\/ICASSP.2018.8462049"},{"key":"10.1016\/j.neunet.2025.107562_b44","doi-asserted-by":"crossref","unstructured":"Lincoln, M., McCowan, I., Vepa, J., & Maganti, H. K. (2005). The multi-channel wall street journal audio visual corpus (MC-WSJ-AV): Specification and initial experiments. In Proc. ASRU (pp. 
357\u2013362).","DOI":"10.1109\/ASRU.2005.1566470"},{"issue":"3","key":"10.1016\/j.neunet.2025.107562_b45","doi-asserted-by":"crossref","first-page":"35","DOI":"10.1109\/MSP.2014.2359987","article-title":"Deep learning for acoustic modeling in parametric speech generation: A systematic review of existing techniques and future trends","volume":"32","author":"Ling","year":"2015","journal-title":"IEEE Signal Processing Magazine"},{"key":"10.1016\/j.neunet.2025.107562_b46","doi-asserted-by":"crossref","unstructured":"Liu, H., Choi, W., Liu, X., Kong, Q., Tian, Q., & Wang, D. (2022). Neural Vocoder is All You Need for Speech Super-resolution. In Proc. Interspeech (pp. 4227\u20134231).","DOI":"10.21437\/Interspeech.2022-11017"},{"key":"10.1016\/j.neunet.2025.107562_b47","doi-asserted-by":"crossref","unstructured":"Liu, L., Guan, H., Ma, J., Dai, W., Wang, G., & Ding, S. (2023). A Mask Free Neural Network for Monaural Speech Enhancement. In Proc. Interspeech (pp. 2468\u20132472).","DOI":"10.21437\/Interspeech.2023-339"},{"key":"10.1016\/j.neunet.2025.107562_b48","series-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"key":"10.1016\/j.neunet.2025.107562_b49","doi-asserted-by":"crossref","unstructured":"Lu, Y.-X., Ai, Y., & Ling, Z.-H. (2023). MP-SENet: A Speech Enhancement Model with Parallel Denoising of Magnitude and Phase Spectra. In Proc. Interspeech (pp. 3834\u20133838).","DOI":"10.21437\/Interspeech.2023-1441"},{"issue":"7","key":"10.1016\/j.neunet.2025.107562_b50","doi-asserted-by":"crossref","first-page":"1717","DOI":"10.1109\/TASL.2010.2052251","article-title":"Speech dereverberation based on variance-normalized delayed linear prediction","volume":"18","author":"Nakatani","year":"2010","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b51","doi-asserted-by":"crossref","unstructured":"Narayanan, A., & Wang, D. (2013). Ideal ratio mask estimation using deep neural networks for robust speech recognition. In Proc. ICASSP (pp. 7092\u20137096).","DOI":"10.1109\/ICASSP.2013.6639038"},{"issue":"4","key":"10.1016\/j.neunet.2025.107562_b52","doi-asserted-by":"crossref","first-page":"465","DOI":"10.1016\/j.specom.2010.12.003","article-title":"The importance of phase in speech enhancement","volume":"53","author":"Paliwal","year":"2011","journal-title":"Speech Communication"},{"key":"10.1016\/j.neunet.2025.107562_b53","doi-asserted-by":"crossref","unstructured":"Pandey, A., & Wang, D. (2020). Densely connected neural network with dilated convolutions for real-time speech enhancement in the time domain. In Proc. ICASSP (pp. 6629\u20136633).","DOI":"10.1109\/ICASSP40776.2020.9054536"},{"key":"10.1016\/j.neunet.2025.107562_b54","doi-asserted-by":"crossref","unstructured":"Pascual, S., Bonafonte, A., & Serr\u00e0, J. (2017). SEGAN: Speech Enhancement Generative Adversarial Network. In Proc. Interspeech (pp. 3642\u20133646).","DOI":"10.21437\/Interspeech.2017-1428"},{"key":"10.1016\/j.neunet.2025.107562_b55","doi-asserted-by":"crossref","unstructured":"Reddy, C. K., Gopal, V., Cutler, R., Beyrami, E., Cheng, R., Dubey, H., Matusevych, S., Aichner, R., Aazami, A., & Braun, S. (2020). The INTERSPEECH 2020 Deep Noise Suppression Challenge: Datasets, Subjective Testing Framework, and Challenge Results. In Proc. Interspeech (pp. 2492\u20132496).","DOI":"10.21437\/Interspeech.2020-3038"},{"key":"10.1016\/j.neunet.2025.107562_b56","doi-asserted-by":"crossref","unstructured":"Rix, A. W., Beerends, J. 
G., Hollier, M. P., & Hekstra, A. P. (2001). Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs. Vol. 2, In Proc. ICASSP (pp. 749\u2013752).","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"10.1016\/j.neunet.2025.107562_b57","doi-asserted-by":"crossref","unstructured":"Robinson, T., Fransen, J., Pye, D., Foote, J., & Renals, S. (1995). WSJCAM0: a British English speech corpus for large vocabulary continuous speech recognition. Vol. 1, In Proc. ICASSP (pp. 81\u201384).","DOI":"10.1109\/ICASSP.1995.479278"},{"issue":"4","key":"10.1016\/j.neunet.2025.107562_b58","doi-asserted-by":"crossref","first-page":"2153","DOI":"10.1121\/1.3631668","article-title":"Intelligibility of reverberant noisy speech with ideal binary masking","volume":"130","author":"Roman","year":"2011","journal-title":"Journal of the Acoustical Society of America"},{"key":"10.1016\/j.neunet.2025.107562_b59","doi-asserted-by":"crossref","unstructured":"Shi, W., Caballero, J., Husz\u00e1r, F., Totz, J., Aitken, A. P., Bishop, R., Rueckert, D., & Wang, Z. (2016). Real-time single image and video super-resolution using an efficient sub-pixel convolutional neural network. In Proc. CVPR (pp. 1874\u20131883).","DOI":"10.1109\/CVPR.2016.207"},{"key":"10.1016\/j.neunet.2025.107562_b60","doi-asserted-by":"crossref","unstructured":"Sperber, M., Niehues, J., Neubig, G., St\u00fcker, S., & Waibel, A. (2018). Self-Attentional Acoustic Models. In Proc. Interspeech (pp. 3723\u20133727).","DOI":"10.21437\/Interspeech.2018-1910"},{"issue":"11","key":"10.1016\/j.neunet.2025.107562_b61","doi-asserted-by":"crossref","first-page":"1486","DOI":"10.1016\/j.specom.2006.09.003","article-title":"Binary and ratio time-frequency masks for robust speech recognition","volume":"48","author":"Srinivasan","year":"2006","journal-title":"Speech Communication"},{"issue":"7","key":"10.1016\/j.neunet.2025.107562_b62","doi-asserted-by":"crossref","first-page":"2125","DOI":"10.1109\/TASL.2011.2114881","article-title":"An algorithm for intelligibility prediction of time-frequency weighted noisy speech","volume":"19","author":"Taal","year":"2011","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b63","doi-asserted-by":"crossref","unstructured":"Tang, C., Luo, C., Zhao, Z., Xie, W., & Zeng, W. (2021). Joint time-frequency and time domain learning for speech enhancement. In Proc. IJCAI (pp. 3816\u20133822).","DOI":"10.24963\/ijcai.2020\/528"},{"key":"10.1016\/j.neunet.2025.107562_b64","doi-asserted-by":"crossref","unstructured":"Thiemann, J., Ito, N., & Vincent, E. (2013). The diverse environments multi-channel acoustic noise database (DEMAND): A database of multichannel environmental noise recordings. Vol. 19, In Proc. ICA. Article 035081.","DOI":"10.1121\/1.4799597"},{"key":"10.1016\/j.neunet.2025.107562_b65","series-title":"Instance normalization: The missing ingredient for fast stylization","author":"Ulyanov","year":"2016"},{"key":"10.1016\/j.neunet.2025.107562_b66","doi-asserted-by":"crossref","unstructured":"Valentini-Botinhao, C., Wang, X., Takaki, S., & Yamagishi, J. (2016). Investigating RNN-based speech enhancement methods for noise-robust Text-to-Speech.. In Proc. SSW (pp. 
146\u2013152).","DOI":"10.21437\/SSW.2016-24"},{"key":"10.1016\/j.neunet.2025.107562_b67","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2025.107562_b68","doi-asserted-by":"crossref","unstructured":"Veaux, C., Yamagishi, J., & King, S. (2013). The voice bank corpus: Design, collection and data analysis of a large regional accent speech database. In Proc. o-COCOSDA\/CASLRE (pp. 1\u20134).","DOI":"10.1109\/ICSDA.2013.6709856"},{"key":"10.1016\/j.neunet.2025.107562_b69","series-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit","author":"Veaux","year":"2017"},{"key":"10.1016\/j.neunet.2025.107562_b70","doi-asserted-by":"crossref","unstructured":"Wang, Z.-Q., Cornell, S., Choi, S., Lee, Y., Kim, B.-Y., & Watanabe, S. (2023). TF-GridNet: Making time-frequency domain models great again for monaural speaker separation. In Proc. ICASSP (pp. 1\u20135).","DOI":"10.1109\/ICASSP49357.2023.10094992"},{"key":"10.1016\/j.neunet.2025.107562_b71","doi-asserted-by":"crossref","unstructured":"Wang, K., He, B., & Zhu, W.-P. (2021). TSTNN: Two-stage transformer based neural network for speech enhancement in the time domain. In Proc. ICASSP (pp. 7098\u20137102).","DOI":"10.1109\/ICASSP39728.2021.9413740"},{"issue":"4","key":"10.1016\/j.neunet.2025.107562_b72","doi-asserted-by":"crossref","first-page":"679","DOI":"10.1109\/TASSP.1982.1163920","article-title":"The unimportance of phase in speech enhancement","volume":"30","author":"Wang","year":"1982","journal-title":"IEEE\/ACM Transactions on Acoustics, Speech, and Signal Processing"},{"issue":"12","key":"10.1016\/j.neunet.2025.107562_b73","doi-asserted-by":"crossref","first-page":"1849","DOI":"10.1109\/TASLP.2014.2352935","article-title":"On training targets for supervised speech separation","volume":"22","author":"Wang","year":"2014","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b74","doi-asserted-by":"crossref","first-page":"2058","DOI":"10.1109\/TASLP.2021.3054302","article-title":"Towards robust speech super-resolution","volume":"29","author":"Wang","year":"2021","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b75","doi-asserted-by":"crossref","first-page":"1778","DOI":"10.1109\/TASLP.2020.2998279","article-title":"Complex spectral mapping for single-and multi-channel speech enhancement and robust ASR","volume":"28","author":"Wang","year":"2020","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b76","doi-asserted-by":"crossref","first-page":"2018","DOI":"10.1109\/LSP.2021.3116502","article-title":"On the compensation between magnitude and phase in speech separation","volume":"28","author":"Wang","year":"2021","journal-title":"IEEE Signal Processing Letters"},{"key":"10.1016\/j.neunet.2025.107562_b77","doi-asserted-by":"crossref","unstructured":"Weninger, F., Erdogan, H., Watanabe, S., Vincent, E., Le Roux, J., Hershey, J. R., & Schuller, B. (2015). Speech enhancement with LSTM recurrent neural networks and its application to noise-robust ASR. In International conference on latent variable analysis and signal separation (LVA\/iCA) (pp. 
91\u201399).","DOI":"10.1007\/978-3-319-22482-4_11"},{"issue":"3","key":"10.1016\/j.neunet.2025.107562_b78","doi-asserted-by":"crossref","first-page":"483","DOI":"10.1109\/TASLP.2015.2512042","article-title":"Complex ratio masking for monaural speech separation","volume":"24","author":"Williamson","year":"2015","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b79","doi-asserted-by":"crossref","unstructured":"Wisdom, S., Hershey, J. R., Wilson, K., Thorpe, J., Chinen, M., Patton, B., & Saurous, R. A. (2019). Differentiable consistency constraints for improved deep speech enhancement. In Proc. ICASSP (pp. 900\u2013904).","DOI":"10.1109\/ICASSP.2019.8682783"},{"issue":"1","key":"10.1016\/j.neunet.2025.107562_b80","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1109\/TASLP.2014.2364452","article-title":"A regression approach to speech enhancement based on deep neural networks","volume":"23","author":"Xu","year":"2014","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.neunet.2025.107562_b81","doi-asserted-by":"crossref","unstructured":"Yin, D., Luo, C., Xiong, Z., & Zeng, W. (2020). PHASEN: A phase-and-harmonics-aware speech enhancement network. Vol. 34, In Proc. AAAI (pp. 9458\u20139465).","DOI":"10.1609\/aaai.v34i05.6489"},{"key":"10.1016\/j.neunet.2025.107562_b82","doi-asserted-by":"crossref","unstructured":"Yin, D., Zhao, Z., Tang, C., Xiong, Z., & Luo, C. (2023). TridentSE: Guiding Speech Enhancement with 32 Global Tokens. In Proc. Interspeech (pp. 3839\u20133843).","DOI":"10.21437\/Interspeech.2023-565"},{"key":"10.1016\/j.neunet.2025.107562_b83","doi-asserted-by":"crossref","unstructured":"Yu, G., Li, A., Zheng, C., Guo, Y., Wang, Y., & Wang, H. (2022). Dual-branch attention-in-attention transformer for single-channel speech enhancement. In Proc. ICASSP (pp. 7847\u20137851).","DOI":"10.1109\/ICASSP43922.2022.9746273"},{"key":"10.1016\/j.neunet.2025.107562_b84","doi-asserted-by":"crossref","unstructured":"Zhao, S., Ma, B., Watcharasupat, K. N., & Gan, W.-S. (2022). FRCRN: Boosting feature representation using frequency recurrence for monaural speech enhancement. In Proc. ICASSP (pp. 
9281\u20139285).","DOI":"10.1109\/ICASSP43922.2022.9747578"},{"issue":"1","key":"10.1016\/j.neunet.2025.107562_b85","doi-asserted-by":"crossref","first-page":"63","DOI":"10.1109\/TASLP.2018.2870742","article-title":"Phase-aware speech enhancement based on deep neural networks","volume":"27","author":"Zheng","year":"2018","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608025004411?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608025004411?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T10:39:57Z","timestamp":1749292797000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608025004411"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":85,"alternative-id":["S0893608025004411"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2025.107562","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2025,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Explicit estimation of magnitude and phase spectra in parallel for high-quality speech enhancement","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2025.107562","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"107562"}}
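The record above is a Crossref REST API work response (a status/message-type/message envelope) for DOI 10.1016/j.neunet.2025.107562. As a minimal illustrative sketch only, assuming the public Crossref endpoint https://api.crossref.org/works/{doi} and the third-party Python requests package (neither is part of the record itself), the same work record can be fetched and a few of the fields shown above can be read like this; field names mirror the JSON keys in the record.

# Minimal sketch: fetch the Crossref work record shown above and print a few fields.
# Assumption: public Crossref REST API at https://api.crossref.org/works/{doi} and the
# `requests` package are available; error handling is kept to a bare minimum.
import requests

doi = "10.1016/j.neunet.2025.107562"
resp = requests.get(f"https://api.crossref.org/works/{doi}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]          # the "message" object is the work record itself

print(work["title"][0])                # article title
print(work["container-title"][0])      # journal title: "Neural Networks"
print(work["DOI"], work["volume"], work.get("page"))
print("references:", work["reference-count"])
for author in work["author"]:          # "given"/"family"/"ORCID" as in the "author" array
    print(author.get("given"), author.get("family"), author.get("ORCID"))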