{"id":"https://openalex.org/W7125908648","doi":"https://doi.org/10.1145/3774934.3786429","title":"CCL-D: A High-Precision Diagnostic System for Slow and Hang Anomalies in Large-Scale Model Training","display_name":"CCL-D: A High-Precision Diagnostic System for Slow and Hang Anomalies in Large-Scale Model Training","publication_year":2026,"publication_date":"2026-01-28","ids":{"openalex":"https://openalex.org/W7125908648","doi":"https://doi.org/10.1145/3774934.3786429"},"language":null,"primary_location":{"id":"doi:10.1145/3774934.3786429","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3774934.3786429","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123337664","display_name":"Yida Gu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yida Gu","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0007-0712-8575","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124119973","display_name":"Fakang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fakang Wang","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-5522-0217","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jianhao Fu","orcid":"https://orcid.org/0009-0008-5109-5413"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jianhao Fu","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0008-5109-5413","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021101164","display_name":"Zhenhang Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhenhang Sun","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0002-8424-4979","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Qianyu Zhang","orcid":"https://orcid.org/0000-0002-3805-4480"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qianyu Zhang","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-3805-4480","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087484624","display_name":"Hairui Zhao","orcid":"https://orcid.org/0009-0008-7081-5172"},"institutions":[{"id":"https://openalex.org/I194450716","display_name":"Jilin University","ror":"https://ror.org/00js3aw79","country_code":"CN","type":"education","lineage":["https://openalex.org/I194450716"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hairui Zhao","raw_affiliation_strings":["Jilin University, Changchun, China"],"raw_orcid":"https://orcid.org/0009-0008-7081-5172","affiliations":[{"raw_affiliation_string":"Jilin University, Changchun, China","institution_ids":["https://openalex.org/I194450716"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124075352","display_name":"Xingchen Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingchen Liu","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-9410-5365","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yang Tian","orcid":"https://orcid.org/0009-0000-1519-466X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang Tian","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-1519-466X","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124070204","display_name":"Wenjing Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjing Huang","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-5541-3519","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zedong Liu","orcid":"https://orcid.org/0009-0008-7625-2008"},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zedong Liu","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-7625-2008","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123280944","display_name":"Yifan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yifan Chen","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-1158-6049","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103365604","display_name":"J. Y. Yang","orcid":"https://orcid.org/0009-0006-9307-0391"},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinwu Yang","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-9307-0391","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124149835","display_name":"Yueyuan Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yueyuan Zhou","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-1735-5072","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124127584","display_name":"Qian ZHAO","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian Zhao","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0003-7108-9735","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124064297","display_name":"Haoxu Li","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoxu Li","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-9480-0856","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124102367","display_name":"Tao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao Wang","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-2779-9057","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Feng Yu","orcid":"https://orcid.org/0009-0004-0194-8941"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng Yu","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0004-0194-8941","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124086366","display_name":"Zhan Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhan Wang","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0003-4274-7671","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124101702","display_name":"Guangming Tan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangming Tan","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-6361-5948","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124096882","display_name":"Dingwen Tao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dingwen Tao","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5422-4497","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":20,"corresponding_author_ids":["https://openalex.org/A5123337664"],"corresponding_institution_ids":["https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15084115,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"425","last_page":"438"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9368000030517578,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9368000030517578,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10400","display_name":"Network Security and Intrusion Detection","score":0.014999999664723873,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.008200000040233135,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/anomaly-detection","display_name":"Anomaly detection","score":0.6500999927520752},{"id":"https://openalex.org/keywords/anomaly","display_name":"Anomaly (physics)","score":0.5594000220298767},{"id":"https://openalex.org/keywords/tracing","display_name":"Tracing","score":0.5479999780654907},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.43470001220703125},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.3702000081539154},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.35510000586509705}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6718000173568726},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.6500999927520752},{"id":"https://openalex.org/C12997251","wikidata":"https://www.wikidata.org/wiki/Q567560","display_name":"Anomaly (physics)","level":2,"score":0.5594000220298767},{"id":"https://openalex.org/C138673069","wikidata":"https://www.wikidata.org/wiki/Q322229","display_name":"Tracing","level":2,"score":0.5479999780654907},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46869999170303345},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.43470001220703125},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3815000057220459},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.3702000081539154},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.36230000853538513},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.35510000586509705},{"id":"https://openalex.org/C171078966","wikidata":"https://www.wikidata.org/wiki/Q111029","display_name":"Root (linguistics)","level":2,"score":0.3100000023841858},{"id":"https://openalex.org/C2781323245","wikidata":"https://www.wikidata.org/wiki/Q1363761","display_name":"Hang","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C84525736","wikidata":"https://www.wikidata.org/wiki/Q831366","display_name":"Decision tree","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C84945661","wikidata":"https://www.wikidata.org/wiki/Q7366567","display_name":"Root cause","level":2,"score":0.2531000077724457}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3774934.3786429","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3774934.3786429","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4035241901874542,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1964981582","https://openalex.org/W2009428216","https://openalex.org/W2767472765","https://openalex.org/W2969388332","https://openalex.org/W3129831491","https://openalex.org/W3130554079","https://openalex.org/W3137759927","https://openalex.org/W3190806564","https://openalex.org/W4294433898","https://openalex.org/W4360831831","https://openalex.org/W4386113246","https://openalex.org/W4386768656","https://openalex.org/W4387302750","https://openalex.org/W4389262616","https://openalex.org/W4394862623","https://openalex.org/W4399757647","https://openalex.org/W4409248703","https://openalex.org/W4412703834","https://openalex.org/W4413756918","https://openalex.org/W4416199158"],"related_works":[],"abstract_inverted_index":{"As":[0],"training":[1],"scales":[2],"grow,":[3],"collective":[4],"communication":[5,100],"libraries":[6],"(CCL)":[7],"increasingly":[8],"face":[9],"anomalies":[10,22,70,132],"arising":[11],"from":[12],"complex":[13],"interactions":[14],"among":[15],"hardware,":[16],"software,":[17],"and":[18,31,42,67,108,133],"environmental":[19],"factors.":[20],"These":[21],"typically":[23],"manifest":[24],"as":[25],"slow/hang":[26,69,131],"communication,":[27],"the":[28,113],"most":[29],"frequent":[30],"time-consuming":[32],"category":[33],"to":[34,65,98],"diagnose.":[35],"However,":[36],"traditional":[37],"diagnostic":[38,62],"methods":[39],"remain":[40],"inaccurate":[41],"inefficient,":[43],"frequently":[44],"requiring":[45],"hours":[46],"or":[47],"even":[48],"days":[49],"for":[50],"root":[51],"cause":[52],"analysis.":[53],"To":[54],"address":[55],"this,":[56],"we":[57],"propose":[58],"CCL-D,":[59],"a":[60,77,93,119],"high-precision":[61],"system":[63],"designed":[64],"detect":[66],"locate":[68],"in":[71],"large-scale":[72],"distributed":[73,95],"training.":[74],"CCL-D":[75,125],"integrates":[76],"rank-level":[78],"real-time":[79],"probe":[80,87],"with":[81],"an":[82],"intelligent":[83],"decision":[84],"analyzer.":[85],"The":[86,102],"measures":[88],"cross-layer":[89],"anomaly":[90,106],"metrics":[91],"using":[92],"lightweight":[94],"tracing":[96],"framework":[97],"monitor":[99],"traffic.":[101],"analyzer":[103],"performs":[104],"automated":[105],"detection":[107],"root-cause":[109],"location,":[110],"precisely":[111],"identifying":[112],"faulty":[114],"GPU":[115],"rank.":[116],"Deployed":[117],"on":[118],"4,000-GPU":[120],"cluster":[121],"over":[122],"one":[123],"year,":[124],"achieved":[126],"near-complete":[127],"coverage":[128],"of":[129],"known":[130],"pinpointed":[134],"affected":[135],"ranks":[136],"within":[137],"6":[138],"minutes\u2014substantially":[139],"outperforming":[140],"existing":[141],"solutions.":[142]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2026-01-29T00:00:00"}
