{"id":"https://openalex.org/W4417300116","doi":"https://doi.org/10.48550/arxiv.2505.11835","title":"Multilingual Collaborative Defense for Large Language Models","display_name":"Multilingual Collaborative Defense for Large Language Models","publication_year":2025,"publication_date":"2025-05-17","ids":{"openalex":"https://openalex.org/W4417300116","doi":"https://doi.org/10.48550/arxiv.2505.11835"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2505.11835","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.11835","pdf_url":"https://arxiv.org/pdf/2505.11835","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2505.11835","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114378292","display_name":"Hongliang Li","orcid":"https://orcid.org/0000-0002-7481-095X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Hongliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101698034","display_name":"Jinan Xu","orcid":"https://orcid.org/0000-0003-0170-626X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Jinan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cui, Gengping","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cui, Gengping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102678874","display_name":"Changhao Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Changhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048033573","display_name":"Fengran Mo","orcid":"https://orcid.org/0000-0002-0838-6994"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mo, Fengran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5070912198","display_name":"Kaiyu Huang","orcid":"https://orcid.org/0000-0003-3452-5000"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Kaiyu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5114378292"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8500000238418579,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8500000238418579,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.03020000085234642,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.014299999922513962,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/safeguarding","display_name":"Safeguarding","score":0.9104999899864197},{"id":"https://openalex.org/keywords/transferability","display_name":"Transferability","score":0.580299973487854},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5170000195503235},{"id":"https://openalex.org/keywords/vulnerability","display_name":"Vulnerability (computing)","score":0.4846999943256378},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.3709000051021576},{"id":"https://openalex.org/keywords/federated-learning","display_name":"Federated learning","score":0.35910001397132874}],"concepts":[{"id":"https://openalex.org/C2776743756","wikidata":"https://www.wikidata.org/wiki/Q5097921","display_name":"Safeguarding","level":2,"score":0.9104999899864197},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7666000127792358},{"id":"https://openalex.org/C61272859","wikidata":"https://www.wikidata.org/wiki/Q7834031","display_name":"Transferability","level":3,"score":0.580299973487854},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5170000195503235},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.4846999943256378},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3630000054836273},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.36160001158714294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.361299991607666},{"id":"https://openalex.org/C2992525071","wikidata":"https://www.wikidata.org/wiki/Q50818671","display_name":"Federated learning","level":2,"score":0.35910001397132874},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3246000111103058},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.30399999022483826},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.26989999413490295},{"id":"https://openalex.org/C171041071","wikidata":"https://www.wikidata.org/wiki/Q36870","display_name":"First language","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C2780035574","wikidata":"https://www.wikidata.org/wiki/Q30081","display_name":"Multilingualism","level":2,"score":0.2590999901294708}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2505.11835","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.11835","pdf_url":"https://arxiv.org/pdf/2505.11835","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2505.11835","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2505.11835","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2505.11835","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.11835","pdf_url":"https://arxiv.org/pdf/2505.11835","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,106,188],"robustness":[1],"and":[2,82,165],"security":[3],"of":[4,39,55,104,150,157,186],"large":[5],"language":[6,136,184,206],"models":[7],"(LLMs)":[8],"has":[9,48],"become":[10],"a":[11,34,88,94],"prominent":[12],"research":[13,51],"area.":[14],"One":[15],"notable":[16],"vulnerability":[17],"is":[18,211],"the":[19,44,53,73,135,148,183],"ability":[20],"to":[21,64,100,167,181],"bypass":[22],"LLM":[23,143],"safeguards":[24],"by":[25,140],"translating":[26],"harmful":[27],"queries":[28],"into":[29],"rare":[30],"or":[31],"underrepresented":[32,178],"languages,":[33],"simple":[35],"yet":[36],"effective":[37],"method":[38,91],"\"jailbreaking\"":[40],"these":[41,175],"models.":[42],"Despite":[43],"growing":[45],"concern,":[46],"there":[47],"been":[49],"limited":[50],"addressing":[52],"safeguarding":[54,103,116,170,197],"LLMs":[56],"in":[57,142,177,196],"multilingual":[58,66,102,155,199],"scenarios,":[59],"highlighting":[60],"an":[61],"urgent":[62],"need":[63],"enhance":[65],"safety.":[67],"In":[68],"this":[69],"work,":[70],"we":[71,152,173],"investigate":[72],"correlation":[74],"between":[75],"various":[76,169],"attack":[77],"features":[78],"across":[79,118],"different":[80],"languages":[81,180],"propose":[83],"Multilingual":[84],"Collaborative":[85],"Defense":[86],"(MCD),":[87],"novel":[89],"learning":[90],"that":[92,191],"optimizes":[93],"continuous,":[95],"soft":[96],"safety":[97,137],"prompt":[98],"automatically":[99],"facilitate":[101],"LLMs.":[105],"MCD":[107,122,133,192],"approach":[108],"offers":[109],"three":[110],"advantages:":[111],"First,":[112],"it":[113],"effectively":[114],"improves":[115],"performance":[117],"multiple":[119],"languages.":[120],"Second,":[121],"maintains":[123],"strong":[124,205],"generalization":[125],"capabilities":[126],"while":[127,202],"minimizing":[128],"false":[129],"refusal":[130],"rates.":[131],"Third,":[132],"mitigates":[134],"misalignment":[138],"caused":[139],"imbalances":[141],"training":[144],"corpora.":[145],"To":[146],"evaluate":[147],"effectiveness":[149],"MCD,":[151],"manually":[153],"construct":[154],"versions":[156],"commonly":[158],"used":[159],"jailbreak":[160,200],"benchmarks,":[161],"such":[162],"as":[163],"MaliciousInstruct":[164],"AdvBench,":[166],"assess":[168],"methods.":[171],"Additionally,":[172],"introduce":[174],"datasets":[176],"(zero-shot)":[179],"verify":[182],"transferability":[185],"MCD.":[187],"results":[189],"demonstrate":[190],"outperforms":[193],"existing":[194],"approaches":[195],"against":[198],"attempts":[201],"also":[203],"exhibiting":[204],"transfer":[207],"capabilities.":[208],"Our":[209],"code":[210],"available":[212],"at":[213],"https://github.com/HLiang-Lee/MCD.":[214]},"counts_by_year":[],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-10-10T00:00:00"}
