{"id":"https://openalex.org/W4405562078","doi":"https://doi.org/10.48550/arxiv.2412.12480","title":"Subversion Strategy Eval: Can language models statelessly strategize to subvert control protocols?","display_name":"Subversion Strategy Eval: Can language models statelessly strategize to subvert control protocols?","publication_year":2024,"publication_date":"2024-12-17","ids":{"openalex":"https://openalex.org/W4405562078","doi":"https://doi.org/10.48550/arxiv.2412.12480"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.12480","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.12480","pdf_url":"https://arxiv.org/pdf/2412.12480","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.12480","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025230160","display_name":"Alex Mallen","orcid":"https://orcid.org/0000-0002-5120-1350"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Mallen, Alex","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030099788","display_name":"Charlie Griffin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Griffin, Charlie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wagner, Misha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wagner, Misha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091718585","display_name":"Alessandro Abate","orcid":"https://orcid.org/0000-0002-5627-9093"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abate, Alessandro","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5050408969","display_name":"Buck Shlegeris","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shlegeris, Buck","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5025230160"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.7190999984741211,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.7190999984741211,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.6890000104904175,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.6541000008583069,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/subversion","display_name":"Subversion","score":0.968551754951477},{"id":"https://openalex.org/keywords/stateless-protocol","display_name":"Stateless protocol","score":0.7488702535629272},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.48665958642959595},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4757770597934723},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.3296926021575928},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.31521207094192505},{"id":"https://openalex.org/keywords/political-science","display_name":"Political science","score":0.24579378962516785},{"id":"https://openalex.org/keywords/law","display_name":"Law","score":0.066508948802948},{"id":"https://openalex.org/keywords/politics","display_name":"Politics","score":0.06540325284004211}],"concepts":[{"id":"https://openalex.org/C2777367489","wikidata":"https://www.wikidata.org/wiki/Q1060330","display_name":"Subversion","level":3,"score":0.968551754951477},{"id":"https://openalex.org/C103613024","wikidata":"https://www.wikidata.org/wiki/Q230924","display_name":"Stateless protocol","level":3,"score":0.7488702535629272},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.48665958642959595},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4757770597934723},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3296926021575928},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.31521207094192505},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.24579378962516785},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.066508948802948},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.06540325284004211},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.12480","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.12480","pdf_url":"https://arxiv.org/pdf/2412.12480","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.12480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.12480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.12480","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.12480","pdf_url":"https://arxiv.org/pdf/2412.12480","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4405562078.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2982445252","https://openalex.org/W3089771613","https://openalex.org/W4247478651","https://openalex.org/W2756056980","https://openalex.org/W2392564020","https://openalex.org/W2378209145","https://openalex.org/W4239407877","https://openalex.org/W2779721709","https://openalex.org/W4243130055","https://openalex.org/W3040374273"],"abstract_inverted_index":{"An":[0],"AI":[1,10,17,29,55,198],"control":[2,41,145,185,199],"protocol":[3],"is":[4],"a":[5,87,93,160,175],"plan":[6,161],"for":[7,39,178],"usefully":[8],"deploying":[9],"systems":[11,30],"that":[12,105],"aims":[13],"to":[14,59,143,158],"prevent":[15],"an":[16,54],"from":[18],"intentionally":[19],"causing":[20],"some":[21],"unacceptable":[22],"outcome.":[23],"This":[24],"paper":[25],"investigates":[26],"how":[27],"well":[28,129],"can":[31,164,172],"generate":[32,61],"and":[33,72,97,100,116,187],"act":[34,173],"on":[35],"their":[36],"own":[37],"strategies":[38],"subverting":[40,184],"protocols":[42,96,146,186],"whilst":[43],"operating":[44],"statelessly":[45],"(without":[46],"shared":[47],"memory":[48],"between":[49,162],"contexts).":[50],"To":[51],"do":[52],"this,":[53],"system":[56],"may":[57],"need":[58],"reliably":[60],"optimal":[62],"plans":[63,74],"in":[64,114,141,197],"each":[65],"context,":[66],"take":[67],"actions":[68],"with":[69,75,151],"well-calibrated":[70],"probabilities,":[71],"coordinate":[73],"other":[76],"instances":[77],"of":[78,89,95,103,135,183,193],"itself":[79],"without":[80],"communicating.":[81],"We":[82,110,120,168],"develop":[83],"Subversion":[84],"Strategy":[85],"Eval,":[86],"suite":[88],"eight":[90],"environments,":[91],"covering":[92],"range":[94],"strategic":[98,195],"capabilities,":[99],"six":[101],"sets":[102],"affordances":[104],"help":[106],"isolate":[107],"individual":[108],"capabilities.":[109],"implement":[111],"the":[112,136,156,190],"evaluation":[113],"Inspect-AI":[115],"release":[117],"it":[118],"open-source.":[119],"evaluate":[121],"Claude":[122],"3.5":[123],"models,":[124],"including":[125],"helpful-only":[126],"versions,":[127],"as":[128,130,155,174],"OpenAI":[131],"reasoning":[132],"models.":[133],"None":[134],"models":[137,150,180],"demonstrate":[138],"substantial":[139],"capability":[140],"strategizing":[142],"subvert":[144],"statelessly.":[147],"However,":[148],"providing":[149],"additional":[152],"affordances,":[153],"such":[154],"ability":[157,196],"share":[159],"contexts,":[163],"substantially":[165],"improve":[166],"performance.":[167],"hope":[169],"our":[170],"evaluations":[171],"leading":[176],"indicator":[177],"when":[179],"are":[181],"capable":[182],"also":[188],"relax":[189],"worst-case":[191],"assumption":[192],"perfect":[194],"evaluations.":[200]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2024-12-19T00:00:00"}
