{"id":"https://openalex.org/W7078376186","doi":"https://doi.org/10.48550/arxiv.2508.19980","title":"Evaluating Language Model Reasoning about Confidential Information","display_name":"Evaluating Language Model Reasoning about Confidential Information","publication_year":2025,"publication_date":"2025-08-27","ids":{"openalex":"https://openalex.org/W7078376186","doi":"https://doi.org/10.48550/arxiv.2508.19980"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2508.19980","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2508.19980","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2508.19980","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Sam, Dylan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sam, Dylan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Robey, Alexander","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Robey, Alexander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zou, Andy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Andy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Fredrikson, Matt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fredrikson, Matt","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Kolter, J. Zico","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kolter, J. Zico","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.6514000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.6514000296592712,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13067","display_name":"Geological Modeling and Analysis","score":0.03269999846816063,"subfield":{"id":"https://openalex.org/subfields/1906","display_name":"Geochemistry and Petrology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14311","display_name":"Electrical and Electromagnetic Research","score":0.019899999722838402,"subfield":{"id":"https://openalex.org/subfields/3107","display_name":"Atomic and Molecular Physics, and Optics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/confidentiality","display_name":"Confidentiality","score":0.6072999835014343},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5497999787330627},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.5047000050544739},{"id":"https://openalex.org/keywords/password","display_name":"Password","score":0.47760000824928284},{"id":"https://openalex.org/keywords/safer","display_name":"SAFER","score":0.42590001225471497},{"id":"https://openalex.org/keywords/model-based-reasoning","display_name":"Model-based reasoning","score":0.3393000066280365},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3255999982357025}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7639999985694885},{"id":"https://openalex.org/C71745522","wikidata":"https://www.wikidata.org/wiki/Q2476929","display_name":"Confidentiality","level":2,"score":0.6072999835014343},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5497999787330627},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.5047000050544739},{"id":"https://openalex.org/C109297577","wikidata":"https://www.wikidata.org/wiki/Q161157","display_name":"Password","level":2,"score":0.47760000824928284},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.447299987077713},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.42590001225471497},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4153999984264374},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3531999886035919},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.3393000066280365},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.32120001316070557},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.32030001282691956},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C179603123","wikidata":"https://www.wikidata.org/wiki/Q1941921","display_name":"Modeling language","level":3,"score":0.2802000045776367},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2784000039100647},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2508.19980","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2508.19980","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2508.19980","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2508.19980","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7026532292366028,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"language":[1,31,56],"models":[2,32,57,79,163],"are":[3,164],"increasingly":[4],"deployed":[5],"as":[6],"autonomous":[7],"agents":[8],"in":[9,120,180,190],"high-stakes":[10,191],"settings,":[11],"ensuring":[12],"that":[13,53,74,101,160,172],"they":[14],"reliably":[15],"follow":[16],"user-defined":[17],"rules":[18],"has":[19],"become":[20],"a":[21,50,62,69,181],"critical":[22],"safety":[23,43],"concern.":[24],"To":[25],"this":[26,46,82],"end,":[27],"we":[28,48,99],"study":[29],"whether":[30,55,112],"exhibit":[33],"contextual":[34],"robustness,":[35],"or":[36],"the":[37,126],"capability":[38],"to":[39,41,118,167,177,184],"adhere":[40],"context-dependent":[42],"specifications.":[44],"For":[45],"analysis,":[47],"develop":[49],"benchmark":[51],"(PasswordEval)":[52],"measures":[54],"can":[58],"correctly":[59],"determine":[60],"when":[61],"user":[63,138],"request":[64],"is":[65,153],"authorized":[66],"(i.e.,":[67],"with":[68,81],"correct":[70],"password).":[71],"We":[72,123],"find":[73,100],"current":[75,161],"open-":[76],"and":[77,86,144,171],"closed-source":[78],"struggle":[80],"seemingly":[83],"simple":[84],"task,":[85],"that,":[87],"perhaps":[88],"surprisingly,":[89],"reasoning":[90,102,113,173],"capabilities":[91,174],"do":[92],"not":[93,165],"generally":[94],"improve":[95],"performance.":[96],"In":[97],"fact,":[98],"traces":[103,114],"frequently":[104],"leak":[105],"confidential":[106,169],"information,":[107,170],"which":[108],"calls":[109],"into":[110],"question":[111],"should":[115],"be":[116,178],"exposed":[117],"users":[119],"such":[121],"applications.":[122],"also":[124],"scale":[125],"difficulty":[127],"of":[128],"our":[129,157],"evaluation":[130],"along":[131],"multiple":[132],"axes:":[133],"(i)":[134],"by":[135],"adding":[136],"adversarial":[137],"pressure":[139],"through":[140,146],"various":[141],"jailbreaking":[142],"strategies,":[143],"(ii)":[145],"longer":[147],"multi-turn":[148],"conversations":[149],"where":[150],"password":[151],"verification":[152],"more":[154],"challenging.":[155],"Overall,":[156],"results":[158],"suggest":[159],"frontier":[162],"well-suited":[166],"handling":[168],"may":[175],"need":[176],"trained":[179],"different":[182],"manner":[183],"make":[185],"them":[186],"safer":[187],"for":[188],"release":[189],"settings.":[192]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
