{"id":"https://openalex.org/W4399836845","doi":"https://doi.org/10.48550/arxiv.2406.12094","title":"Who's asking? User personas and the mechanics of latent misalignment","display_name":"Who's asking? User personas and the mechanics of latent misalignment","publication_year":2024,"publication_date":"2024-06-17","ids":{"openalex":"https://openalex.org/W4399836845","doi":"https://doi.org/10.48550/arxiv.2406.12094"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2406.12094","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.12094","pdf_url":"https://arxiv.org/pdf/2406.12094","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.12094","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022368228","display_name":"Asma Ghandeharioun","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ghandeharioun, Asma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020034316","display_name":"Ann Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Ann","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099282338","display_name":"Marius Guerard","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guerard, Marius","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019880413","display_name":"Emily Reif","orcid":"https://orcid.org/0000-0003-3572-6234"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Reif, Emily","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099282339","display_name":"Michael A. Lepori","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lepori, Michael A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103159534","display_name":"Lucas Dixon","orcid":"https://orcid.org/0000-0003-1094-1675"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dixon, Lucas","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5022368228"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14074","display_name":"Persona Design and Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14074","display_name":"Persona Design and Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/persona","display_name":"Persona","score":0.754967212677002},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.376638799905777},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3539671301841736},{"id":"https://openalex.org/keywords/aesthetics","display_name":"Aesthetics","score":0.32630497217178345},{"id":"https://openalex.org/keywords/mechanics","display_name":"Mechanics","score":0.3221464157104492},{"id":"https://openalex.org/keywords/art","display_name":"Art","score":0.31872430443763733},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.24377793073654175}],"concepts":[{"id":"https://openalex.org/C313442","wikidata":"https://www.wikidata.org/wiki/Q778556","display_name":"Persona","level":2,"score":0.754967212677002},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.376638799905777},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3539671301841736},{"id":"https://openalex.org/C107038049","wikidata":"https://www.wikidata.org/wiki/Q35986","display_name":"Aesthetics","level":1,"score":0.32630497217178345},{"id":"https://openalex.org/C57879066","wikidata":"https://www.wikidata.org/wiki/Q41217","display_name":"Mechanics","level":1,"score":0.3221464157104492},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.31872430443763733},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.24377793073654175}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2406.12094","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.12094","pdf_url":"https://arxiv.org/pdf/2406.12094","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2406.12094","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2406.12094","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2406.12094","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.12094","pdf_url":"https://arxiv.org/pdf/2406.12094","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399836845.pdf","grobid_xml":"https://content.openalex.org/works/W4399836845.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4387497383","https://openalex.org/W3183948672","https://openalex.org/W3173606202","https://openalex.org/W3110381201","https://openalex.org/W2948807893","https://openalex.org/W2778153218","https://openalex.org/W2758277628","https://openalex.org/W2748952813","https://openalex.org/W1531601525","https://openalex.org/W4389400787"],"abstract_inverted_index":{"Despite":[0],"investments":[1],"in":[2,13,42],"improving":[3],"model":[4,34,60,103,136,144],"safety,":[5],"studies":[6],"show":[7,30,56,118,156],"that":[8,31,57,119,140],"misaligned":[9],"capabilities":[10],"remain":[11],"latent":[12],"safety-tuned":[14],"models.":[15],"In":[16,82],"this":[17,26],"work,":[18],"we":[19,29,55,76,84,155,157],"shed":[20],"light":[21],"on":[22,66,163],"the":[23,59,143,167],"mechanics":[24],"of":[25,69,150,169],"phenomenon.":[27],"First,":[28],"even":[32,91],"when":[33],"generations":[35],"are":[36],"safe,":[37],"harmful":[38,96],"content":[39,63,97],"can":[40,46,158],"persist":[41],"hidden":[43],"representations":[44],"and":[45,111,117,138],"be":[47,90],"extracted":[48],"by":[49],"decoding":[50],"from":[51],"earlier":[52],"layers.":[53],"Then,":[54],"whether":[58],"divulges":[61],"such":[62],"depends":[64],"significantly":[65,123],"its":[67,170],"perception":[68],"who":[70],"it":[71],"is":[72,122],"talking":[73],"to,":[74],"which":[75],"refer":[77],"to":[78,89,101,145],"as":[79,114],"user":[80,87],"persona.":[81],"fact,":[83],"find":[85,139],"manipulating":[86],"persona":[88],"more":[92,124,147],"effective":[93,125],"for":[94],"eliciting":[95],"than":[98],"direct":[99],"attempts":[100],"control":[102,115],"refusal.":[104],"We":[105,130],"study":[106],"both":[107],"natural":[108],"language":[109],"prompting":[110],"activation":[112,120],"steering":[113,121,171],"methods":[116],"at":[126],"bypassing":[127],"safety":[128],"filters.":[129],"investigate":[131],"why":[132],"certain":[133],"personas":[134],"break":[135],"safeguards":[137],"they":[141],"enable":[142],"form":[146],"charitable":[148],"interpretations":[149],"otherwise":[151],"dangerous":[152],"queries.":[153],"Finally,":[154],"predict":[159],"a":[160],"persona's":[161],"effect":[162],"refusal":[164],"given":[165],"only":[166],"geometry":[168],"vector.":[172]},"counts_by_year":[],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
