{"id":"https://openalex.org/W4407129404","doi":"https://doi.org/10.48550/arxiv.2502.01126","title":"Language Models Prefer What They Know: Relative Confidence Estimation via Confidence Preferences","display_name":"Language Models Prefer What They Know: Relative Confidence Estimation via Confidence Preferences","publication_year":2025,"publication_date":"2025-02-03","ids":{"openalex":"https://openalex.org/W4407129404","doi":"https://doi.org/10.48550/arxiv.2502.01126"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2502.01126","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.01126","pdf_url":"https://arxiv.org/pdf/2502.01126","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2502.01126","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076948838","display_name":"Vaishnavi Shrivastava","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shrivastava, Vaishnavi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014379995","display_name":"Ananya Kumar","orcid":"https://orcid.org/0000-0002-9002-510X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kumar, Ananya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5025255782","display_name":"Percy Liang","orcid":"https://orcid.org/0000-0002-0458-6139"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Percy","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5076948838"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9714999794960022,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9714999794960022,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9692000150680542,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/confidence-interval","display_name":"Confidence interval","score":0.6166342496871948},{"id":"https://openalex.org/keywords/estimation","display_name":"Estimation","score":0.5723159313201904},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.49161702394485474},{"id":"https://openalex.org/keywords/low-confidence","display_name":"Low Confidence","score":0.4178916811943054},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.38117414712905884},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.3672906756401062},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.28480011224746704},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.24763742089271545},{"id":"https://openalex.org/keywords/economics","display_name":"Economics","score":0.17659351229667664},{"id":"https://openalex.org/keywords/social-psychology","display_name":"Social psychology","score":0.14231151342391968}],"concepts":[{"id":"https://openalex.org/C44249647","wikidata":"https://www.wikidata.org/wiki/Q208498","display_name":"Confidence interval","level":2,"score":0.6166342496871948},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.5723159313201904},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.49161702394485474},{"id":"https://openalex.org/C2909755999","wikidata":"https://www.wikidata.org/wiki/Q4751126","display_name":"Low Confidence","level":2,"score":0.4178916811943054},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.38117414712905884},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3672906756401062},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.28480011224746704},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.24763742089271545},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.17659351229667664},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.14231151342391968},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2502.01126","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.01126","pdf_url":"https://arxiv.org/pdf/2502.01126","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2502.01126","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.01126","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2502.01126","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.01126","pdf_url":"https://arxiv.org/pdf/2502.01126","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4407129404.pdf","grobid_xml":"https://content.openalex.org/works/W4407129404.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2995364151","https://openalex.org/W797163146","https://openalex.org/W2068023914","https://openalex.org/W1899429405","https://openalex.org/W2401903608","https://openalex.org/W4221078923","https://openalex.org/W2066458123","https://openalex.org/W3005777437","https://openalex.org/W1578234703","https://openalex.org/W2084691705"],"abstract_inverted_index":{"Language":[0],"models":[1,45,235],"(LMs)":[2],"should":[3],"provide":[4,48],"reliable":[5,206],"confidence":[6,30,33,52,55,83,102,148,151,156,160,164,201,207,211,225],"estimates":[7],"to":[8,18,27,47,97,144],"help":[9],"users":[10],"detect":[11],"mistakes":[12],"in":[13,56,109,118,218],"their":[14,78],"outputs":[15],"and":[16,64,93,126,142,162,179,190,228,236],"defer":[17],"human":[19],"experts":[20],"when":[21],"necessary.":[22],"Asking":[23],"a":[24,37,58,116,119],"language":[25],"model":[26,96],"assess":[28],"its":[29,42],"(\"Score":[31],"your":[32],"from":[34],"0-1.\")":[35],"is":[36],"natural":[38],"way":[39],"of":[40,51,61,77,101,121,216],"evaluating":[41,74],"uncertainty.":[43],"However,":[44],"struggle":[46],"absolute":[49,159,210,224],"assessments":[50],"(i.e.":[53],"judging":[54],"answering":[57,110,194],"question":[59,104,114,193],"independent":[60],"other":[62,92,124],"questions)":[63],"the":[65,75,95,127,146],"coarse-grained":[66],"scores":[67,208],"they":[68],"produce":[69],"are":[70,105],"not":[71],"useful":[72],"for":[73],"correctness":[76],"answers.":[79],"We":[80,153],"propose":[81],"relative":[82,99,155,200],"estimation,":[84,212],"where":[85],"we":[86,133],"match":[87,131],"up":[88],"questions":[89,125],"against":[90,123,158],"each":[91,113],"ask":[94],"make":[98],"judgments":[100],"(\"Which":[103],"you":[106],"more":[107,205],"confident":[108],"correctly?\").":[111],"Treating":[112],"as":[115,130],"\"player\"":[117],"series":[120],"matchups":[122],"model's":[128,147],"preferences":[129,149],"outcomes,":[132],"can":[134],"use":[135],"rank":[136],"aggregation":[137],"methods":[138,165,227],"like":[139],"Elo":[140],"rating":[141],"Bradley-Terry":[143],"translate":[145],"into":[150],"scores.":[152],"evaluate":[154],"estimation":[157,161,202,226],"self-consistency":[163,231],"on":[166],"five":[167],"state-of-the-art":[168],"LMs":[169],"--":[170,183],"GPT-4,":[171],"GPT-4o,":[172],"Gemini":[173],"1.5":[174],"Pro,":[175],"Claude":[176],"3.5":[177],"Sonnet,":[178],"Llama":[180],"3.1":[181],"405B":[182],"across":[184,233],"14":[185],"challenging":[186],"STEM,":[187],"social":[188],"science,":[189],"commonsense":[191],"reasoning":[192],"tasks.":[195],"Our":[196],"results":[197],"demonstrate":[198],"that":[199],"consistently":[203],"provides":[204],"than":[209],"with":[213],"average":[214],"gains":[215],"3.5%":[217],"selective":[219],"classification":[220],"AUC":[221],"over":[222,230],"direct":[223],"1.7%":[229],"approaches":[232],"all":[234],"datasets.":[237]},"counts_by_year":[],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
