{"id":"https://openalex.org/W4393213296","doi":"https://doi.org/10.48550/arxiv.2403.16446","title":"Towards Automatic Evaluation for LLMs' Clinical Capabilities: Metric, Data, and Algorithm","display_name":"Towards Automatic Evaluation for LLMs' Clinical Capabilities: Metric, Data, and Algorithm","publication_year":2024,"publication_date":"2024-03-25","ids":{"openalex":"https://openalex.org/W4393213296","doi":"https://doi.org/10.48550/arxiv.2403.16446"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2403.16446","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.16446","pdf_url":"https://arxiv.org/pdf/2403.16446","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2403.16446","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059565360","display_name":"Lei Liu","orcid":"https://orcid.org/0000-0001-8109-5248"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100535225","display_name":"Xiaoyan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xiaoyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025496913","display_name":"Fangzhou Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Fangzhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057863237","display_name":"Chenfei Chi","orcid":"https://orcid.org/0000-0003-4075-6147"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chi, Chenfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100628783","display_name":"Yue Shen","orcid":"https://orcid.org/0000-0002-3276-7295"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102635975","display_name":"Shiwei Lyu Ming Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shiwei Lyu Ming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112259915","display_name":"Xiaowei Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Xiaowei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045349040","display_name":"Xiangguo Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Xiangguo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100308149","display_name":"Liya Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Liya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100346714","display_name":"Zhiqiang Zhang","orcid":"https://orcid.org/0000-0002-6848-4019"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhiqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102317632","display_name":"Wei Xue","orcid":"https://orcid.org/0009-0005-0723-8702"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059183788","display_name":"Huang Yi-ran","orcid":"https://orcid.org/0000-0001-6289-8766"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yiran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5053242349","display_name":"Jinjie Gu","orcid":"https://orcid.org/0000-0001-7596-4945"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Jinjie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5059565360"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13643","display_name":"Artificial Intelligence in Law","score":0.83160001039505,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13643","display_name":"Artificial Intelligence in Law","score":0.83160001039505,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12755","display_name":"Legal Education and Practice Innovations","score":0.7429999709129333,"subfield":{"id":"https://openalex.org/subfields/3308","display_name":"Law"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T14330","display_name":"Library Science and Information Systems","score":0.7170000076293945,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.6491217613220215},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4930555522441864},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.47260600328445435},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.35034453868865967},{"id":"https://openalex.org/keywords/economics","display_name":"Economics","score":0.1437177062034607},{"id":"https://openalex.org/keywords/operations-management","display_name":"Operations management","score":0.12883460521697998}],"concepts":[{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.6491217613220215},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4930555522441864},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.47260600328445435},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.35034453868865967},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.1437177062034607},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.12883460521697998}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2403.16446","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.16446","pdf_url":"https://arxiv.org/pdf/2403.16446","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2403.16446","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2403.16446","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2403.16446","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.16446","pdf_url":"https://arxiv.org/pdf/2403.16446","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4393213296.pdf","grobid_xml":"https://content.openalex.org/works/W4393213296.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2051487156","https://openalex.org/W2073681303","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W2478288626","https://openalex.org/W4391913857"],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"are":[4,129,186,234],"gaining":[5],"increasing":[6],"interests":[7],"to":[8,16,56,71,110,158,177,197,202,236],"improve":[9],"clinical":[10,29,78,100,107,113,200,254],"efficiency":[11],"for":[12,38,134,138,247],"medical":[13,127,136,207],"diagnosis,":[14],"owing":[15],"their":[17],"unprecedented":[18],"performance":[19],"in":[20,76,187,217,253],"modelling":[21],"natural":[22],"language.":[23],"Ensuring":[24],"the":[25,31,41,73,112,126,132,144,147,160,180,205,218,238,241],"safe":[26,249],"and":[27,83,94,165,228,250],"reliable":[28,251],"applications,":[30],"evaluation":[32,48,68,86,148,215],"of":[33,146,182,220,240],"LLMs":[34],"indeed":[35],"becomes":[36],"critical":[37],"better":[39],"mitigating":[40],"potential":[42],"risks,":[43],"e.g.,":[44,80],"hallucinations.":[45],"However,":[46],"current":[47],"methods":[49],"heavily":[50],"rely":[51],"on":[52],"labor-intensive":[53],"human":[54],"participation":[55],"achieve":[57],"human-preferred":[58],"judgements.":[59],"To":[60],"overcome":[61],"this":[62],"challenge,":[63],"we":[64,103,153,212],"propose":[65],"an":[66,214,229],"automatic":[67],"paradigm":[69,87,193],"tailored":[70],"assess":[72],"LLMs'":[74,206,248],"capabilities":[75,114],"delivering":[77],"services,":[79],"disease":[81],"diagnosis":[82],"treatment.":[84],"The":[85,191],"contains":[88],"three":[89],"basic":[90],"elements:":[91],"metric,":[92],"data,":[93],"algorithm.":[95],"Specifically,":[96],"inspired":[97],"by":[98],"professional":[99],"practice":[101],"pathways,":[102],"formulate":[104],"a":[105,116,155,166,173,183,223,225],"LLM-specific":[106],"pathway":[108],"(LCP)":[109],"define":[111],"that":[115],"doctor":[117,167,184],"agent":[118,185],"should":[119],"possess.":[120],"Then,":[121],"Standardized":[122],"Patients":[123],"(SPs)":[124],"from":[125],"education":[128],"introduced":[130],"as":[131],"guideline":[133],"collecting":[135],"data":[137],"evaluation,":[139],"which":[140,169],"can":[141,194],"well":[142],"ensure":[143],"completeness":[145],"procedure.":[149],"Leveraging":[150],"these":[151],"steps,":[152],"develop":[154],"multi-agent":[156],"framework":[157],"simulate":[159],"interactive":[161],"environment":[162],"between":[163],"SPs":[164,226],"agent,":[168],"is":[170],"equipped":[171],"with":[172,189],"Retrieval-Augmented":[174],"Evaluation":[175],"(RAE)":[176],"determine":[178],"whether":[179],"behaviors":[181],"accordance":[188],"LCP.":[190],"above":[192],"be":[195],"extended":[196],"any":[198],"similar":[199],"scenarios":[201],"automatically":[203],"evaluate":[204],"capabilities.":[208],"Applying":[209],"such":[210],"paradigm,":[211],"construct":[213],"benchmark":[216],"field":[219],"urology,":[221],"including":[222],"LCP,":[224],"dataset,":[227],"automated":[230],"RAE.":[231],"Extensive":[232],"experiments":[233],"conducted":[235],"demonstrate":[237],"effectiveness":[239],"proposed":[242],"approach,":[243],"providing":[244],"more":[245],"insights":[246],"deployments":[252],"practice.":[255]},"counts_by_year":[],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2024-03-27T00:00:00"}
