{"id":"https://openalex.org/W6948336340","doi":"https://doi.org/10.48550/arxiv.2504.14239","title":"InfiGUI-R1: Advancing Multimodal GUI Agents from Reactive Actors to Deliberative Reasoners","display_name":"InfiGUI-R1: Advancing Multimodal GUI Agents from Reactive Actors to Deliberative Reasoners","publication_year":2025,"publication_date":"2025-04-19","ids":{"openalex":"https://openalex.org/W6948336340","doi":"https://doi.org/10.48550/arxiv.2504.14239"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2504.14239","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2504.14239","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2504.14239","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Liu, Yuhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Pengxiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Pengxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xie, Congkai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Congkai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hu, Xavier","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Xavier","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Han, Xiaotian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Xiaotian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Shengyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shengyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Hongxia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Hongxia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Wu, Fei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Fei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10941","display_name":"Musicology and Musical Analysis","score":0.6902999877929688,"subfield":{"id":"https://openalex.org/subfields/1210","display_name":"Music"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10941","display_name":"Musicology and Musical Analysis","score":0.6902999877929688,"subfield":{"id":"https://openalex.org/subfields/1210","display_name":"Music"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13754","display_name":"Central European and Russian historical studies","score":0.008899999782443047,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T14046","display_name":"Bach Studies and Logistics Development","score":0.007699999958276749,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantic-reasoner","display_name":"Semantic reasoner","score":0.8263999819755554},{"id":"https://openalex.org/keywords/qualitative-reasoning","display_name":"Qualitative reasoning","score":0.43650001287460327},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4147999882698059},{"id":"https://openalex.org/keywords/graphical-user-interface","display_name":"Graphical user interface","score":0.40230000019073486},{"id":"https://openalex.org/keywords/reasoning-system","display_name":"Reasoning system","score":0.40139999985694885},{"id":"https://openalex.org/keywords/deliberation","display_name":"Deliberation","score":0.39169999957084656},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.361299991607666},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.3409999907016754},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.32690000534057617},{"id":"https://openalex.org/keywords/model-based-reasoning","display_name":"Model-based reasoning","score":0.31470000743865967},{"id":"https://openalex.org/keywords/logical-reasoning","display_name":"Logical reasoning","score":0.31209999322891235}],"concepts":[{"id":"https://openalex.org/C9616225","wikidata":"https://www.wikidata.org/wiki/Q3929429","display_name":"Semantic reasoner","level":2,"score":0.8263999819755554},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7875999808311462},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.6015999913215637},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5185999870300293},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.43650001287460327},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4147999882698059},{"id":"https://openalex.org/C37789001","wikidata":"https://www.wikidata.org/wiki/Q782543","display_name":"Graphical user interface","level":2,"score":0.40230000019073486},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.40139999985694885},{"id":"https://openalex.org/C2776946740","wikidata":"https://www.wikidata.org/wiki/Q358652","display_name":"Deliberation","level":3,"score":0.39169999957084656},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.361299991607666},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.3409999907016754},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.31470000743865967},{"id":"https://openalex.org/C43971567","wikidata":"https://www.wikidata.org/wiki/Q3142865","display_name":"Logical reasoning","level":2,"score":0.31209999322891235},{"id":"https://openalex.org/C97364631","wikidata":"https://www.wikidata.org/wiki/Q484284","display_name":"Deductive reasoning","level":2,"score":0.30799999833106995},{"id":"https://openalex.org/C155846161","wikidata":"https://www.wikidata.org/wiki/Q1143367","display_name":"Graphical model","level":2,"score":0.3021000027656555},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.2955000102519989},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.290800005197525},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C107848011","wikidata":"https://www.wikidata.org/wiki/Q4680756","display_name":"Adaptive reasoning","level":4,"score":0.28200000524520874},{"id":"https://openalex.org/C192327766","wikidata":"https://www.wikidata.org/wiki/Q1038799","display_name":"Cognitive robotics","level":3,"score":0.27570000290870667},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.2671999931335449},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2590999901294708},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2574000060558319},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2540000081062317},{"id":"https://openalex.org/C103057564","wikidata":"https://www.wikidata.org/wiki/Q4751139","display_name":"Analytic reasoning","level":3,"score":0.24709999561309814},{"id":"https://openalex.org/C103683099","wikidata":"https://www.wikidata.org/wiki/Q5370102","display_name":"Embodied agent","level":3,"score":0.2467000037431717},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.24629999697208405},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.24580000340938568},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.24580000340938568},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.24310000240802765},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.2313999980688095},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.22990000247955322},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.21549999713897705},{"id":"https://openalex.org/C192906763","wikidata":"https://www.wikidata.org/wiki/Q3376512","display_name":"Practical reason","level":2,"score":0.21549999713897705}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2504.14239","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2504.14239","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2504.14239","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2504.14239","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"have":[5,22],"powered":[6],"Graphical":[7],"User":[8],"Interface":[9],"(GUI)":[10],"Agents,":[11],"showing":[12],"promise":[13],"in":[14,26,45,234],"automating":[15],"tasks":[16,28,80],"on":[17,37,70,101,142],"computing":[18],"devices.":[19],"Recent":[20],"works":[21],"begun":[23],"exploring":[24],"reasoning":[25,40,46,72,156,167,178],"GUI":[27,56,79,113,173,235],"with":[29,165,176],"encouraging":[30],"results.":[31],"However,":[32],"many":[33],"current":[34],"approaches":[35],"rely":[36],"manually":[38],"designed":[39,125],"templates,":[41],"which":[42,205,218],"may":[43,74],"result":[44],"that":[47,73,88],"is":[48],"not":[49],"sufficiently":[50],"robust":[51],"and":[52,83,213,237],"adaptive":[53],"for":[54,78,208],"complex":[55],"environments.":[57],"Meanwhile,":[58],"some":[59],"existing":[60],"agents":[61,91,129],"continue":[62],"to":[63,126,133,152,161,171],"operate":[64],"as":[65],"Reactive":[66,131],"Actors,":[67],"relying":[68],"primarily":[69],"implicit":[71],"lack":[75],"sufficient":[76],"depth":[77],"demanding":[81],"planning":[82],"error":[84],"recovery.":[85],"We":[86,147],"argue":[87],"advancing":[89],"these":[90],"requires":[92],"a":[93,120,144,192],"shift":[94],"from":[95,130,158,223],"reactive":[96],"acting":[97,99],"towards":[98],"based":[100],"deliberate":[102],"reasoning.":[103],"To":[104],"facilitate":[105],"this":[106],"transformation,":[107],"we":[108],"introduce":[109],"InfiGUI-R1,":[110],"an":[111],"MLLM-based":[112],"agent":[114],"developed":[115],"through":[116,163],"our":[117],"Actor2Reasoner":[118],"framework,":[119],"reasoning-centric,":[121],"two-stage":[122],"training":[123,221],"approach":[124],"progressively":[127],"evolve":[128],"Actors":[132],"Deliberative":[134],"Reasoners.":[135],"The":[136,182],"first":[137],"stage,":[138,184],"Reasoning":[139,150],"Injection,":[140],"focuses":[141],"establishing":[143],"basic":[145,189],"reasoner.":[146],"employ":[148],"Spatial":[149],"Distillation":[151],"transfer":[153],"cross-modal":[154],"spatial":[155],"capabilities":[157],"teacher":[159],"models":[160,170,207],"MLLMs":[162],"trajectories":[164],"explicit":[166],"steps,":[168],"enabling":[169],"integrate":[172],"visual-spatial":[174],"information":[175],"logical":[177],"before":[179],"action":[180],"generation.":[181],"second":[183],"Deliberation":[185],"Enhancement,":[186],"refines":[187],"the":[188],"reasoner":[190],"into":[191],"deliberative":[193],"one":[194],"using":[195],"Reinforcement":[196],"Learning.":[197],"This":[198],"stage":[199],"introduces":[200],"two":[201],"approaches:":[202],"Sub-goal":[203],"Guidance,":[204],"rewards":[206],"generating":[209],"accurate":[210],"intermediate":[211],"sub-goals,":[212],"Error":[214],"Recovery":[215],"Scenario":[216],"Construction,":[217],"creates":[219],"failure-and-recovery":[220],"scenarios":[222],"identified":[224],"prone-to-error":[225],"steps.":[226],"Experimental":[227],"results":[228],"show":[229],"InfiGUI-R1":[230],"achieves":[231],"strong":[232],"performance":[233],"grounding":[236],"trajectory":[238],"tasks.":[239],"Resources":[240],"at":[241],"https://github.com/Reallm-Labs/InfiGUI-R1.":[242]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
