{"id":"https://openalex.org/W7106290254","doi":"https://doi.org/10.48550/arxiv.2511.15669","title":"DeepThinkVLA: Enhancing Reasoning Capability of Vision-Language-Action Models","display_name":"DeepThinkVLA: Enhancing Reasoning Capability of Vision-Language-Action Models","publication_year":2025,"publication_date":"2025-10-31","ids":{"openalex":"https://openalex.org/W7106290254","doi":"https://doi.org/10.48550/arxiv.2511.15669"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2511.15669","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.15669","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2511.15669","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yin, Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yin, Cheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lin, Yankai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Yankai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xu, Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tam, Sikyuen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tam, Sikyuen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zeng, Xiangrui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Xiangrui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Zhiyuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Yin, Zhouping","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Zhouping","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8784999847412109,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8784999847412109,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.03519999980926514,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.008999999612569809,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/path","display_name":"Path (computing)","score":0.5507000088691711},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5304999947547913},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4982999861240387},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.48989999294281006},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.48579999804496765},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.44119998812675476},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.41499999165534973},{"id":"https://openalex.org/keywords/causal-reasoning","display_name":"Causal reasoning","score":0.41119998693466187}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.725600004196167},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.5507000088691711},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5304999947547913},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5242999792098999},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4982999861240387},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.48989999294281006},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.48579999804496765},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.44119998812675476},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.41499999165534973},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.41119998693466187},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.38269999623298645},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.3702000081539154},{"id":"https://openalex.org/C148047603","wikidata":"https://www.wikidata.org/wiki/Q1014612","display_name":"Parallelizable manifold","level":2,"score":0.33649998903274536},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.31940001249313354},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3140999972820282},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3100999891757965},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2596000134944916}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2511.15669","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.15669","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2511.15669","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.15669","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Does":[0],"Chain-of-Thought":[1],"(CoT)":[2],"reasoning":[3,113],"genuinely":[4],"improve":[5],"Vision-Language-Action":[6],"(VLA)":[7],"models,":[8],"or":[9],"does":[10],"it":[11],"merely":[12,79],"add":[13],"overhead?":[14],"Existing":[15],"CoT-VLA":[16],"systems":[17],"report":[18],"limited":[19],"and":[20,31,62,190],"inconsistent":[21],"gains,":[22],"yet":[23],"no":[24,112],"prior":[25],"work":[26],"has":[27],"rigorously":[28],"diagnosed":[29],"when":[30],"why":[32],"CoT":[33,51,61,94,108],"helps":[34],"robots":[35],"act.":[36],"Through":[37],"systematic":[38],"experiments,":[39],"we":[40,138,203],"identify":[41],"two":[42],"necessary":[43],"conditions":[44],"that":[45],"must":[46,64,95],"be":[47,53,65,96],"jointly":[48],"satisfied":[49],"for":[50,150,155,188],"to":[52,99,126],"effective":[54],"in":[55],"VLA:":[56],"(1)":[57],"Decoding":[58],"Alignment":[59,92],"--":[60,93],"actions":[63],"generated":[66],"with":[67,152,172],"modality-appropriate":[68],"mechanisms;":[69],"forcing":[70],"both":[71],"through":[72,211],"a":[73,120,131,141,160],"single":[74],"autoregressive":[75],"decoder":[76,143],"is":[77,109],"not":[78],"suboptimal":[80],"but":[81],"actively":[82],"harmful,":[83],"degrading":[84],"performance":[85,122],"by":[86,135,146,166,199],"4.2":[87],"percentage":[88],"points;":[89],"(2)":[90],"Causal":[91],"causally":[97],"linked":[98],"task":[100],"success":[101,179,192],"via":[102],"outcome-based":[103],"optimization;":[104],"without":[105],"it,":[106],"supervised":[107],"indistinguishable":[110],"from":[111],"at":[114,217],"all":[115],"under":[116],"distribution":[117],"shift,":[118],"exhibiting":[119],"32.0\\,pp":[121],"drop":[123,129],"nearly":[124],"identical":[125],"the":[127,168,196,205],"31.6\\,pp":[128],"of":[130,208],"reasoning-free":[132],"baseline.":[133],"Guided":[134],"these":[136],"findings,":[137],"build":[139],"DeepThinkVLA:":[140],"hybrid-attention":[142],"satisfies":[144,164],"Condition~1":[145],"pairing":[147],"causal":[148],"attention":[149,154],"language":[151],"bidirectional":[153],"parallel":[156],"action":[157],"decoding,":[158],"while":[159],"two-stage":[161],"SFT-then-RL":[162],"pipeline":[163],"Condition~2":[165],"aligning":[167],"full":[169],"reasoning--action":[170],"chain":[171],"sparse":[173],"task-success":[174],"rewards.":[175],"DeepThinkVLA":[176],"achieves":[177],"97.0\\%":[178],"on":[180,184,193],"LIBERO,":[181],"79.0\\%":[182],"robustness":[183],"LIBERO-Plus":[185],"(vs.\\":[186],"61.6\\%":[187],"$\u03c0_0$-FAST),":[189],"59.3\\%":[191],"RoboTwin~2.0,":[194],"exceeding":[195],"strongest":[197],"baseline":[198],"21.7":[200],"points.":[201],"Furthermore,":[202],"validate":[204],"practical":[206],"effectiveness":[207],"our":[209],"approach":[210],"real-world":[212],"robot":[213],"experiments.":[214],"Code":[215],"available":[216],"https://github.com/OpenBMB/DeepThinkVLA":[218]},"counts_by_year":[],"updated_date":"2026-04-22T06:01:30.510260","created_date":"2025-11-23T00:00:00"}
