{"id":"https://openalex.org/W4416777406","doi":"https://doi.org/10.48550/arxiv.2511.20718","title":"Stabilizing Off-Policy Training for Long-Horizon LLM Agent via Turn-Level Importance Sampling and Clipping-Triggered Normalization","display_name":"Stabilizing Off-Policy Training for Long-Horizon LLM Agent via Turn-Level Importance Sampling and Clipping-Triggered Normalization","publication_year":2025,"publication_date":"2025-11-25","ids":{"openalex":"https://openalex.org/W4416777406","doi":"https://doi.org/10.48550/arxiv.2511.20718"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2511.20718","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.20718","pdf_url":"https://arxiv.org/pdf/2511.20718","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2511.20718","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100734069","display_name":"Chenliang Li","orcid":"https://orcid.org/0000-0003-3144-6374"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Chenliang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120480730","display_name":"Adel Elmahdy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elmahdy, Adel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034856555","display_name":"A. J. Boyd","orcid":"https://orcid.org/0000-0002-9725-508X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Boyd, Alex","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Zhongruo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhongruo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zeng, Siliang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Siliang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120549555","display_name":"Alfredo Garcia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garcia, Alfredo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077875900","display_name":"Parminder Bhatia","orcid":"https://orcid.org/0000-0002-0038-5081"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhatia, Parminder","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009342484","display_name":"Taha Kass\u2010Hout","orcid":"https://orcid.org/0000-0002-0123-5157"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kass-Hout, Taha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006689987","display_name":"Cao Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Cao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100633783","display_name":"Mingyi Hong","orcid":"https://orcid.org/0000-0003-1263-9365"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong, Mingyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5100734069"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7670000195503235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7670000195503235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.03319999948143959,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.03009999915957451,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7416999936103821},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.5101000070571899},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.46000000834465027},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.43059998750686646},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.4050999879837036},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.4016999900341034},{"id":"https://openalex.org/keywords/clipping","display_name":"Clipping (morphology)","score":0.3822000026702881},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.3790000081062317}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7416999936103821},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7142999768257141},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5406000018119812},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5347999930381775},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.5101000070571899},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.46000000834465027},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.43059998750686646},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.4050999879837036},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.4016999900341034},{"id":"https://openalex.org/C2776848632","wikidata":"https://www.wikidata.org/wiki/Q853463","display_name":"Clipping (morphology)","level":2,"score":0.3822000026702881},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3790000081062317},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.3625999987125397},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.33379998803138733},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.3330000042915344},{"id":"https://openalex.org/C52740198","wikidata":"https://www.wikidata.org/wiki/Q1539564","display_name":"Importance sampling","level":3,"score":0.30149999260902405},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.2782999873161316},{"id":"https://openalex.org/C2987595161","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Optimization algorithm","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C55660270","wikidata":"https://www.wikidata.org/wiki/Q5164377","display_name":"Constrained optimization","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C193254401","wikidata":"https://www.wikidata.org/wiki/Q2160088","display_name":"Robust optimization","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C111696304","wikidata":"https://www.wikidata.org/wiki/Q2303697","display_name":"Sorting","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":3,"locations":[{"id":"pmh:oai:arXiv.org:2511.20718","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.20718","pdf_url":"https://arxiv.org/pdf/2511.20718","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:doi:10.48550/arxiv.2511.20718","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2511.20718","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.20718","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2511.20718","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.20718","pdf_url":"https://arxiv.org/pdf/2511.20718","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1,119,228],"(RL)":[2],"algorithms":[3,133],"such":[4],"as":[5],"PPO":[6,192],"and":[7,34,60,63,66,75,108,117,130,140,155,171,186,193,199,204,222],"GRPO":[8],"are":[9,35,134],"widely":[10],"used":[11],"to":[12,37,136],"train":[13],"large":[14],"language":[15],"models":[16],"(LLMs)":[17],"for":[18,90,225],"multi-turn":[19,106,161,230],"agentic":[20],"tasks.":[21,175],"However,":[22],"in":[23,50,190,229],"off-policy":[24,72,112],"training":[25,184],"pipelines,":[26],"these":[27,81],"methods":[28,181],"often":[29],"exhibit":[30],"unstable":[31],"optimization":[32,59,101,142,202],"dynamics":[33],"prone":[36],"performance":[38,187],"collapse.":[39],"Through":[40],"empirical":[41],"analysis,":[42],"we":[43,83,124],"identify":[44],"two":[45,126],"fundamental":[46],"sources":[47],"of":[48,105,160],"instability":[49],"this":[51,122],"setting:":[52],"(1)~a":[53],"granularity":[54],"mismatch":[55],"between":[56],"token-level":[57],"policy":[58,100],"turn-structured":[61],"interactions,":[62],"(2)":[64],"high-variance":[65],"unreliable":[67,111],"gradient":[68,138],"updates":[69],"induced":[70],"by":[71],"importance":[73],"sampling":[74],"inaccurate":[76],"advantage":[77],"estimation.":[78],"To":[79],"address":[80],"challenges,":[82],"propose":[84],"SORL,":[85],"\\underline{S}tabilizing":[86],"\\underline{O}ff-Policy":[87],"\\underline{R}einforcement":[88],"\\underline{L}earning":[89],"Long-Horizon":[91],"Agent":[92],"Training.":[93],"SORL":[94],"introduces":[95],"principled":[96],"mechanisms":[97],"that":[98,179,214],"align":[99],"with":[102],"the":[103,215],"structure":[104],"interactions":[107],"adaptively":[109],"suppress":[110],"updates,":[113],"yielding":[114],"more":[115,200],"conservative":[116],"robust":[118],"dynamics.":[120],"Within":[121],"framework,":[123],"instantiate":[125],"stabilized":[127],"algorithms:":[128],"SO-PPO":[129,154],"SO-GRPO.":[131],"Both":[132],"designed":[135],"mitigate":[137],"variance":[139],"prevent":[141,183],"collapse":[143],"without":[144],"requiring":[145],"careful":[146],"early":[147],"stopping":[148],"or":[149,207],"heuristic":[150],"tuning.":[151],"We":[152],"evaluate":[153],"SO-GRPO":[156],"on":[157],"a":[158,219],"range":[159],"search":[162],"benchmarks,":[163],"including":[164],"general":[165,223],"question":[166,169],"answering,":[167,170],"multi-hop":[168],"medical":[172],"multiple-choice":[173],"QA":[174],"Experimental":[176],"results":[177,212],"show":[178],"both":[180],"consistently":[182],"instabilities":[185],"collapses":[188],"observed":[189],"standard":[191],"GRPO,":[194],"maintain":[195],"lower":[196],"clipping":[197],"ratios":[198],"stable":[201],"trajectories,":[203],"achieve":[205],"superior":[206],"comparable":[208],"task":[209],"performance.":[210],"These":[211],"demonstrate":[213],"proposed":[216],"algorithm":[217],"provides":[218],"practical,":[220],"scalable,":[221],"framework":[224],"stabilizing":[226],"reinforcement":[227],"LLM":[231],"agent":[232],"training.":[233]},"counts_by_year":[],"updated_date":"2026-05-07T13:39:58.223016","created_date":"2025-11-28T00:00:00"}
