{"id":"https://openalex.org/W4417095764","doi":"https://doi.org/10.48550/arxiv.2506.05384","title":"Q-Ponder: A Unified Training Pipeline for Reasoning-based Visual Quality Assessment","display_name":"Q-Ponder: A Unified Training Pipeline for Reasoning-based Visual Quality Assessment","publication_year":2025,"publication_date":"2025-06-03","ids":{"openalex":"https://openalex.org/W4417095764","doi":"https://doi.org/10.48550/arxiv.2506.05384"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2506.05384","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.05384","pdf_url":"https://arxiv.org/pdf/2506.05384","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.05384","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Cai, Zhuoxuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Cai, Zhuoxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100410025","display_name":"Jian Zhang","orcid":"https://orcid.org/0000-0002-9129-255X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015965681","display_name":"Xinbin Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Xinbin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062345781","display_name":"Peng-Tao Jiang","orcid":"https://orcid.org/0000-0002-1786-4943"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Peng-Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100687441","display_name":"Wen-Xiang Chen","orcid":"https://orcid.org/0000-0002-0560-8280"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Wenxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tang, Bowen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Bowen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028538006","display_name":"Lujian Yao","orcid":"https://orcid.org/0000-0002-7571-1339"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Lujian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101562761","display_name":"Qiyuan Wang","orcid":"https://orcid.org/0000-0002-7158-3886"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101906593","display_name":"Jinwen Chen","orcid":"https://orcid.org/0000-0003-0012-1438"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jinwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5032220526","display_name":"Bo Li","orcid":"https://orcid.org/0000-0002-4973-1969"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Bo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9290000200271606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9290000200271606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.010300000198185444,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8507000207901001},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.6596999764442444},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6416000127792358},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.6399999856948853},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5921000242233276},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5507000088691711},{"id":"https://openalex.org/keywords/disjoint-sets","display_name":"Disjoint sets","score":0.35249999165534973},{"id":"https://openalex.org/keywords/quality-score","display_name":"Quality Score","score":0.3481999933719635}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8507000207901001},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.772599995136261},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6941999793052673},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6722999811172485},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.6596999764442444},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6416000127792358},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.6399999856948853},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5921000242233276},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5507000088691711},{"id":"https://openalex.org/C45340560","wikidata":"https://www.wikidata.org/wiki/Q215382","display_name":"Disjoint sets","level":2,"score":0.35249999165534973},{"id":"https://openalex.org/C2779346075","wikidata":"https://www.wikidata.org/wiki/Q7268763","display_name":"Quality Score","level":3,"score":0.3481999933719635},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3237999975681305},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.29820001125335693},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C58166","wikidata":"https://www.wikidata.org/wiki/Q224821","display_name":"Fuzzy logic","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.2718000113964081},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26600000262260437},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C3020493868","wikidata":"https://www.wikidata.org/wiki/Q55631277","display_name":"Real world data","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2506.05384","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.05384","pdf_url":"https://arxiv.org/pdf/2506.05384","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2506.05384","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.05384","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.05384","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.05384","pdf_url":"https://arxiv.org/pdf/2506.05384","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"studies":[1],"demonstrate":[2],"that":[3,156],"multimodal":[4],"large":[5],"language":[6],"models":[7,38,51,143],"(MLLMs)":[8],"can":[9],"proficiently":[10],"evaluate":[11],"visual":[12,63],"quality":[13,22,41,64,163],"through":[14,107],"interpretable":[15],"assessments.":[16],"However,":[17],"existing":[18],"approaches":[19],"typically":[20],"treat":[21],"scoring":[23,135],"and":[24,68,88,137,151,192],"reasoning":[25,42,111,138],"descriptions":[26,43],"as":[27,149],"separate":[28],"tasks":[29],"with":[30,45,126],"disjoint":[31],"optimization":[32],"objectives,":[33],"leading":[34],"to":[35,132,169],"a":[36,79,85,89,104,123],"trade-off:":[37],"adept":[39],"at":[40],"struggle":[44],"precise":[46],"score":[47,164],"regression,":[48],"while":[49],"score-focused":[50],"lack":[52],"interpretability.":[53],"This":[54],"limitation":[55],"hinders":[56],"the":[57,96,118,142,195],"full":[58],"potential":[59,197],"of":[60],"MLLMs":[61],"in":[62,95,189],"assessment,":[65],"where":[66],"accuracy":[67,136,191],"interpretability":[69],"should":[70],"be":[71],"mutually":[72],"reinforcing.":[73],"To":[74],"address":[75],"this,":[76],"we":[77,99,121],"propose":[78],"unified":[80],"two-stage":[81],"training":[82],"framework":[83],"comprising":[84],"cold-start":[86],"stage":[87],"reinforcement":[90],"learning-based":[91],"fine-tuning":[92],"stage.":[93],"Specifically,":[94],"first":[97],"stage,":[98,120],"distill":[100],"high-quality":[101],"data":[102],"from":[103,145],"teacher":[105,185],"model":[106,186],"expert-designed":[108],"prompts,":[109],"initializing":[110],"capabilities":[112],"via":[113],"cross-entropy":[114],"loss":[115],"supervision.":[116],"In":[117],"second":[119],"introduce":[122],"novel":[124],"reward":[125],"Group":[127],"Relative":[128],"Policy":[129],"Optimization":[130],"(GRPO)":[131],"jointly":[133],"optimize":[134],"consistency.":[139],"We":[140],"designate":[141],"derived":[144],"these":[146],"two":[147],"stages":[148],"Q-Ponder-CI":[150],"Q-Ponder.":[152],"Extensive":[153],"experiments":[154],"show":[155],"Q-Ponder":[157,177],"achieves":[158],"state-of-the-art":[159],"(SOTA)":[160],"performance":[161],"on":[162,173],"regression":[165],"benchmarks,":[166],"delivering":[167],"up":[168],"6.5%":[170],"higher":[171],"SRCC":[172],"cross-domain":[174],"datasets.":[175],"Furthermore,":[176],"significantly":[178],"outperforms":[179],"description-based":[180],"SOTA":[181],"models,":[182],"including":[183],"its":[184],"Qwen-2.5-VL-72B,":[187],"particularly":[188],"description":[190],"reasonableness,":[193],"demonstrating":[194],"generalization":[196],"over":[198],"diverse":[199],"tasks.":[200]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
