{"id":"https://openalex.org/W7128551650","doi":"https://doi.org/10.48550/arxiv.2602.08683","title":"OneVision-Encoder: Codec-Aligned Sparsity as a Foundational Principle for Multimodal Intelligence","display_name":"OneVision-Encoder: Codec-Aligned Sparsity as a Foundational Principle for Multimodal Intelligence","publication_year":2026,"publication_date":"2026-02-09","ids":{"openalex":"https://openalex.org/W7128551650","doi":"https://doi.org/10.48550/arxiv.2602.08683"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.08683","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125594031","display_name":"Feilong Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tang, Feilong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123224636","display_name":"Xiang An","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"An, Xiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063890395","display_name":"Yunyao Yan","orcid":"https://orcid.org/0009-0005-0830-1830"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Yunyao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102498940","display_name":"Yin Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125549532","display_name":"Bin Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Bin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125582582","display_name":"Kaicheng Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Kaicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125492872","display_name":"Yifei Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yifei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125493344","display_name":"Yuanhan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yuanhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125575892","display_name":"Chunyuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chunyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095102135","display_name":"Shikun Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Shikun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035590797","display_name":"Changrui Chen","orcid":"https://orcid.org/0000-0002-1324-7454"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Changrui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112749105","display_name":"Huajie Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Huajie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125545630","display_name":"Ming Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Ming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125494181","display_name":"Manyuan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Manyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125518536","display_name":"Bo Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Bo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024836571","display_name":"Ziyong Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Ziyong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125519060","display_name":"Ziwei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125528492","display_name":"Zongyuan Ge","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Zongyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125566705","display_name":"Jiankang Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Jiankang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":19,"corresponding_author_ids":["https://openalex.org/A5125594031"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6699000000953674,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6699000000953674,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0551999993622303,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.04089999943971634,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6377999782562256},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.474700003862381},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.4189000129699707},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.37610000371932983},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.3659999966621399},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.35100001096725464},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.32510000467300415},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.3231000006198883},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.3206000030040741}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8120999932289124},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6769999861717224},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6377999782562256},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.474700003862381},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.4189000129699707},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3946000039577484},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.38029998540878296},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.37610000371932983},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3659999966621399},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.3206000030040741},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.30239999294281006},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.296099990606308},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2924000024795532},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.2702000141143799},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.26429998874664307}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.08683","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.08683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.08683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.08683","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.6400437951087952}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Hypothesis.":[0],"Artificial":[1],"general":[2],"intelligence":[3],"is,":[4],"at":[5],"its":[6,20],"core,":[7],"a":[8,145,153,184,240,246],"compression":[9,12],"problem.":[10],"Effective":[11],"demands":[13],"resonance:":[14],"deep":[15],"learning":[16],"scales":[17],"best":[18],"when":[19],"architecture":[21],"aligns":[22],"with":[23,93,152],"the":[24,28,32,52,73,94,125],"fundamental":[25,33],"structure":[26,109],"of":[27,97,127,232],"data.":[29,221],"These":[30],"are":[31,46,182,187],"principles.":[34],"Yet,":[35],"modern":[36],"vision":[37,198],"architectures":[38,92],"have":[39],"strayed":[40],"from":[41],"these":[42],"truths:":[43],"visual":[44,86,108,217,251],"signals":[45],"highly":[47],"redundant,":[48],"while":[49],"discriminative":[50],"information,":[51],"surprise,":[53],"is":[54,150,239],"sparse.":[55],"Current":[56],"models":[57],"process":[58],"dense":[59],"pixel":[60],"grids":[61],"uniformly,":[62],"wasting":[63],"vast":[64],"compute":[65],"on":[66,72,124,223],"static":[67],"background":[68],"rather":[69],"than":[70,160],"focusing":[71],"predictive":[74,107],"residuals":[75],"that":[76,83],"define":[77],"motion":[78,170],"and":[79,136,149,169,180,203,209,219],"meaning.":[80,112],"We":[81],"argue":[82],"to":[84,121],"solve":[85],"understanding,":[87],"we":[88],"must":[89],"align":[90],"our":[91,176],"information-theoretic":[95],"principles":[96],"video,":[98,208],"i.e.,":[99],"Codecs.":[100],"Method.":[101],"OneVision-Encoder":[102,143],"encodes":[103],"video":[104,224],"by":[105],"compressing":[106],"into":[110,192],"semantic":[111,163],"By":[113],"adopting":[114],"Codec":[115],"Patchification,":[116],"OV-Encoder":[117,227,244],"abandons":[118],"uniform":[119],"computation":[120],"focus":[122],"exclusively":[123],"3.1%-25%":[126],"regions":[128],"rich":[129],"in":[130],"signal":[131],"entropy.":[132],"To":[133],"unify":[134],"spatial":[135],"temporal":[137],"reasoning":[138],"under":[139],"irregular":[140],"token":[141],"layouts,":[142],"employs":[144],"shared":[146],"3D":[147],"RoPE":[148],"trained":[151],"large-scale":[154],"cluster":[155],"discrimination":[156],"objective":[157],"over":[158,234],"more":[159],"one":[161],"million":[162],"concepts,":[164],"jointly":[165],"capturing":[166],"object":[167],"permanence":[168],"dynamics.":[171],"Evidence.":[172],"The":[173],"results":[174],"validate":[175],"core":[177],"hypothesis:":[178],"efficiency":[179],"accuracy":[181],"not":[183],"trade-off;":[185],"they":[186],"positively":[188],"correlated.":[189],"When":[190],"integrated":[191],"LLM,":[193],"it":[194],"consistently":[195],"outperforms":[196],"strong":[197],"backbones":[199],"such":[200],"as":[201,245],"Qwen3-ViT":[202],"SigLIP2":[204],"across":[205],"16":[206],"image,":[207],"document":[210],"understanding":[211,225],"benchmarks,":[212],"despite":[213],"using":[214],"substantially":[215],"fewer":[216],"tokens":[218],"pretraining":[220],"Notably,":[222],"tasks,":[226],"achieves":[228],"an":[229],"average":[230],"improvement":[231],"4.1%":[233],"Qwen3-ViT.":[235],"Codec-aligned,":[236],"patch-level":[237],"sparsity":[238],"foundational":[241],"principle,":[242],"enabling":[243],"scalable":[247],"engine":[248],"for":[249],"next-generation":[250],"generalists.":[252]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-02-11T00:00:00"}
