{"id":"https://openalex.org/W6891898869","doi":"https://doi.org/10.48550/arxiv.2503.06960","title":"A Data-Centric Revisit of Pre-Trained Vision Models for Robot Learning","display_name":"A Data-Centric Revisit of Pre-Trained Vision Models for Robot Learning","publication_year":2025,"publication_date":"2025-03-10","ids":{"openalex":"https://openalex.org/W6891898869","doi":"https://doi.org/10.48550/arxiv.2503.06960"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2503.06960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.06960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2503.06960","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wen, Xin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wen, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhao, Bingchen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Bingchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Yilun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yilun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Pang, Jiangmiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pang, Jiangmiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Qi, Xiaojuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi, Xiaojuan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.2361000031232834,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.2361000031232834,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1256999969482422,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.08699999749660492,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5889000296592712},{"id":"https://openalex.org/keywords/robotics","display_name":"Robotics","score":0.5770999789237976},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5601999759674072},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5357999801635742},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4993000030517578},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.47429999709129333},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.46070000529289246},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.46070000529289246},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.37400001287460327},{"id":"https://openalex.org/keywords/machine-vision","display_name":"Machine vision","score":0.3450999855995178}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7294999957084656},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6690000295639038},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5889000296592712},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.5770999789237976},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5601999759674072},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5357999801635742},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4993000030517578},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.47429999709129333},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.46070000529289246},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.46070000529289246},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4334999918937683},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.37400001287460327},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.3450999855995178},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.34310001134872437},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.32749998569488525},{"id":"https://openalex.org/C2983761899","wikidata":"https://www.wikidata.org/wiki/Q604674","display_name":"Robot vision","level":4,"score":0.3156999945640564},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.28349998593330383},{"id":"https://openalex.org/C188888258","wikidata":"https://www.wikidata.org/wiki/Q7353390","display_name":"Robot learning","level":4,"score":0.28110000491142273},{"id":"https://openalex.org/C200220432","wikidata":"https://www.wikidata.org/wiki/Q7936208","display_name":"Vision science","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C28063669","wikidata":"https://www.wikidata.org/wiki/Q7167042","display_name":"Perceptual system","level":3,"score":0.25679999589920044},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.25119999051094055},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2506999969482422},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.25029999017715454},{"id":"https://openalex.org/C2776010242","wikidata":"https://www.wikidata.org/wiki/Q4677575","display_name":"Active perception","level":3,"score":0.24740000069141388},{"id":"https://openalex.org/C162947575","wikidata":"https://www.wikidata.org/wiki/Q2005645","display_name":"Social robot","level":5,"score":0.24629999697208405},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.24210000038146973},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.24009999632835388},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.23340000212192535},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.22859999537467957},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.22529999911785126},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.22130000591278076},{"id":"https://openalex.org/C22033958","wikidata":"https://www.wikidata.org/wiki/Q7167036","display_name":"Perceptual learning","level":3,"score":0.21969999372959137},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.21610000729560852},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.20389999449253082},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.1995999962091446},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.19509999454021454},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.1835000067949295},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.18289999663829803},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.1826999932527542}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2503.06960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.06960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2503.06960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.06960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Pre-trained":[0],"vision":[1],"models":[2,167],"(PVMs)":[3],"are":[4,168],"fundamental":[5],"to":[6,48,58,70,92,98],"modern":[7],"robotics,":[8],"yet":[9],"their":[10,45],"optimal":[11],"configuration":[12],"remains":[13],"unclear.":[14],"Through":[15],"systematic":[16],"evaluation,":[17],"we":[18,78],"find":[19],"that":[20,55,83],"while":[21],"DINO":[22],"and":[23,30,122,133,145,162,166],"iBOT":[24],"outperform":[25],"MAE":[26],"across":[27],"visuomotor":[28],"control":[29],"perception":[31],"tasks,":[32],"they":[33],"struggle":[34],"when":[35],"trained":[36],"on":[37,118],"non-(single-)object-centric":[38],"(NOC)":[39],"data--a":[40],"limitation":[41],"strongly":[42],"correlated":[43],"with":[44,152],"diminished":[46],"ability":[47,57],"learn":[49],"object-centric":[50,60,85],"representations.":[51],"This":[52],"investigation":[53],"indicates":[54],"the":[56,63,68,94,100],"form":[59],"representations":[61,86,132],"from":[62],"non-object-centric":[64],"robotics":[65],"dataset":[66],"is":[67],"key":[69],"success":[71],"for":[72,110],"PVMs.":[73],"Motivated":[74],"by":[75,87],"this":[76],"discovery,":[77],"designed":[79],"SlotMIM,":[80],"a":[81,89],"method":[82,156],"induces":[84],"introducing":[88],"semantic":[90],"bottleneck":[91],"reduce":[93],"number":[95],"of":[96,102],"prototypes":[97],"encourage":[99],"emergence":[101],"objectness":[103],"as":[104,106],"well":[105],"cross-view":[107],"consistency":[108],"regularization":[109],"encouraging":[111],"multiview":[112],"invariance.":[113],"Our":[114,164],"experiments":[115],"encompass":[116],"pre-training":[117],"object-centric,":[119],"scene-centric,":[120],"web-crawled,":[121],"ego-centric":[123],"data.":[124],"Across":[125],"all":[126],"settings,":[127],"our":[128,155],"approach":[129],"learns":[130],"transferrable":[131],"achieves":[134],"significant":[135],"improvements":[136],"over":[137],"prior":[138],"work":[139],"in":[140],"image":[141],"recognition,":[142],"scene":[143],"understanding,":[144],"robot":[146],"learning":[147],"evaluations.":[148],"When":[149],"scaled":[150],"up":[151],"million-scale":[153],"datasets,":[154],"also":[157],"demonstrates":[158],"superior":[159],"data":[160],"efficiency":[161],"scalability.":[163],"code":[165],"publicly":[169],"available":[170],"at":[171],"https://github.com/CVMI-Lab/SlotMIM.":[172]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
