{"id":"https://openalex.org/W7101583646","doi":"https://doi.org/10.48550/arxiv.2510.22706","title":"IGGT: Instance-Grounded Geometry Transformer for Semantic 3D Reconstruction","display_name":"IGGT: Instance-Grounded Geometry Transformer for Semantic 3D Reconstruction","publication_year":2025,"publication_date":"2025-10-26","ids":{"openalex":"https://openalex.org/W7101583646","doi":"https://doi.org/10.48550/arxiv.2510.22706"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2510.22706","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.22706","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2510.22706","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Li, Hao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zou, Zhengyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Zhengyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Fangfu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Fangfu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Xuanyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xuanyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hong, Fangzhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong, Fangzhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cao, Yukang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yukang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lan, Yushi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lan, Yushi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Manyuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Manyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yu, Gang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Gang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Dingwen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Dingwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Liu, Ziwei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziwei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.6687999963760376,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.6687999963760376,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.09220000356435776,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.06069999933242798,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5228999853134155},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4717000126838684},{"id":"https://openalex.org/keywords/adaptability","display_name":"Adaptability","score":0.45010000467300415},{"id":"https://openalex.org/keywords/3d-reconstruction","display_name":"3D reconstruction","score":0.44940000772476196},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.4205999970436096},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4120999872684479},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.3652999997138977},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.35499998927116394},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.34950000047683716},{"id":"https://openalex.org/keywords/active-perception","display_name":"Active perception","score":0.34220001101493835}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6679999828338623},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5871999859809875},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5228999853134155},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4717000126838684},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.45010000467300415},{"id":"https://openalex.org/C109950114","wikidata":"https://www.wikidata.org/wiki/Q4464732","display_name":"3D reconstruction","level":2,"score":0.44940000772476196},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.4205999970436096},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4120999872684479},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3702999949455261},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3652999997138977},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.35499998927116394},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.34950000047683716},{"id":"https://openalex.org/C2776010242","wikidata":"https://www.wikidata.org/wiki/Q4677575","display_name":"Active perception","level":3,"score":0.34220001101493835},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.3416000008583069},{"id":"https://openalex.org/C3019007443","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3d model","level":2,"score":0.33899998664855957},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.33230000734329224},{"id":"https://openalex.org/C2777897806","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3D modeling","level":2,"score":0.3305000066757202},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.32030001282691956},{"id":"https://openalex.org/C131979681","wikidata":"https://www.wikidata.org/wiki/Q1899648","display_name":"Point cloud","level":2,"score":0.3125},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C104065381","wikidata":"https://www.wikidata.org/wiki/Q1002535","display_name":"Geometric modeling","level":2,"score":0.30169999599456787},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2939999997615814},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.28850001096725464},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.2761000096797943},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C141379421","wikidata":"https://www.wikidata.org/wiki/Q6094427","display_name":"Iterative reconstruction","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2563000023365021},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.25189998745918274},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.251800000667572},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2510.22706","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.22706","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2510.22706","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.22706","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Humans":[0],"naturally":[1],"perceive":[2],"the":[3,45,88,114],"geometric":[4,141],"structure":[5],"and":[6,18,37,59,92,120,143,188],"semantic":[7],"content":[8],"of":[9,21,53,156],"a":[10,127,137,161,178,194],"3D":[11,35,66,78,163],"world":[12],"as":[13],"intertwined":[14],"dimensions,":[15],"enabling":[16],"coherent":[17,162],"accurate":[19],"understanding":[20,41,67],"complex":[22],"scenes.":[23],"However,":[24],"most":[25],"prior":[26],"approaches":[27],"prioritize":[28],"training":[29],"large":[30,109],"geometry":[31],"models":[32,79],"for":[33,116],"low-level":[34],"reconstruction":[36,119],"treat":[38],"high-level":[39],"spatial":[40,118],"in":[42,64],"isolation,":[43],"overlooking":[44],"crucial":[46],"interplay":[47],"between":[48],"these":[49],"two":[50],"fundamental":[51],"aspects":[52],"3D-scene":[54],"analysis,":[55],"thereby":[56],"limiting":[57,93],"generalization":[58],"leading":[60],"to":[61,87,95,112,135],"poor":[62],"performance":[63],"downstream":[65,96],"tasks.":[68,97],"Recent":[69],"attempts":[70],"have":[71],"mitigated":[72],"this":[73,99,172],"issue":[74],"by":[75],"simply":[76],"aligning":[77],"with":[80,140,165,181,193],"specific":[81],"language":[82],"models,":[83],"thus":[84],"restricting":[85],"perception":[86],"aligned":[89],"model's":[90],"capacity":[91],"adaptability":[94],"In":[98],"paper,":[100],"we":[101,125,174],"propose":[102],"InstanceGrounded":[103],"Geometry":[104],"Transformer":[105],"(IGGT),":[106],"an":[107],"end-to-end":[108],"unified":[110,138],"transformer":[111],"unify":[113],"knowledge":[115],"both":[117],"instance-level":[121,190],"contextual":[122],"understanding.":[123],"Specifically,":[124],"design":[126],"3D-Consistent":[128],"Contrastive":[129],"Learning":[130],"strategy":[131],"that":[132],"guides":[133],"IGGT":[134],"encode":[136],"representation":[139,152],"structures":[142],"instance-grounded":[144],"clustering":[145],"through":[146],"only":[147],"2D":[148,157],"visual":[149,158],"inputs.":[150],"This":[151],"supports":[153],"consistent":[154],"lifting":[155],"inputs":[159],"into":[160],"scene":[164],"explicitly":[166],"distinct":[167],"object":[168],"instances.":[169],"To":[170],"facilitate":[171],"task,":[173],"further":[175],"construct":[176],"InsScene-15K,":[177],"large-scale":[179],"dataset":[180],"high-quality":[182],"RGB":[183],"images,":[184],"poses,":[185],"depth":[186],"maps,":[187],"3D-consistent":[189],"mask":[191],"annotations":[192],"novel":[195],"data":[196],"curation":[197],"pipeline.":[198]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-29T00:00:00"}
