{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:30:17Z","timestamp":1777656617838,"version":"3.51.4"},"reference-count":88,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,2,26]],"date-time":"2025-02-26T00:00:00Z","timestamp":1740528000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,2,26]],"date-time":"2025-02-26T00:00:00Z","timestamp":1740528000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,2,26]]},"DOI":"10.1109\/wacv61041.2025.00220","type":"proceedings-article","created":{"date-parts":[[2025,4,8]],"date-time":"2025-04-08T17:08:13Z","timestamp":1744132093000},"page":"2195-2206","source":"Crossref","is-referenced-by-count":11,"title":["Scene-LLM: Extending Language Model for 3D Visual Reasoning"],"prefix":"10.1109","author":[{"given":"Rao","family":"Fu","sequence":"first","affiliation":[{"name":"Brown University"}]},{"given":"Jingyu","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Chicago"}]},{"given":"Xilun","family":"Chen","sequence":"additional","affiliation":[{"name":"Meta AI"}]},{"given":"Yixin","family":"Nie","sequence":"additional","affiliation":[{"name":"Meta AI"}]},{"given":"Wenhan","family":"Xiong","sequence":"additional","affiliation":[{"name":"Meta AI"}]}],"member":"263","reference":[{"key":"ref1","article-title":"AI2-THOR: An Interactive 3D Environment for Visual AI","volume":"abs\/1712.05474","year":"2017","journal-title":"ArXiv"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"ref3","first-page":"46","article-title":"Taskography: Evaluating robot task planning over large 3d scene graphs","volume-title":"Conference on Robot Learning","author":"Agia","year":"2022"},{"key":"ref4","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"arXiv preprint"},{"key":"ref5","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"ref8","first-page":"706","article-title":"A persistent spatial semantic representation for high-level natural language instruction execution","volume-title":"Conference on Robot Learning","author":"Blukis","year":"2022"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.3389\/frobt.2023.1221739"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161534"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"ref12","article-title":"Minigpt-v2: large language model as a unified interface for vision-language multitask learning","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref15","article-title":"Palm-e: An embodied multimodal language model","author":"Driess","year":"2023","journal-title":"arXiv preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"ref17","article-title":"Point-bind & point-llm: Aligning point cloud with multimodality for 3d understanding, generation, and instruction following","author":"Guo","year":"2023","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00888"},{"key":"ref19","article-title":"3d-llm: Injecting the 3d world into large language models","author":"Hong","year":"2023","journal-title":"arXiv preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"ref21","article-title":"Chat-3d v2: Bridging 3d scene and large language models with object identifiers","author":"Huang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref22","article-title":"An embodied generalist agent in 3d world","author":"Huang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"ref24","first-page":"9118","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","volume-title":"International Conference on Machine Learning","author":"Huang","year":"2022"},{"key":"ref25","article-title":"Voxposer: Composable 3d value maps for robotic manipulation with language models","author":"Huang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00188"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.066"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"ref29","article-title":"Vima: General robot manipulation with multimodal prompts","author":"Jiang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_7"},{"key":"ref32","first-page":"17976","article-title":"Modeling dynamic environments with scene graph memory","volume-title":"International Conference on Machine Learning","author":"Kurenkov","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"ref34","article-title":"Blip-2: Bootstrapping language-image pretraining with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref35","first-page":"31199","article-title":"Pretrained language models for interactive decision-making","volume":"35","author":"Li","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.1025"},{"key":"ref37","article-title":"Instruction-following agents with jointly pretrained vision-language models","author":"Liu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref38","article-title":"Visual instruction tuning","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref39","article-title":"Reflect: Summa-rizing robot experiences for failure explanation and correction","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref40","article-title":"Point-voxel cnn for efficient 3d deep learning","volume":"32","author":"Liu","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref41","article-title":"Neuro-symbolic procedural planning with commonsense prompting","author":"Lu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref42","article-title":"Sqa3d: Situated question answering in 3d scenes","author":"Ma","year":"2022","journal-title":"arXiv preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00459"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref45","article-title":"FILM: Following instructions in language with modular methods","volume-title":"International Conference on Learning Representations","author":"Min","year":"2022"},{"key":"ref46","article-title":"Mapping instructions and visual observations to actions with rein-forcement learning","author":"Misra","year":"2017","journal-title":"arXiv preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01564"},{"key":"ref48","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023","journal-title":"arXiv preprint"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00886"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00937"},{"key":"ref51","first-page":"652","article-title":"Pointnet: Deep learning on point sets for 3d classification and segmentation","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Qi","year":"2017"},{"key":"ref52","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"key":"ref53","article-title":"Habitat-matterport 3d dataset (hm3d): 1000 large-scale 3d environments for embodied ai","author":"Ramakrishnan","year":"2021","journal-title":"arXiv preprint"},{"issue":"2","key":"ref54","first-page":"3","article-title":"Hierarchical text-conditional image generation with clip latents","volume":"1","author":"Ramesh","year":"2022","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"Sayplan: Grounding large language models using 3d scene graphs for scalable task planning","author":"Rana","year":"2023","journal-title":"arXiv preprint"},{"key":"ref56","article-title":"A generalist agent","author":"Reed","year":"2022","journal-title":"arXiv preprint"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.701"},{"key":"ref58","first-page":"9512","article-title":"Object scene representation transformer","volume":"35","author":"Sajjadi","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref59","first-page":"492","article-title":"Lm-nav: Robotic navigation with large pretrained models of language, vision, and action","volume-title":"Conference on Robot Learning","author":"Shah","year":"2023"},{"key":"ref60","first-page":"6087","article-title":"Deep marching tetrahedra: a hybrid representation for high-resolution 3d shape synthesis","volume":"34","author":"Shen","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01075"},{"key":"ref62","article-title":"Alfworld: Aligning text and embodied environments for interactive learning","author":"Shridhar","year":"2020","journal-title":"arXiv preprint"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00280"},{"key":"ref65","article-title":"Embodied bert: A transformer model for embodied, language-guided visual task completion","author":"Suglia","year":"2021","journal-title":"arXiv preprint"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00687"},{"key":"ref67","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref68","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","volume":"34","author":"Tsimpoukelli","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3592131"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073608"},{"key":"ref71","article-title":"Visionllm: Large language model is also an openended decoder for vision-centric tasks","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref72","article-title":"Chat-3d: Data-efficiently tuning large language model for universal dialogue of 3d scenes","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref73","article-title":"Language models meet world models: Embodied experiences enhance language models","author":"Xiang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref74","article-title":"Translating natural language to planning goals with large-language models","author":"Xie","year":"2023","journal-title":"arXiv preprint"},{"key":"ref75","article-title":"Mm-react: Prompting chatgpt for multimodal reasoning and action","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3225327"},{"key":"ref77","article-title":"Ferret: Refer and ground anything anywhere at any granularity","author":"You","year":"2023","journal-title":"arXiv preprint"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.368"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.368"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_19"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1145\/3381866"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"ref85","article-title":"Jarvis: A neuro-symbolic commonsense reasoning framework for conversational embodied agents","author":"Zheng","year":"2022","journal-title":"arXiv preprint"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00472"},{"key":"ref87","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref88","article-title":"3d-vista: Pretrained transformer for 3d vision and text alignment","author":"Zhu","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2025,2,26]]},"end":{"date-parts":[[2025,3,6]]}},"container-title":["2025 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10943266\/10943193\/10943341.pdf?arnumber=10943341","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T05:27:00Z","timestamp":1744176420000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10943341\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,26]]},"references-count":88,"URL":"https:\/\/doi.org\/10.1109\/wacv61041.2025.00220","relation":{},"subject":[],"published":{"date-parts":[[2025,2,26]]}}}