{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:19:47Z","timestamp":1778080787842,"version":"3.51.4"},"reference-count":53,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.01934","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"20465-20474","source":"Crossref","is-referenced-by-count":19,"title":["GenZI: Zero-Shot 3D Human-Scene Interaction Generation"],"prefix":"10.1109","author":[{"given":"Lei","family":"Li","sequence":"first","affiliation":[{"name":"Technical University of Munich"}]},{"given":"Angela","family":"Dai","sequence":"additional","affiliation":[{"name":"Technical University of Munich"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Stable Diffusion v2","year":"2023"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33783-3_46"},{"key":"ref3","article-title":"SMPLer-X: Scaling up expressive human pose and shape estimation","author":"Cai","year":"2023","journal-title":"arXiv"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_2"},{"key":"ref6","article-title":"Alpha-Pose: Whole-body regional multi-person pose estimation and tracking in real-time","author":"Fang","year":"2022","journal-title":"IEEE TPAMI"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1006\/cviu.1998.0716"},{"key":"ref8","volume-title":"Statistical methods for tomographic image reconstruction","author":"Geman","year":"1987"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.2307\/1574154"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00237"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01447"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591525"},{"key":"ref13","article-title":"Prompt-to-prompt image editing with cross-attention control","author":"Hertz","year":"2023","journal-title":"ICLR"},{"key":"ref14","article-title":"Denoising diffusion probabilistic models","author":"Ho","year":"2020","journal-title":"NeurIPS"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00727"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.4324\/9781315009292"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00234"},{"key":"ref19","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"ICML"},{"key":"ref20","article-title":"BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"ICML"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01265"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02027"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818013"},{"key":"ref24","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2019","journal-title":"ICLR"},{"key":"ref25","article-title":"DPM-Solver++: Fast solver for guided sampling of diffusion probabilistic models","author":"Lu","year":"2022","journal-title":"arXiv"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2006.08.002"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322961"},{"key":"ref28","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"Paszke","year":"2019","journal-title":"NeurIPS"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"ref30","article-title":"State of the art on diffusion models for visual computing","author":"Po","year":"2023","journal-title":"arXiv"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2006.10.016"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00370"},{"key":"ref33","article-title":"Learning transferable visual models from natural language super-vision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00201"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_8"},{"key":"ref37","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","author":"Saharia","year":"2022","journal-title":"NeurIPS"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.09.002"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925867"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2005.00829.x"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0895-4"},{"key":"ref42","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01981"},{"key":"ref44","article-title":"HUMANISE: Language-conditioned human motion generation in 3D scenes","author":"Wang","year":"2022","journal-title":"NeurIPS"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02035"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02153"},{"key":"ref48","doi-asserted-by":"crossref","DOI":"10.1109\/TPAMI.2023.3271691\/mm1","article-title":"PyMAF-X: To-wards well-aligned full-body model regression from monocular images","author":"Zhang","year":"2023","journal-title":"IEEE TPAMI"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/3DV50981.2020.00074"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00623"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_18"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00589"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00478"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10657372.pdf?arnumber=10657372","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,21]],"date-time":"2024-09-21T05:53:08Z","timestamp":1726897988000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10657372\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.01934","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}