{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:46:29Z","timestamp":1777657589977,"version":"3.51.4"},"reference-count":75,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.00828","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"8671-8681","source":"Crossref","is-referenced-by-count":25,"title":["TRIP: Temporal Residual Learning with Image Noise Prior for Image-to-Video Diffusion Models"],"prefix":"10.1109","author":[{"given":"Zhongwei","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China,Hefei,China"}]},{"given":"Fuchen","family":"Long","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc."}]},{"given":"Yingwei","family":"Pan","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc."}]},{"given":"Zhaofan","family":"Qiu","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc."}]},{"given":"Ting","family":"Yao","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc."}]},{"given":"Yang","family":"Cao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,Hefei,China"}]},{"given":"Tao","family":"Mei","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc."}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref3","article-title":"In-structPix2Pix: Learning to Follow Image Editing Instructions","author":"Brooks","year":"2023","journal-title":"In CVPR"},{"key":"ref4","article-title":"Stable-Video: Text-driven Consistency-aware Diffusion Video Editing","author":"Chai","year":"2023","journal-title":"In ICCV"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01305"},{"key":"ref6","article-title":"VidEdit: Zero-Shot and Spatially Aware Text-Driven Video Editing","author":"Couairon","year":"2023","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"Diffusion Models Beat GANs on Image Synthesis","volume-title":"In NeurIPS","author":"Dhariwal","year":"2022"},{"key":"ref8","article-title":"DreamArtist: Towards Controllable One-Shot Text-to-Image Generation via Positive-Negative Prompt-Tuning","author":"Dong","year":"2022","journal-title":"arXiv preprint"},{"key":"ref9","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR46437.2021.00374","article-title":"Stochastic Image-to- Video Synthesis using cINN s","volume-title":"In CVPR","author":"Dorkenwald","year":"2021"},{"key":"ref10","article-title":"An Image is Worth 16\u00d716 Words: Transformers for Image Recognition at Scale","author":"Dosovitskiy","year":"2021","journal-title":"In ICLR"},{"key":"ref11","article-title":"An-imating Landscape: Self-Supervised Learning of Decoupled Motion and Appearance for Single-Image Video Synthesis","volume-title":"TOG","author":"Endo","year":"2019"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01459"},{"key":"ref13","article-title":"An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion","author":"Gal","year":"2023","journal-title":"In ICLR"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"ref15","article-title":"TokenFlow: Consistent Diffusion Features for Consistent Video Editing","author":"Geyer","year":"2023","journal-title":"arXiv preprint"},{"key":"ref16","article-title":"Generative Adversarial Networks","author":"Goodfellow","year":"2014","journal-title":"In NeurIPS"},{"key":"ref17","article-title":"Seer: Language Instructed Video Prediction with Latent Diffusion Models","author":"Gu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref18","article-title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning","author":"Guo","year":"2023","journal-title":"arXiv preprint"},{"key":"ref19","article-title":"Flexible Diffusion Modeling of Long Videos","author":"Harvey","year":"2022","journal-title":"In NeurIPS"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref21","article-title":"Animate-A- Story: Sto-rytelling with Retrieval-Augmented Video Generation","author":"He","year":"2023","journal-title":"arXiv preprint"},{"key":"ref22","article-title":"Prompt-to-Prompt Image Editing with Cross-Attention Control","author":"Hertz","year":"2023","journal-title":"In ICLR"},{"key":"ref23","article-title":"GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium","author":"Heusel","year":"2017","journal-title":"In NeuIPS"},{"key":"ref24","article-title":"Denoising Dif-fusion Probabilistic Models","author":"Ho","year":"2020","journal-title":"In NeurIPS"},{"key":"ref25","article-title":"Imagen Video: High Definition Video Generation with Diffusion Models","author":"Ho","year":"2022","journal-title":"In CVPR"},{"key":"ref26","article-title":"Video Dif-fusion Models","author":"Ho","year":"2022","journal-title":"In NeurIPS"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00575"},{"key":"ref28","article-title":"CogVideo: Large-Scale Pretraining for Text-to-Video Generation via Transformers","author":"Hong","year":"2023","journal-title":"In ICLR"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref30","article-title":"A New Large Scale Dy-namic Texture Dataset with Application to Convnet Under-standing","author":"Isma","year":"2018","journal-title":"In ECCV"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_37"},{"key":"ref35","article-title":"Contextual Transformer Networks for Visual Recognition","author":"Li","year":"2022","journal-title":"IEEE Trans. on PAMI"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00319"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_28"},{"key":"ref38","author":"Long","year":"2024","journal-title":"Video-Drafter: Content-Consistent Multi -Scene Video Generation with LLM"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00984"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00365"},{"key":"ref41","article-title":"Stochastic Variational Video Prediction","author":"Mohammad","year":"2018","journal-title":"In ICLR"},{"key":"ref42","article-title":"T21-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models","author":"Mou","year":"2023","journal-title":"arXiv preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"ref44","article-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models","author":"Nichol","year":"2022","journal-title":"In ICML"},{"key":"ref45","article-title":"CoDeF: Content Deformation Fields for Temporally Consistent Video Processing","author":"Ouyang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591513"},{"key":"ref47","article-title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","author":"Podell","year":"2023","journal-title":"arXiv preprint"},{"key":"ref48","article-title":"Learning Transferable Visual Models From Natural Language Supervision","author":"Radford","year":"2021","journal-title":"In ICML"},{"key":"ref49","article-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","author":"Ramesh","year":"2022","journal-title":"arXiv preprint"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"ref52","article-title":"Make-a- Video: Text-to- Video Generation without Text-Video Data","author":"Singer","year":"2023","journal-title":"In ICLR"},{"key":"ref53","article-title":"Denoising Diffusion Implicit Models","author":"Song","year":"2021","journal-title":"In ICLR"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref55","article-title":"FVD: A new Metric for Video Generation","volume-title":"In ICLR Deep-GenStruct Workshop","author":"Unterthiner","year":"2019"},{"key":"ref56","article-title":"Attention Is All You Need","author":"Vaswani","year":"2017","journal-title":"In NeurIPS"},{"key":"ref57","author":"Von Platen","journal-title":"Diffusers: State-Of-The-Art Diffusion Models"},{"key":"ref58","article-title":"Gen-L-Video: Multi-Text to Long Video Generation via Temporal Co- Denoising","author":"Wang","year":"2023","journal-title":"ar Xiv preprint"},{"key":"ref59","article-title":"ModelScope Text-to- Video Technical Report","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref60","article-title":"VideoComposer: Compositional Video Synthesis with Motion Controllability","volume-title":"In NeurIPS","author":"Wang","year":"2023"},{"key":"ref61","article-title":"Latent Image Animator: Learning to Animate Images via Latent Space Navigation","author":"Wang","year":"2022","journal-title":"In ICLR"},{"key":"ref62","article-title":"LEO: Generative Latent Image Animator for Human Video Synthesis","author":"Wang","year":"2023","journal-title":"ar Xiv preprint"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"ref64","article-title":"A Survey on Video Diffusion Models","author":"Xing","year":"2023","journal-title":"arXiv preprint"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00748"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00251"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref68","article-title":"Understanding and Improving Layer Normalization","author":"Xu","year":"2019","journal-title":"In NeurIPS"},{"key":"ref69","article-title":"MagicProp: Diffusion-based Video Editing via Motion-aware Appearance Propagation","author":"Yan","year":"2023","journal-title":"ar Xiv preprint"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.3390\/e25101469"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19806-9_19"},{"key":"ref72","article-title":"DragNUWA: Fine-grained Control in Video Generation by Integrating Text, Image, and Trajectory","author":"Yin","year":"2023","journal-title":"arXiv preprint"},{"key":"ref73","article-title":"MagicAvatar: Multimodal Avatar Generation and Animation","author":"Zhang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10656358.pdf?arnumber=10656358","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,21]],"date-time":"2024-09-21T05:44:33Z","timestamp":1726897473000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10656358\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.00828","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}