{"id":"https://openalex.org/W4400720185","doi":"https://doi.org/10.48550/arxiv.2407.10937","title":"IDOL: Unified Dual-Modal Latent Diffusion for Human-Centric Joint Video-Depth Generation","display_name":"IDOL: Unified Dual-Modal Latent Diffusion for Human-Centric Joint Video-Depth Generation","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4400720185","doi":"https://doi.org/10.48550/arxiv.2407.10937"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2407.10937","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.10937","pdf_url":"https://arxiv.org/pdf/2407.10937","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.10937","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024925731","display_name":"Yuanhao Zhai","orcid":"https://orcid.org/0000-0002-3277-3329"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhai, Yuanhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074764224","display_name":"Kevin Lin","orcid":"https://orcid.org/0000-0002-1236-9847"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Kevin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100657555","display_name":"Linjie Li","orcid":"https://orcid.org/0000-0003-0867-8863"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Linjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100913271","display_name":"Chung-Ching Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Chung-Ching","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023824529","display_name":"Jianfeng Wang","orcid":"https://orcid.org/0000-0002-7323-7143"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jianfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050209478","display_name":"Zhengyuan Yang","orcid":"https://orcid.org/0000-0002-5808-0889"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhengyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003875781","display_name":"David Doermann","orcid":"https://orcid.org/0000-0003-1639-4561"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Doermann, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085245110","display_name":"Junsong Yuan","orcid":"https://orcid.org/0000-0002-7901-8793"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Junsong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048295582","display_name":"Zicheng Liu","orcid":"https://orcid.org/0000-0002-2342-2340"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5001425662","display_name":"Lijuan Wang","orcid":"https://orcid.org/0000-0001-5318-7911"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Lijuan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5024925731"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.6795490980148315},{"id":"https://openalex.org/keywords/dual","display_name":"Dual (grammatical number)","score":0.6718804836273193},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6253873705863953},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.5197895765304565},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5116684436798096},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3986419141292572},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37220966815948486},{"id":"https://openalex.org/keywords/art","display_name":"Art","score":0.159587562084198},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.14391779899597168},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.14099529385566711},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.13119298219680786},{"id":"https://openalex.org/keywords/structural-engineering","display_name":"Structural engineering","score":0.09373465180397034}],"concepts":[{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.6795490980148315},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.6718804836273193},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6253873705863953},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.5197895765304565},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5116684436798096},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3986419141292572},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37220966815948486},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.159587562084198},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.14391779899597168},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.14099529385566711},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.13119298219680786},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.09373465180397034},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2407.10937","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.10937","pdf_url":"https://arxiv.org/pdf/2407.10937","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2407.10937","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2407.10937","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.10937","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.10937","pdf_url":"https://arxiv.org/pdf/2407.10937","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1099903527","display_name":null,"funder_award_id":"HR001120C0124","funder_id":"https://openalex.org/F4320332180","funder_display_name":"Defense Advanced Research Projects Agency"}],"funders":[{"id":"https://openalex.org/F4320332180","display_name":"Defense Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"},{"id":"https://openalex.org/F4320332815","display_name":"Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400720185.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W1996130883","https://openalex.org/W2748574964","https://openalex.org/W2888483922","https://openalex.org/W2317351040","https://openalex.org/W2379392295","https://openalex.org/W3160965418","https://openalex.org/W4396737233","https://openalex.org/W613940353","https://openalex.org/W4285447065","https://openalex.org/W2367747139"],"abstract_inverted_index":{"Significant":[0],"advances":[1],"have":[2,35],"been":[3],"made":[4],"in":[5,184],"human-centric":[6,55],"video":[7,77,92,131,156,187],"generation,":[8,80],"yet":[9],"the":[10,38,73,101,108,130,151,155,161,171],"joint":[11,56,91],"video-depth":[12,57,117],"generation":[13,70],"problem":[14],"remains":[15],"underexplored.":[16],"Most":[17],"existing":[18,182],"monocular":[19],"depth":[20,79,94,133,162,190],"estimation":[21],"methods":[22,34,183],"may":[23],"not":[24],"generalize":[25],"well":[26],"to":[27,67,113,138,149],"synthesized":[28],"images":[29],"or":[30],"videos,":[31],"and":[32,41,71,78,93,104,132,173,189],"multi-view-based":[33],"difficulty":[36],"controlling":[37],"human":[39],"appearance":[40],"motion.":[42],"In":[43],"this":[44],"work,":[45],"we":[46,81,120],"present":[47],"IDOL":[48,60],"(unIfied":[49],"Dual-mOdal":[50],"Latent":[51],"diffusion)":[52],"for":[53,90],"high-quality":[54],"generation.":[58],"Our":[59],"consists":[61],"of":[62,154,160,186],"two":[63],"novel":[64],"designs.":[65],"First,":[66],"enable":[68],"dual-modal":[69,85],"maximize":[72],"information":[74,110],"exchange":[75],"between":[76,129],"propose":[82,121],"a":[83,87,97,115,122,142],"unified":[84],"U-Net,":[86],"parameter-sharing":[88],"framework":[89],"denoising,":[95,163],"wherein":[96],"modality":[98],"label":[99],"guides":[100],"denoising":[102,157],"target,":[103],"cross-modal":[105],"attention":[106],"enables":[107],"mutual":[109],"flow.":[111],"Second,":[112],"ensure":[114],"precise":[116],"spatial":[118,166],"alignment,":[119],"motion":[123,135],"consistency":[124,128,145],"loss":[125,146],"that":[126,159],"enforces":[127],"feature":[134],"fields,":[136],"leading":[137],"harmonized":[139],"outputs.":[140],"Additionally,":[141],"cross-attention":[143,152],"map":[144,153],"is":[147],"applied":[148],"align":[150],"with":[158],"further":[164],"facilitating":[165],"alignment.":[167],"Extensive":[168],"experiments":[169],"on":[170],"TikTok":[172],"NTU120":[174],"datasets":[175],"show":[176],"our":[177],"superior":[178],"performance,":[179],"significantly":[180],"surpassing":[181],"terms":[185],"FVD":[188],"accuracy.":[191]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2024-07-17T00:00:00"}
