{"id":"https://openalex.org/W4400600814","doi":"https://doi.org/10.48550/arxiv.2407.07356","title":"Video In-context Learning: Autoregressive Transformers are Zero-Shot Video Imitators","display_name":"Video In-context Learning: Autoregressive Transformers are Zero-Shot Video Imitators","publication_year":2024,"publication_date":"2024-07-10","ids":{"openalex":"https://openalex.org/W4400600814","doi":"https://doi.org/10.48550/arxiv.2407.07356"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2407.07356","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.07356","pdf_url":"https://arxiv.org/pdf/2407.07356","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.07356","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111196099","display_name":"Wentao Zhang","orcid":"https://orcid.org/0009-0003-7828-6096"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Wentao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055122985","display_name":"Junliang Guo","orcid":"https://orcid.org/0000-0001-8360-5483"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Junliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103151411","display_name":"Tianyu He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Tianyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108050232","display_name":"Li Zhao","orcid":"https://orcid.org/0000-0001-5169-9438"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112773421","display_name":"Linli Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Linli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103079061","display_name":"Jiang Bian","orcid":"https://orcid.org/0000-0002-9034-4960"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bian, Jiang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5111196099"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.45509999990463257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.45509999990463257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6093255281448364},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4928629398345947},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.3205454349517822},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.12387290596961975}],"concepts":[{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6093255281448364},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4928629398345947},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3205454349517822},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.12387290596961975},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2407.07356","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.07356","pdf_url":"https://arxiv.org/pdf/2407.07356","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2407.07356","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2407.07356","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.07356","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.07356","pdf_url":"https://arxiv.org/pdf/2407.07356","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"People":[0],"interact":[1,32],"with":[2,33,132],"the":[3,34,60,68,76,84,92,104,133,138,146,150],"real-world":[4],"largely":[5],"dependent":[6],"on":[7,50],"visual":[8,23,43],"signal,":[9],"which":[10],"are":[11],"ubiquitous":[12],"and":[13,74,115,141,154],"illustrate":[14],"detailed":[15],"demonstrations.":[16],"In":[17],"this":[18],"paper,":[19],"we":[20,37,57,107,142],"explore":[21],"utilizing":[22],"signals":[24],"as":[25,40],"a":[26,41,54,63,71],"new":[27],"interface":[28],"for":[29],"models":[30,85,123,155],"to":[31,66,78,86],"environment.":[35],"Specifically,":[36],"choose":[38],"videos":[39],"representative":[42],"signal.":[44],"And":[45],"by":[46,90,137],"training":[47],"autoregressive":[48],"Transformers":[49],"video":[51,94,127],"datasets":[52],"in":[53,95],"self-supervised":[55],"objective,":[56],"find":[58],"that":[59,121,129,145],"model":[61],"emerges":[62],"zero-shot":[64],"capability":[65],"infer":[67],"semantics":[69,77],"from":[70],"demonstration":[72,93,139],"video,":[73],"imitate":[75],"an":[79,96],"unseen":[80,88],"scenario.":[81],"This":[82],"allows":[83],"perform":[87],"tasks":[89],"watching":[91],"in-context":[97],"manner,":[98],"without":[99],"further":[100],"fine-tuning.":[101],"To":[102],"validate":[103],"imitation":[105,147],"capacity,":[106],"design":[108],"various":[109],"evaluation":[110],"metrics":[111],"including":[112],"both":[113],"objective":[114],"subjective":[116],"measures.":[117],"The":[118],"results":[119],"show":[120,144],"our":[122],"can":[124],"generate":[125],"high-quality":[126],"clips":[128],"accurately":[130],"align":[131],"semantic":[134],"guidance":[135],"provided":[136],"videos,":[140],"also":[143],"capacity":[148],"follows":[149],"scaling":[151],"law.":[152],"Code":[153],"have":[156],"been":[157],"open-sourced.":[158]},"counts_by_year":[],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2024-07-13T00:00:00"}
