{"id":"https://openalex.org/W4415539239","doi":"https://doi.org/10.1145/3746027.3755402","title":"OV-VOD: Open-Vocabulary Video Object Detection","display_name":"OV-VOD: Open-Vocabulary Video Object Detection","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415539239","doi":"https://doi.org/10.1145/3746027.3755402"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755402","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755402","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101841289","display_name":"Zhihong Zheng","orcid":"https://orcid.org/0009-0004-4081-5703"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhihong Zheng","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"],"raw_orcid":"https://orcid.org/0009-0004-4081-5703","affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yang Cao","orcid":"https://orcid.org/0009-0008-5723-4381"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Cao","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"],"raw_orcid":"https://orcid.org/0009-0008-5723-4381","affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016259963","display_name":"Junlong Gao","orcid":"https://orcid.org/0000-0002-8734-1021"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junlong Gao","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"],"raw_orcid":"https://orcid.org/0000-0002-8734-1021","affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5044594971","display_name":"Hanzi Wang","orcid":"https://orcid.org/0000-0002-6913-9786"},"institutions":[{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hanzi Wang","raw_affiliation_strings":["Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"],"raw_orcid":"https://orcid.org/0000-0002-6913-9786","affiliations":[{"raw_affiliation_string":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China and Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101841289"],"corresponding_institution_ids":["https://openalex.org/I191208505"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28814159,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"489","last_page":"498"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.7045000195503235},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.6513000130653381},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5656999945640564},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4936000108718872},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.43799999356269836},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.38909998536109924},{"id":"https://openalex.org/keywords/viola\u2013jones-object-detection-framework","display_name":"Viola\u2013Jones object detection framework","score":0.3885999917984009},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.3880999982357025}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7972000241279602},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7566999793052673},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.7045000195503235},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.6513000130653381},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.592199981212616},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5656999945640564},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4936000108718872},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.43799999356269836},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C182521987","wikidata":"https://www.wikidata.org/wiki/Q2493877","display_name":"Viola\u2013Jones object detection framework","level":5,"score":0.3885999917984009},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3880999982357025},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.38510000705718994},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3785000145435333},{"id":"https://openalex.org/C71681937","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object-class detection","level":5,"score":0.3765999972820282},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.3684000074863434},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.31529998779296875},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C203595873","wikidata":"https://www.wikidata.org/wiki/Q25389927","display_name":"Change detection","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2743000090122223},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C32653426","wikidata":"https://www.wikidata.org/wiki/Q3813641","display_name":"Background subtraction","level":3,"score":0.25949999690055847},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755402","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755402","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2022121445","display_name":null,"funder_award_id":"U21A20514","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W2117539524","https://openalex.org/W2194775991","https://openalex.org/W2336589871","https://openalex.org/W2552900565","https://openalex.org/W2948672349","https://openalex.org/W2963150697","https://openalex.org/W2964286567","https://openalex.org/W2982242214","https://openalex.org/W2983827899","https://openalex.org/W2983943451","https://openalex.org/W3034368386","https://openalex.org/W3034467781","https://openalex.org/W3096609285","https://openalex.org/W3159619744","https://openalex.org/W3195000282","https://openalex.org/W3198377975","https://openalex.org/W4221166276","https://openalex.org/W4312310776","https://openalex.org/W4312563428","https://openalex.org/W4312747482","https://openalex.org/W4317795300","https://openalex.org/W4383899713","https://openalex.org/W4391547487","https://openalex.org/W4392172801","https://openalex.org/W4402753899","https://openalex.org/W4403182220","https://openalex.org/W4408609670"],"related_works":[],"abstract_inverted_index":{"Traditional":[0],"Video":[1,35],"Object":[2,36,143],"Detection":[3,37],"(VOD)":[4],"is":[5],"limited":[6],"by":[7,64],"pre-defined":[8,95],"closed-set":[9],"categories,":[10,51],"restricting":[11],"its":[12],"ability":[13],"to":[14,72,134,177],"detect":[15],"novel":[16,53],"objects":[17,45,91],"in":[18,46,92],"real-world":[19],"scenarios.":[20],"To":[21],"address":[22],"this":[23,78],"limitation,":[24],"we":[25,31,59,82,116,139],"make":[26],"three":[27],"key":[28],"contributions.":[29],"First,":[30],"formally":[32],"define":[33],"Open-Vocabulary":[34,86],"(Open-Vocabulary":[38],"VOD)":[39],"as":[40],"the":[41,74,100,109],"task":[42],"of":[43,102],"detecting":[44],"video":[47,165],"streams":[48],"from":[49],"open-set":[50],"including":[52],"categories":[54,97],"unseen":[55],"during":[56,159],"training.":[57],"Second,":[58],"establish":[60],"an":[61,85],"evaluation":[62],"benchmark":[63],"utilizing":[65],"existing":[66,178],"datasets":[67,166],"(LV-VIS,":[68],"BURST,":[69],"and":[70,98,112,155],"TAO)":[71],"bridge":[73],"data":[75],"gap":[76],"for":[77],"new":[79],"task.":[80],"Third,":[81],"propose":[83,140],"OV-VOD,":[84],"VOD":[87],"method":[88],"that":[89,124,150,168],"detects":[90],"videos":[93],"beyond":[94],"training":[96],"addresses":[99],"shortcomings":[101],"image-level":[103,179],"open-vocabulary":[104,180],"detectors,":[105],"which":[106],"generally":[107],"neglect":[108],"essential":[110],"temporal":[111,136],"spatial":[113,153],"information.":[114],"Specifically,":[115],"design":[117],"a":[118,131,141],"Semantic-Presence":[119],"Memory":[120],"Tracking":[121],"(SPMT)":[122],"module":[123],"propagates":[125],"object":[126,181],"features":[127],"across":[128],"frames":[129],"through":[130],"memory":[132],"bank":[133],"leverage":[135],"consistency.":[137],"Moreover,":[138],"Spatial":[142],"Relationship":[144],"Distillation":[145],"loss":[146],"(L":[147],"SR":[148],")":[149],"captures":[151],"inter-object":[152],"dependencies":[154],"enhances":[156],"knowledge":[157],"transfer":[158],"feature":[160],"distillation.":[161],"Experiments":[162],"on":[163],"multiple":[164],"demonstrate":[167],"our":[169],"OV-VOD":[170],"exhibits":[171],"superior":[172],"zero-shot":[173],"generalization":[174],"capability":[175],"compared":[176],"detection":[182],"methods.":[183]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-25T00:00:00"}
