{"id":"https://openalex.org/W4379919681","doi":"https://doi.org/10.48550/arxiv.2306.03413","title":"DVIS: Decoupled Video Instance Segmentation Framework","display_name":"DVIS: Decoupled Video Instance Segmentation Framework","publication_year":2023,"publication_date":"2023-06-06","ids":{"openalex":"https://openalex.org/W4379919681","doi":"https://doi.org/10.48550/arxiv.2306.03413"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2306.03413","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2306.03413","pdf_url":"https://arxiv.org/pdf/2306.03413","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2306.03413","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100375792","display_name":"Tao Zhang","orcid":"https://orcid.org/0000-0002-2980-6281"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101211399","display_name":"Xingye Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Xingye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101242732","display_name":"Yu Wu","orcid":"https://orcid.org/0009-0004-4319-1991"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031588692","display_name":"Shunping Ji","orcid":"https://orcid.org/0000-0002-3088-1481"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Shunping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104219038","display_name":"Xuebo Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xuebo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100368775","display_name":"Yuan Zhang","orcid":"https://orcid.org/0009-0000-8003-0725"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101928510","display_name":"Pengfei Wan","orcid":"https://orcid.org/0000-0001-7225-565X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Pengfei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100375792"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9887999892234802,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9837999939918518,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7901029586791992},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6990387439727783},{"id":"https://openalex.org/keywords/decoupling","display_name":"Decoupling (probability)","score":0.5903874039649963},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5524277091026306},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5279737114906311},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5085631608963013},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.46412065625190735},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4319571554660797},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4240909814834595},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.324135959148407}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7901029586791992},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6990387439727783},{"id":"https://openalex.org/C205606062","wikidata":"https://www.wikidata.org/wiki/Q5249645","display_name":"Decoupling (probability)","level":2,"score":0.5903874039649963},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5524277091026306},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5279737114906311},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5085631608963013},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.46412065625190735},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4319571554660797},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4240909814834595},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.324135959148407},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C133731056","wikidata":"https://www.wikidata.org/wiki/Q4917288","display_name":"Control engineering","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2306.03413","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2306.03413","pdf_url":"https://arxiv.org/pdf/2306.03413","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2306.03413","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2306.03413","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2306.03413","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2306.03413","pdf_url":"https://arxiv.org/pdf/2306.03413","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4379919681.pdf","grobid_xml":"https://content.openalex.org/works/W4379919681.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2366107444","https://openalex.org/W4388145910","https://openalex.org/W2381570729","https://openalex.org/W1976205134","https://openalex.org/W4248336175","https://openalex.org/W2031260042","https://openalex.org/W2391445434","https://openalex.org/W3009369890","https://openalex.org/W4312490297","https://openalex.org/W2152950565"],"abstract_inverted_index":{"Video":[0],"instance":[1],"segmentation":[2],"(VIS)":[3],"is":[4,227],"a":[5,85,142,219],"critical":[6],"task":[7],"with":[8,222],"diverse":[9],"applications,":[10],"including":[11],"autonomous":[12],"driving":[13],"and":[14,23,49,99,123,146,164,174,180,188,200,216],"video":[15],"editing.":[16],"Existing":[17],"methods":[18,36,71,170],"often":[19],"underperform":[20],"on":[21,108,132,177,218],"complex":[22],"long":[24],"videos":[25],"in":[26,161],"real":[27],"world,":[28],"primarily":[29],"due":[30],"to":[31,59,149,193],"two":[32,109],"factors.":[33],"Firstly,":[34],"offline":[35],"are":[37,184,203],"limited":[38],"by":[39,90,171],"the":[40,51,60,104,125,133,151,167,178,185,194,197,209],"tightly-coupled":[41],"modeling":[42],"paradigm,":[43],"which":[44,183],"treats":[45],"all":[46],"frames":[47],"equally":[48],"disregards":[50],"interdependencies":[52],"between":[53],"adjacent":[54],"frames.":[55],"Consequently,":[56],"this":[57],"leads":[58],"introduction":[61],"of":[62,76,103,128,208],"excessive":[63],"noise":[64],"during":[65,121,138],"long-term":[66,115],"temporal":[67,77,129,147,201],"alignment.":[68],"Secondly,":[69],"online":[70],"suffer":[72],"from":[73],"inadequate":[74],"utilization":[75,127],"information.":[78],"To":[79],"tackle":[80],"these":[81],"challenges,":[82],"we":[83],"propose":[84],"decoupling":[86,105,195],"strategy":[87,106],"for":[88,213],"VIS":[89,163],"dividing":[91],"it":[92],"into":[93],"three":[94],"independent":[95],"sub-tasks:":[96],"segmentation,":[97],"tracking,":[98,122],"refinement.":[100,139],"The":[101,225],"efficacy":[102],"relies":[107],"crucial":[110],"elements:":[111],"1)":[112],"attaining":[113],"precise":[114],"alignment":[116,136],"outcomes":[117,137],"via":[118],"frame-by-frame":[119],"association":[120],"2)":[124],"effective":[126],"information":[130],"predicated":[131],"aforementioned":[134],"accurate":[135],"We":[140],"introduce":[141],"novel":[143],"referring":[144,198],"tracker":[145,199],"refiner":[148,202],"construct":[150],"\\textbf{D}ecoupled":[152],"\\textbf{VIS}":[153],"framework":[154],"(\\textbf{DVIS}).":[155],"DVIS":[156],"achieves":[157],"new":[158],"SOTA":[159,169],"performance":[160],"both":[162],"VPS,":[165],"surpassing":[166],"current":[168],"7.3":[172],"AP":[173],"9.6":[175],"VPQ":[176],"OVIS":[179],"VIPSeg":[181],"datasets,":[182],"most":[186],"challenging":[187],"realistic":[189],"benchmarks.":[190],"Moreover,":[191],"thanks":[192],"strategy,":[196],"super":[204],"light-weight":[205],"(only":[206],"1.69\\%":[207],"segmenter":[210],"FLOPs),":[211],"allowing":[212],"efficient":[214],"training":[215],"inference":[217],"single":[220],"GPU":[221],"11G":[223],"memory.":[224],"code":[226],"available":[228],"at":[229],"\\href{https://github.com/zhang-tao-whu/DVIS}{https://github.com/zhang-tao-whu/DVIS}.":[230]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
