{"id":"https://openalex.org/W6948092331","doi":"https://doi.org/10.48550/arxiv.2502.16786","title":"SwimVG: Step-wise Multimodal Fusion and Adaption for Visual Grounding","display_name":"SwimVG: Step-wise Multimodal Fusion and Adaption for Visual Grounding","publication_year":2025,"publication_date":"2025-02-24","ids":{"openalex":"https://openalex.org/W6948092331","doi":"https://doi.org/10.48550/arxiv.2502.16786"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2502.16786","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.16786","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2502.16786","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Shi, Liangtao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shi, Liangtao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Ting","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hu, Xiantao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Xiantao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hu, Yue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yin, Quanjun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Quanjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Hong, Richang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong, Richang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10895","display_name":"Species Distribution and Climate Change","score":0.3801000118255615,"subfield":{"id":"https://openalex.org/subfields/2302","display_name":"Ecological Modeling"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10895","display_name":"Species Distribution and Climate Change","score":0.3801000118255615,"subfield":{"id":"https://openalex.org/subfields/2302","display_name":"Ecological Modeling"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12859","display_name":"Cell Image Analysis Techniques","score":0.03889999911189079,"subfield":{"id":"https://openalex.org/subfields/1304","display_name":"Biophysics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11937","display_name":"Research Data Management Practices","score":0.03319999948143959,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.6823999881744385},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.5910999774932861},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5633000135421753},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.4999000132083893},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.35929998755455017},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.349700003862381}],"concepts":[{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.6823999881744385},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6255999803543091},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.5910999774932861},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5633000135421753},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5239999890327454},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.4999000132083893},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.349700003862381},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34929999709129333},{"id":"https://openalex.org/C2777877512","wikidata":"https://www.wikidata.org/wiki/Q1116097","display_name":"Common ground","level":2,"score":0.3422999978065491},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3328000009059906},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.3156999945640564},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.29100000858306885},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2718999981880188},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27079999446868896}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2502.16786","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.16786","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2502.16786","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.16786","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5862460732460022}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Visual":[0],"grounding":[1],"aims":[2],"to":[3,61,144],"ground":[4],"an":[5],"image":[6],"region":[7],"through":[8],"natural":[9],"language,":[10],"which":[11],"heavily":[12],"relies":[13],"on":[14,150],"cross-modal":[15,84,127,140],"alignment.":[16],"Most":[17],"existing":[18],"methods":[19],"transfer":[20],"visual/linguistic":[21],"knowledge":[22],"separately":[23],"by":[24,31,111,126],"fully":[25],"fine-tuning":[26],"uni-modal":[27],"pre-trained":[28],"models,":[29],"followed":[30],"a":[32,67,114],"simple":[33],"stack":[34],"of":[35,165],"visual-language":[36],"transformers":[37],"for":[38,88,96],"multimodal":[39,69,80,97,124],"fusion.":[40,98],"However,":[41],"these":[42,63],"approaches":[43],"not":[44],"only":[45],"limit":[46],"adequate":[47],"interaction":[48],"between":[49,104],"visual":[50,89],"and":[51,71,83,107,130,136,160],"linguistic":[52],"contexts,":[53],"but":[54],"also":[55],"incur":[56],"significant":[57],"computational":[58],"costs.":[59],"Therefore,":[60],"address":[62],"issues,":[64],"we":[65],"explore":[66],"step-wise":[68,79],"fusion":[70,116,125],"adaption":[72],"framework,":[73],"namely":[74],"SwimVG.":[75],"Specifically,":[76],"SwimVG":[77,156],"proposes":[78],"prompts":[81],"(Swip)":[82],"interactive":[85],"adapters":[86],"(CIA)":[87],"grounding,":[90],"replacing":[91],"the":[92,105,139],"cumbersome":[93],"transformer":[94],"stacks":[95],"Swip":[99,129],"can":[100],"improve":[101],"{the}":[102],"alignment":[103],"vision":[106],"language":[108],"representations":[109],"step":[110],"step,":[112],"in":[113,163],"token-level":[115],"manner.":[117],"In":[118],"addition,":[119],"weight-level":[120],"CIA":[121,131],"further":[122],"promotes":[123],"interaction.":[128],"are":[132],"both":[133],"parameter-efficient":[134],"paradigms,":[135],"they":[137],"fuse":[138],"features":[141],"from":[142],"shallow":[143],"deep":[145],"layers":[146],"gradually.":[147],"Experimental":[148],"results":[149],"four":[151],"widely-used":[152],"benchmarks":[153],"demonstrate":[154],"that":[155],"achieves":[157],"remarkable":[158],"abilities":[159],"considerable":[161],"benefits":[162],"terms":[164],"efficiency.":[166],"Our":[167],"code":[168],"is":[169],"available":[170],"at":[171],"https://github.com/liuting20/SwimVG.":[172]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
