{"id":"https://openalex.org/W4417155795","doi":"https://doi.org/10.48550/arxiv.2503.17080","title":"Seeing What Matters: Empowering CLIP with Patch Generation-to-Selection","display_name":"Seeing What Matters: Empowering CLIP with Patch Generation-to-Selection","publication_year":2025,"publication_date":"2025-03-21","ids":{"openalex":"https://openalex.org/W4417155795","doi":"https://doi.org/10.48550/arxiv.2503.17080"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2503.17080","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.17080","pdf_url":"https://arxiv.org/pdf/2503.17080","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.17080","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082756213","display_name":"Gensheng Pei","orcid":"https://orcid.org/0000-0002-7677-7487"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Pei, Gensheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052423709","display_name":"Tao Chen","orcid":"https://orcid.org/0000-0003-2051-7798"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100767800","display_name":"Yujia Wang","orcid":"https://orcid.org/0000-0002-6402-3514"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yujia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078057978","display_name":"Xinhao Cai","orcid":"https://orcid.org/0009-0009-0467-6597"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Xinhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040437528","display_name":"Xiangbo Shu","orcid":"https://orcid.org/0000-0003-4902-4663"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shu, Xiangbo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091518967","display_name":"Tianfei Zhou","orcid":"https://orcid.org/0000-0001-5475-1473"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Tianfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5027545344","display_name":"Yazhou Yao","orcid":"https://orcid.org/0000-0002-0337-9410"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Yazhou","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5082756213"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7723000049591064,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7723000049591064,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.15060000121593475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.016599999740719795,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sobel-operator","display_name":"Sobel operator","score":0.48399999737739563},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.4702000021934509},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.43369999527931213},{"id":"https://openalex.org/keywords/semantic-similarity","display_name":"Semantic similarity","score":0.391400009393692},{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.3862000107765198},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.3788999915122986},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3781999945640564},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3695000112056732},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.3677999973297119}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7883999943733215},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6682000160217285},{"id":"https://openalex.org/C30703548","wikidata":"https://www.wikidata.org/wiki/Q1757673","display_name":"Sobel operator","level":5,"score":0.48399999737739563},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.4702000021934509},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.43369999527931213},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.391400009393692},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.3862000107765198},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.3788999915122986},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3781999945640564},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37470000982284546},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3695000112056732},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3677999973297119},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.35989999771118164},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.3325999975204468},{"id":"https://openalex.org/C42023084","wikidata":"https://www.wikidata.org/wiki/Q5249231","display_name":"Decision boundary","level":3,"score":0.32919999957084656},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3176000118255615},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.29170000553131104},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.29159998893737793},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.290800005197525},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2897999882698059},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.2881999909877777},{"id":"https://openalex.org/C193536780","wikidata":"https://www.wikidata.org/wiki/Q1513153","display_name":"Edge detection","level":4,"score":0.2863999903202057},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.2782999873161316},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26260000467300415},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.26080000400543213},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C67174900","wikidata":"https://www.wikidata.org/wiki/Q178022","display_name":"Minutiae","level":4,"score":0.2533999979496002},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.25110000371932983}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2503.17080","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.17080","pdf_url":"https://arxiv.org/pdf/2503.17080","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2503.17080","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.17080","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.17080","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.17080","pdf_url":"https://arxiv.org/pdf/2503.17080","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"CLIP":[1],"model":[2],"has":[3],"demonstrated":[4],"significant":[5],"advancements":[6],"in":[7,75,114,188,197],"aligning":[8],"visual":[9,79],"and":[10,23,42,81,161,191,200],"language":[11,201],"modalities":[12],"through":[13],"large-scale":[14],"pre-training":[15],"on":[16,26,38,54],"image-text":[17],"pairs,":[18],"enabling":[19],"strong":[20],"zero-shot":[21,189],"classification":[22,190],"retrieval":[24,192],"capabilities":[25],"various":[27],"domains.":[28],"However,":[29],"CLIP's":[30,99],"training":[31,63,100],"remains":[32],"computationally":[33],"intensive,":[34],"with":[35,167],"high":[36],"demands":[37],"both":[39],"data":[40],"processing":[41],"memory.":[43],"To":[44],"address":[45],"these":[46,67],"challenges,":[47],"recent":[48],"masking":[49,112],"strategies":[50],"have":[51],"emerged,":[52],"focusing":[53],"the":[55,136,146,149,157,172],"selective":[56],"removal":[57],"of":[58,119,148],"image":[59,138],"patches":[60,121,160,164],"to":[61,97,139,175],"improve":[62],"efficiency.":[64],"Although":[65],"effective,":[66],"methods":[68],"often":[69],"compromise":[70],"key":[71],"semantic":[72,105],"information,":[73],"resulting":[74],"suboptimal":[76],"alignment":[77],"between":[78,156],"features":[80],"text":[82],"descriptions.":[83],"In":[84],"this":[85],"work,":[86],"we":[87,130],"present":[88],"a":[89,110,116,177],"concise":[90],"yet":[91],"effective":[92],"approach":[93],"called":[94],"Patch":[95],"Generation-to-Selection":[96],"enhance":[98],"efficiency":[101],"while":[102],"preserving":[103],"critical":[104],"content.":[106],"Our":[107,181],"method":[108],"introduces":[109],"gradual":[111],"process":[113,174],"which":[115],"small":[117],"set":[118],"candidate":[120,158],"is":[122],"first":[123],"pre-selected":[124],"as":[125],"potential":[126],"mask":[127,143,159],"regions.":[128],"Then,":[129],"apply":[131],"Sobel":[132],"edge":[133,142],"detection":[134],"across":[135],"entire":[137],"generate":[140],"an":[141],"that":[144],"prioritizes":[145],"retention":[147],"primary":[150],"object":[151],"areas.":[152],"Finally,":[153],"similarity":[154,179],"scores":[155],"their":[162],"neighboring":[163],"are":[165],"computed,":[166],"optimal":[168],"transport":[169],"normalization":[170],"refining":[171],"selection":[173],"ensure":[176],"balanced":[178],"matrix.":[180],"approach,":[182],"CLIP-PGS,":[183],"sets":[184],"new":[185],"state-of-the-art":[186],"results":[187],"tasks,":[193],"achieving":[194],"superior":[195],"performance":[196],"robustness":[198],"evaluation":[199],"compositionality":[202],"benchmarks.":[203]},"counts_by_year":[],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
