{"id":"https://openalex.org/W6929376851","doi":"https://doi.org/10.48550/arxiv.2501.06828","title":"GeoPix: Multi-Modal Large Language Model for Pixel-level Image Understanding in Remote Sensing","display_name":"GeoPix: Multi-Modal Large Language Model for Pixel-level Image Understanding in Remote Sensing","publication_year":2025,"publication_date":"2025-01-12","ids":{"openalex":"https://openalex.org/W6929376851","doi":"https://doi.org/10.48550/arxiv.2501.06828"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2501.06828","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2501.06828","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2501.06828","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Ou, Ruizhe","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ou, Ruizhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hu, Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Jiaxin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiaxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Liu, Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0020000000949949026,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0020000000949949026,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dysgeusia","display_name":"Dysgeusia","score":0.311599999666214},{"id":"https://openalex.org/keywords/nucleofection","display_name":"Nucleofection","score":0.28790000081062317},{"id":"https://openalex.org/keywords/hyporeflexia","display_name":"Hyporeflexia","score":0.2752000093460083},{"id":"https://openalex.org/keywords/fusible-alloy","display_name":"Fusible alloy","score":0.27320000529289246},{"id":"https://openalex.org/keywords/tubulopathy","display_name":"Tubulopathy","score":0.2671000063419342},{"id":"https://openalex.org/keywords/durvalumab","display_name":"Durvalumab","score":0.26350000500679016}],"concepts":[{"id":"https://openalex.org/C2777054765","wikidata":"https://www.wikidata.org/wiki/Q6402731","display_name":"Dysgeusia","level":3,"score":0.311599999666214},{"id":"https://openalex.org/C144251240","wikidata":"https://www.wikidata.org/wiki/Q7068229","display_name":"Nucleofection","level":4,"score":0.28790000081062317},{"id":"https://openalex.org/C2777158700","wikidata":"https://www.wikidata.org/wiki/Q1419356","display_name":"Hyporeflexia","level":3,"score":0.2752000093460083},{"id":"https://openalex.org/C133074676","wikidata":"https://www.wikidata.org/wiki/Q428729","display_name":"Fusible alloy","level":2,"score":0.27320000529289246},{"id":"https://openalex.org/C2776356786","wikidata":"https://www.wikidata.org/wiki/Q1048573","display_name":"Tubulopathy","level":3,"score":0.2671000063419342},{"id":"https://openalex.org/C2777742743","wikidata":"https://www.wikidata.org/wiki/Q19904005","display_name":"Durvalumab","level":5,"score":0.26350000500679016},{"id":"https://openalex.org/C180938184","wikidata":"https://www.wikidata.org/wiki/Q2142270","display_name":"Liquation","level":3,"score":0.24979999661445618},{"id":"https://openalex.org/C145741570","wikidata":"https://www.wikidata.org/wiki/Q7251534","display_name":"Proteogenomics","level":5,"score":0.2393999993801117},{"id":"https://openalex.org/C18743360","wikidata":"https://www.wikidata.org/wiki/Q1208096","display_name":"Diafiltration","level":4,"score":0.23309999704360962},{"id":"https://openalex.org/C2778902089","wikidata":"https://www.wikidata.org/wiki/Q5608642","display_name":"Subpoena","level":2,"score":0.23180000483989716}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2501.06828","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2501.06828","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2501.06828","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2501.06828","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.41922488808631897,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi-modal":[0],"large":[1],"language":[2],"models":[3],"(MLLMs)":[4],"have":[5],"achieved":[6,70],"remarkable":[7],"success":[8],"in":[9,103,185,198,207],"image-":[10,208],"and":[11,26,119,153,165,182,194,209],"region-level":[12,210],"remote":[13],"sensing":[14],"(RS)":[15],"image":[16,21,61],"understanding":[17,62],"tasks,":[18,201],"such":[19],"as":[20],"captioning,":[22],"visual":[23,27,81],"question":[24],"answering,":[25],"grounding.":[28],"However,":[29],"existing":[30],"RS":[31,57,104,143],"MLLMs":[32],"lack":[33],"the":[34,65,73,84,91,98,114,124,128,135,147,176,192],"pixel-level":[35,142,199],"dialogue":[36],"capability,":[37],"which":[38,79],"involves":[39],"responding":[40],"to":[41,64,117,133,174],"user":[42],"instructions":[43],"with":[44,75,156,160],"segmentation":[45,93,99,200],"masks":[46,88,183],"for":[47,140],"specific":[48],"instances.":[49],"In":[50,131],"this":[51],"paper,":[52],"we":[53,145,168],"propose":[54],"GeoPix,":[55],"a":[56,76,106,170],"MLLM":[58,74],"that":[59],"extends":[60],"capabilities":[63],"pixel":[66],"level.":[67],"This":[68],"is":[69,111],"by":[71],"equipping":[72],"mask":[77,115],"predictor,":[78],"transforms":[80],"features":[82],"from":[83],"vision":[85],"encoder":[86],"into":[87,113],"conditioned":[89],"on":[90],"LLM's":[92],"token":[94],"embeddings.":[95],"To":[96],"facilitate":[97],"of":[100,137,179,196],"multi-scale":[101],"objects":[102],"imagery,":[105],"class-wise":[107,121],"learnable":[108],"memory":[109],"module":[110],"integrated":[112],"predictor":[116],"capture":[118],"store":[120],"geo-context":[122],"at":[123],"instance":[125,158],"level":[126],"across":[127],"entire":[129],"dataset.":[130],"addition,":[132],"address":[134],"absence":[136],"large-scale":[138],"datasets":[139],"training":[141,172],"MLLMs,":[144],"construct":[146],"GeoPixInstruct":[148],"dataset,":[149],"comprising":[150],"65,463":[151],"images":[152],"140,412":[154],"instances,":[155],"each":[157],"annotated":[159],"text":[161,180],"descriptions,":[162],"bounding":[163],"boxes,":[164],"masks.":[166],"Furthermore,":[167],"develop":[169],"two-stage":[171],"strategy":[173],"balance":[175],"distinct":[177],"requirements":[178],"generation":[181],"prediction":[184],"multi-modal":[186],"multi-task":[187],"optimization.":[188],"Extensive":[189],"experiments":[190],"verify":[191],"effectiveness":[193],"superiority":[195],"GeoPix":[197],"while":[202],"also":[203],"maintaining":[204],"competitive":[205],"performance":[206],"benchmarks.":[211]},"counts_by_year":[],"updated_date":"2026-03-03T08:47:05.690250","created_date":"2025-10-10T00:00:00"}
