{"id":"https://openalex.org/W4416526742","doi":"https://doi.org/10.48550/arxiv.2504.03661","title":"MILLION: Mastering Long-Context LLM Inference Via Outlier-Immunized KV Product Quantization","display_name":"MILLION: Mastering Long-Context LLM Inference Via Outlier-Immunized KV Product Quantization","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4416526742","doi":"https://doi.org/10.48550/arxiv.2504.03661"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2504.03661","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.03661","pdf_url":"https://arxiv.org/pdf/2504.03661","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2504.03661","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5107269495","display_name":"Zongwu Wang","orcid":"https://orcid.org/0009-0003-2157-4927"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Zongwu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018569941","display_name":"Peng Xu","orcid":"https://orcid.org/0000-0003-0822-700X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017670541","display_name":"Fangxin Liu","orcid":"https://orcid.org/0000-0002-8769-293X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Fangxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101443061","display_name":"Yiwei Hu","orcid":"https://orcid.org/0000-0002-6713-4148"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yiwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060999547","display_name":"Qingxiao Sun","orcid":"https://orcid.org/0000-0003-2927-362X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Qingxiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009211176","display_name":"Gezi Li","orcid":"https://orcid.org/0000-0002-4101-1043"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Gezi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106084673","display_name":"Cheng Li","orcid":"https://orcid.org/0000-0002-8123-2190"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Cheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100328984","display_name":"Xuan Wang","orcid":"https://orcid.org/0000-0002-0790-5641"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071012979","display_name":"Li Jiang","orcid":"https://orcid.org/0000-0003-2724-0605"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5049487451","display_name":"Haibing Guan","orcid":"https://orcid.org/0000-0002-4714-7400"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Haibing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5107269495"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.17579999566078186,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.17579999566078186,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12620000541210175,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.12020000070333481,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.7422999739646912},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6920999884605408},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.46320000290870667},{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.3935999870300293},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.3813999891281128},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.3336000144481659},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.3244999945163727},{"id":"https://openalex.org/keywords/programmer","display_name":"Programmer","score":0.31949999928474426}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7638000249862671},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.7422999739646912},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6920999884605408},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.46320000290870667},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.40549999475479126},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.398499995470047},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.3935999870300293},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.3813999891281128},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3336000144481659},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C2778514511","wikidata":"https://www.wikidata.org/wiki/Q1374194","display_name":"Programmer","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3131999969482422},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3059999942779541},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.29809999465942383},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2964000105857849},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.2824999988079071},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.27410000562667847},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2599000036716461},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.25760000944137573}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2504.03661","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.03661","pdf_url":"https://arxiv.org/pdf/2504.03661","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2504.03661","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2504.03661","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2504.03661","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.03661","pdf_url":"https://arxiv.org/pdf/2504.03661","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,16],"(LLMs)":[3],"are":[4],"increasingly":[5],"utilized":[6],"for":[7,62,157],"complex":[8],"tasks":[9],"requiring":[10],"longer":[11],"context":[12,195],"lengths,":[13],"with":[14,150,181],"some":[15],"supporting":[17],"up":[18],"to":[19,42,66],"128K":[20],"or":[21],"1M":[22],"tokens.":[23],"This":[24],"trend,":[25],"however,":[26],"presents":[27],"significant":[28,76],"challenges":[29],"in":[30,83],"inference":[31,148,168],"speed":[32],"and":[33,50,73,154,163,184,187],"memory":[34,51],"management.":[35],"Quantization":[36],"emerges":[37],"as":[38],"a":[39,96,110,127,145],"promising":[40],"approach":[41],"address":[43],"the":[44,118],"widening":[45],"gap":[46],"between":[47],"LLM":[48],"size":[49],"capacity.":[52],"However,":[53],"traditional":[54],"quantization":[55,72,98,122,129,180],"schemes":[56],"often":[57],"yield":[58],"suboptimal":[59],"compression":[60],"results":[61,172],"KV":[63,84,102,114],"caches":[64],"due":[65],"two":[67],"key":[68],"factors:":[69],"i)":[70],"On-the-fly":[71],"de-quantization,":[74],"causing":[75],"performance":[77,191],"overhead;":[78],"ii)":[79],"Prevalence":[80],"of":[81,113,120],"outliers":[82],"values,":[85],"challenging":[86],"low-bitwidth":[87,101],"uniform":[88],"quantization.":[89,106],"To":[90],"this":[91],"end,":[92],"we":[93,108,125,143],"propose":[94],"MILLION,":[95],"novel":[97],"framework":[99,149],"achieving":[100],"cache":[103,115],"through":[104],"product":[105,133],"First,":[107],"conduct":[109],"thorough":[111],"analysis":[112],"distribution,":[116],"revealing":[117],"limitations":[119],"existing":[121],"schemes.":[123],"Second,":[124],"introduce":[126],"non-uniform":[128],"algorithm":[130],"based":[131],"on":[132],"quantization,":[134,165],"which":[135],"efficiently":[136],"compresses":[137],"data":[138],"while":[139],"preserving":[140],"accuracy.":[141],"Third,":[142],"develop":[144],"high-performance":[146],"GPU":[147],"efficient":[151],"attention":[152],"kernel":[153],"pipeline":[155],"design":[156],"MILLION":[158,175],"that":[159,174],"leverages":[160],"sparse":[161],"computation":[162],"asynchronous":[164],"significantly":[166],"enhancing":[167],"speed.":[169],"Comprehensive":[170],"evaluation":[171],"demonstrate":[173],"can":[176],"achieve":[177,188],"4":[178],"bits":[179],"trivial":[182],"perplexity":[183],"accuracy":[185],"loss,":[186],"2.09x":[189],"end-to-end":[190],"gains":[192],"at":[193,200],"32K":[194],"length.":[196],"Code":[197],"is":[198],"released":[199],"https://github.com/ZongwuWang/MILLION.":[201]},"counts_by_year":[],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
