{"id":"https://openalex.org/W4396815596","doi":"https://doi.org/10.48550/arxiv.2405.04532","title":"QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving","display_name":"QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving","publication_year":2024,"publication_date":"2024-05-07","ids":{"openalex":"https://openalex.org/W4396815596","doi":"https://doi.org/10.48550/arxiv.2405.04532"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2405.04532","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.04532","pdf_url":"https://arxiv.org/pdf/2405.04532","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2405.04532","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037235064","display_name":"Yujun Lin","orcid":"https://orcid.org/0000-0001-8313-4642"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lin, Yujun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101819029","display_name":"Haotian Tang","orcid":"https://orcid.org/0000-0001-6580-3881"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Haotian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101467057","display_name":"Yang Shang","orcid":"https://orcid.org/0000-0002-5836-5016"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Shang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065492534","display_name":"Zhekai Zhang","orcid":"https://orcid.org/0000-0002-9777-3792"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhekai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009724487","display_name":"Guangxuan Xiao","orcid":"https://orcid.org/0000-0002-7182-9284"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Guangxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040877128","display_name":"Chuang Gan","orcid":"https://orcid.org/0000-0003-4031-5886"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gan, Chuang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5070926896","display_name":"Song Han","orcid":"https://orcid.org/0000-0002-4186-7618"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Song","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5037235064"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11249","display_name":"Wireless Power Transfer Systems","score":0.9412000179290771,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11249","display_name":"Wireless Power Transfer Systems","score":0.9412000179290771,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.5456767678260803},{"id":"https://openalex.org/keywords/co-design","display_name":"Co-design","score":0.46970388293266296},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4304051995277405},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.18222948908805847},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.11882302165031433}],"concepts":[{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.5456767678260803},{"id":"https://openalex.org/C180962459","wikidata":"https://www.wikidata.org/wiki/Q1038171","display_name":"Co-design","level":2,"score":0.46970388293266296},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4304051995277405},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.18222948908805847},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.11882302165031433}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2405.04532","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.04532","pdf_url":"https://arxiv.org/pdf/2405.04532","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2405.04532","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2405.04532","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2405.04532","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.04532","pdf_url":"https://arxiv.org/pdf/2405.04532","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4396815596.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Quantization":[0],"can":[1,145,237],"accelerate":[2,30],"large":[3],"language":[4],"model":[5],"(LLM)":[6],"inference.":[7],"Going":[8],"beyond":[9],"INT8":[10],"quantization,":[11],"the":[12,102,117,160,169,194,206,250],"research":[13],"community":[14],"is":[15,99,115,124,259],"actively":[16],"exploring":[17],"even":[18,239],"lower":[19],"precision,":[20],"such":[21],"as":[22],"INT4.":[23],"Nonetheless,":[24],"state-of-the-art":[25],"INT4":[26,51],"quantization":[27,52,78,143],"techniques":[28],"only":[29],"low-batch,":[31],"edge":[32],"LLM":[33,43,120,254],"inference,":[34],"failing":[35],"to":[36,157,183,230],"deliver":[37],"performance":[38,195],"gains":[39],"in":[40,96,137,150],"large-batch,":[41],"cloud-based":[42],"serving.":[44],"We":[45,187],"uncover":[46],"a":[47,76,202],"critical":[48],"issue:":[49],"existing":[50],"methods":[53],"suffer":[54],"from":[55],"significant":[56],"runtime":[57],"overhead":[58,149],"(20-90%)":[59],"when":[60],"dequantizing":[61],"either":[62],"weights":[63],"or":[64],"partial":[65],"sums":[66],"on":[67,122,129,215,218,224,227,234,244],"GPUs.":[68],"To":[69],"address":[70],"this":[71,135],"challenge,":[72],"we":[73,140,154,172],"introduce":[74,141],"QoQ,":[75],"W4A8KV4":[77],"algorithm":[79],"with":[80],"4-bit":[81,86,165],"weight,":[82],"8-bit":[83],"activation,":[84],"and":[85,177,220],"KV":[87,166],"cache.":[88],"QoQ":[89,98,138],"stands":[90],"for":[91],"quattuor-octo-quattuor,":[92],"which":[93],"represents":[94],"4-8-4":[95],"Latin.":[97],"implemented":[100],"by":[101,127,164,198,213,222,256],"QServe":[103,114,170,204,233,247],"inference":[104],"library":[105],"that":[106,116,144],"achieves":[107],"measured":[108],"speedup.":[109],"The":[110],"key":[111],"insight":[112],"driving":[113],"efficiency":[118],"of":[119,180,211,253],"serving":[121,209,255],"GPUs":[123],"critically":[125],"influenced":[126],"operations":[128],"low-throughput":[130],"CUDA":[131],"cores.":[132],"Building":[133],"upon":[134],"insight,":[136],"algorithm,":[139],"progressive":[142],"allow":[146],"low":[147],"dequantization":[148,185],"W4A8":[151],"GEMM.":[152],"Additionally,":[153],"develop":[155],"SmoothAttention":[156],"effectively":[158,248],"mitigate":[159],"accuracy":[161],"degradation":[162],"incurred":[163],"quantization.":[167,200],"In":[168],"system,":[171],"perform":[173],"compute-aware":[174],"weight":[175],"reordering":[176],"take":[178],"advantage":[179],"register-level":[181],"parallelism":[182],"reduce":[184],"latency.":[186],"also":[188],"make":[189],"fused":[190],"attention":[191],"memory-bound,":[192],"harnessing":[193],"gain":[196],"brought":[197],"KV4":[199],"As":[201],"result,":[203],"improves":[205],"maximum":[207],"achievable":[208],"throughput":[210,241],"Llama-3-8B":[212],"1.2x":[214],"A100,":[216,225],"1.4x":[217],"L40S;":[219],"Qwen1.5-72B":[221],"2.4x":[223],"3.5x":[226],"L40S,":[228],"compared":[229],"TensorRT-LLM.":[231],"Remarkably,":[232],"L40S":[235],"GPU":[236],"achieve":[238],"higher":[240],"than":[242],"TensorRT-LLM":[243],"A100.":[245],"Thus,":[246],"reduces":[249],"dollar":[251],"cost":[252],"3x.":[257],"Code":[258],"available":[260],"at":[261],"https://github.com/mit-han-lab/omniserve.":[262]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
