{"id":"https://openalex.org/W4394867990","doi":"https://doi.org/10.48550/arxiv.2404.09336","title":"Self-Selected Attention Span for Accelerating Large Language Model Inference","display_name":"Self-Selected Attention Span for Accelerating Large Language Model Inference","publication_year":2024,"publication_date":"2024-04-14","ids":{"openalex":"https://openalex.org/W4394867990","doi":"https://doi.org/10.48550/arxiv.2404.09336"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2404.09336","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.09336","pdf_url":"https://arxiv.org/pdf/2404.09336","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2404.09336","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073863021","display_name":"Jin Tian","orcid":"https://orcid.org/0000-0001-5313-1600"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jin, Tian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048530115","display_name":"Wanzin Yazar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yazar, Wanzin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073664360","display_name":"Zifei Xu","orcid":"https://orcid.org/0000-0003-2661-517X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zifei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061847219","display_name":"Sayeh Sharify","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharify, Sayeh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100327839","display_name":"Xin Wang","orcid":"https://orcid.org/0000-0001-8246-0606"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5073863021"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9577999711036682,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.8459053039550781},{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.8132010698318481},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7806838750839233},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6080682277679443},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5474988222122192},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5246456265449524},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.5147143006324768},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4930524528026581},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.48424097895622253},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.40731891989707947},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.1938442885875702}],"concepts":[{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.8459053039550781},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.8132010698318481},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7806838750839233},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6080682277679443},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5474988222122192},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5246456265449524},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.5147143006324768},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4930524528026581},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.48424097895622253},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40731891989707947},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.1938442885875702},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2404.09336","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.09336","pdf_url":"https://arxiv.org/pdf/2404.09336","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2404.09336","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2404.09336","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2404.09336","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.09336","pdf_url":"https://arxiv.org/pdf/2404.09336","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6000000238418579,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4394867990.pdf","grobid_xml":"https://content.openalex.org/works/W4394867990.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2366403280","https://openalex.org/W1495108544","https://openalex.org/W2091301346","https://openalex.org/W3148229873","https://openalex.org/W4389760904","https://openalex.org/W2150160875","https://openalex.org/W4242223894","https://openalex.org/W4306886878","https://openalex.org/W1517524280","https://openalex.org/W2076165488"],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"can":[4],"solve":[5,90],"challenging":[6],"tasks.":[7,190],"However,":[8],"their":[9,46,180],"inference":[10,165,186],"computation":[11],"on":[12,40],"modern":[13],"GPUs":[14],"is":[15,81,120],"highly":[16],"inefficient":[17],"due":[18],"to":[19,28,44,73,84,89,98,101,122,142,149,178],"the":[20,86,91,103,112,117,146,161],"increasing":[21],"number":[22],"of":[23,79,111,145,163],"tokens":[24],"they":[25,30],"must":[26],"attend":[27,150],"as":[29],"generate":[31],"new":[32],"ones.":[33],"To":[34],"address":[35],"this":[36,156],"inefficiency,":[37],"we":[38,69],"capitalize":[39],"LLMs'":[41],"problem-solving":[42],"capabilities":[43],"optimize":[45],"own":[47],"inference-time":[48],"efficiency.":[49],"We":[50,136,152],"demonstrate":[51,153],"with":[52],"two":[53],"specific":[54],"tasks:":[55],"(a)":[56],"evaluating":[57],"complex":[58],"arithmetic":[59],"expressions":[60],"and":[61,96],"(b)":[62],"summarizing":[63],"news":[64],"articles.":[65],"For":[66],"both":[67],"tasks,":[68],"create":[70],"custom":[71,139,157],"datasets":[72],"fine-tune":[74],"an":[75,171],"LLM.":[76],"The":[77],"goal":[78],"fine-tuning":[80],"twofold:":[82],"first,":[83],"make":[85],"LLM":[87,164],"learn":[88],"evaluation":[92],"or":[93],"summarization":[94],"task,":[95],"second,":[97],"train":[99],"it":[100],"identify":[102],"minimal":[104,126],"attention":[105,127,131,181],"spans":[106,128,182],"required":[107],"for":[108],"each":[109],"step":[110],"task.":[113],"As":[114],"a":[115,138],"result,":[116],"fine-tuned":[118],"model":[119],"able":[121],"convert":[123],"these":[124],"self-identified":[125],"into":[129],"sparse":[130],"masks":[132],"on-the-fly":[133],"during":[134],"inference.":[135],"develop":[137],"CUDA":[140,158],"kernel":[141,159],"take":[143],"advantage":[144],"reduced":[147],"context":[148],"to.":[151],"that":[154,175],"using":[155],"improves":[160],"throughput":[162],"by":[166],"28%.":[167],"Our":[168],"work":[169],"presents":[170],"end-to-end":[172],"demonstration":[173],"showing":[174],"training":[176],"LLMs":[177],"self-select":[179],"speeds":[183],"up":[184],"autoregressive":[185],"in":[187],"solving":[188],"real-world":[189]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2024-04-17T00:00:00"}
