{"id":"https://openalex.org/W4389650400","doi":"https://doi.org/10.48550/arxiv.2312.05639","title":"JITSPMM: Just-in-Time Instruction Generation for Accelerated Sparse Matrix-Matrix Multiplication","display_name":"JITSPMM: Just-in-Time Instruction Generation for Accelerated Sparse Matrix-Matrix Multiplication","publication_year":2023,"publication_date":"2023-12-09","ids":{"openalex":"https://openalex.org/W4389650400","doi":"https://doi.org/10.48550/arxiv.2312.05639"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2312.05639","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.05639","pdf_url":"https://arxiv.org/pdf/2312.05639","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2312.05639","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060623990","display_name":"Qiang Fu","orcid":"https://orcid.org/0000-0002-0594-8832"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fu, Qiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026169135","display_name":"Thomas B. Rolinger","orcid":"https://orcid.org/0000-0001-8383-4737"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rolinger, Thomas B.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5002254350","display_name":"H. Howie Huang","orcid":"https://orcid.org/0000-0001-8588-7680"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, H. Howie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5060623990"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.9839000105857849,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8635767698287964},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6735218167304993},{"id":"https://openalex.org/keywords/simd","display_name":"SIMD","score":0.6268039345741272},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.6071965098381042},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5397515892982483},{"id":"https://openalex.org/keywords/instruction-set","display_name":"Instruction set","score":0.49572455883026123},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.41930705308914185},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.4164816737174988},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.34154775738716125},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.22523435950279236}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8635767698287964},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6735218167304993},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.6268039345741272},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.6071965098381042},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5397515892982483},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.49572455883026123},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.41930705308914185},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.4164816737174988},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.34154775738716125},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.22523435950279236},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2312.05639","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.05639","pdf_url":"https://arxiv.org/pdf/2312.05639","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2312.05639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2312.05639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2312.05639","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.05639","pdf_url":"https://arxiv.org/pdf/2312.05639","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4389650400.pdf","grobid_xml":"https://content.openalex.org/works/W4389650400.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2111180768","https://openalex.org/W2162270818","https://openalex.org/W1992352827","https://openalex.org/W2994245508","https://openalex.org/W4242172182","https://openalex.org/W2082875307","https://openalex.org/W4237780868","https://openalex.org/W4285302443","https://openalex.org/W2019451907","https://openalex.org/W4229042072"],"abstract_inverted_index":{"Achieving":[0],"high":[1],"performance":[2,185],"for":[3,35,55,125],"Sparse":[4],"MatrixMatrix":[5],"Multiplication":[6],"(SpMM)":[7],"has":[8],"received":[9],"increasing":[10],"research":[11],"attention,":[12],"especially":[13],"on":[14,104],"multi-core":[15,105],"CPUs,":[16],"due":[17],"to":[18,80,100,127,150,164,169,178],"the":[19,39,74,113,137,156,203,212],"large":[20],"input":[21],"data":[22,168],"size":[23],"in":[24],"applications":[25],"such":[26],"as":[27],"graph":[28],"neural":[29],"networks":[30],"(GNNs).":[31],"Most":[32],"existing":[33,198],"solutions":[34],"SpMM":[36,56,81,102,126,199,214],"computation":[37,103],"follow":[38],"aheadof-time":[40],"(AOT)":[41],"compilation":[42,54],"approach,":[43],"which":[44],"compiles":[45],"a":[46,93,144,184],"program":[47],"entirely":[48],"before":[49],"it":[50,191],"is":[51,82],"executed.":[52],"AOT":[53,193],"faces":[57],"three":[58,120],"key":[59],"limitations:":[60],"unnecessary":[61],"memory":[62,171],"access,":[63],"additional":[64],"branch":[65],"overhead,":[66],"and":[67,173,189,231],"redundant":[68],"instructions.":[69],"These":[70],"limitations":[71],"stem":[72],"from":[73],"fact":[75],"that":[76,223],"crucial":[77],"information":[78],"pertaining":[79],"not":[83],"known":[84],"until":[85],"runtime.":[86],"In":[87],"this":[88],"paper,":[89],"we":[90],"propose":[91],"JITSPMM,":[92],"just-in-time":[94],"(JIT)":[95],"assembly":[96,115],"code":[97,116],"generation":[98,117],"framework":[99],"accelerated":[101],"CPUs":[106],"with":[107,136,207],"SIMD":[108,176],"extensions.":[109],"First,":[110],"JITSPMM":[111,142,160,188,224],"integrates":[112],"JIT":[114],"technique":[118],"into":[119],"widely-used":[121],"workload":[122,130],"division":[123],"methods":[124],"achieve":[128],"balanced":[129],"distribution":[131],"among":[132],"CPU":[133],"threads.":[134],"Next,":[135],"availability":[138],"of":[139,187,229],"runtime":[140],"information,":[141],"employs":[143,174],"novel":[145],"technique,":[146],"coarse-grain":[147],"column":[148],"merging,":[149],"maximize":[151],"instruction-level":[152],"parallelism":[153],"by":[154,217],"unrolling":[155],"performance-critical":[157],"loop.":[158],"Furthermore,":[159],"intelligently":[161],"allocates":[162],"registers":[163],"cache":[165],"frequently":[166],"accessed":[167],"minimizing":[170],"accesses,":[172],"selected":[175],"instructions":[177],"enhance":[179],"arithmetic":[180],"throughput.":[181],"We":[182],"conduct":[183],"evaluation":[186],"compare":[190],"two":[192],"baselines.":[194],"The":[195,209],"first":[196],"involves":[197],"implementations":[200],"compiled":[201],"using":[202],"Intel":[204,218],"icc":[205],"compiler":[206],"auto-vectorization.":[208],"second":[210],"utilizes":[211],"highly-optimized":[213],"routine":[215],"provided":[216],"MKL.":[219],"Our":[220],"results":[221],"show":[222],"provides":[225],"an":[226],"average":[227],"improvement":[228],"3.8x":[230],"1.4x,":[232],"respectively.":[233]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
