{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:19:31Z","timestamp":1764587971520,"version":"3.37.3"},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2019,1,25]],"date-time":"2019-01-25T00:00:00Z","timestamp":1548374400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572508","61672526","61402488"],"award-info":[{"award-number":["61572508","61672526","61402488"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["2017YFB0202003"],"award-info":[{"award-number":["2017YFB0202003"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2019,8]]},"DOI":"10.1007\/s11227-018-2694-x","type":"journal-article","created":{"date-parts":[[2019,1,25]],"date-time":"2019-01-25T09:57:18Z","timestamp":1548410238000},"page":"4710-4730","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Application-aware NoC management in GPUs multitasking"],"prefix":"10.1007","volume":"75","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7758-9771","authenticated-orcid":false,"given":"Zhen","family":"Xu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xia","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiying","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Canqun","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,1,25]]},"reference":[{"key":"2694_CR1","unstructured":"Nvidia (2009) NVIDIA\u2019s next generation CUDA compute architecture: Fermi. http:\/\/www.nvidia.com\/content\/PDF\/fermi_white_papers\/P.Glaskowsky_NVIDIA\u2019s_Fermi-The_First_Complete_GPU_Architecture.pdf . Accessed July 2018"},{"key":"2694_CR2","unstructured":"Nvidia (2016) NVIDIA GP100 Pascal architecture. White paper. http:\/\/www.nvidia.com\/object\/pascal-architecture-whitepaper.html . Accessed July 2018"},{"key":"2694_CR3","doi-asserted-by":"publisher","first-page":"278","DOI":"10.1109\/JETCAS.2012.2193936","volume":"2","author":"K Sewell","year":"2012","unstructured":"Sewell K, Dreslinski RG, Manville T, Satpathy S, Pinckney N, Blake G, Cieslak M, Das R, Wenisch TF, Sylvester D, Blaauw D, Mudge T (2012) Swizzle-switch networks for many-core systems. IEEE J Emerg Sel Top Circuits Syst 2:278\u2013294","journal-title":"IEEE J Emerg Sel Top Circuits Syst"},{"key":"2694_CR4","doi-asserted-by":"crossref","unstructured":"Bakhoda A, Kim J, Aamodt TM (2010) Throughput-effective on-chip networks for manycore accelerators. In: Proceedings of the International Symposium on Microarchitecture (MICRO), pp 421\u2013432","DOI":"10.1109\/MICRO.2010.50"},{"key":"2694_CR5","doi-asserted-by":"crossref","unstructured":"Kim H, Kim J, Seo W, Cho Y, Ryu S (2012) Providing cost-effective on-chip network bandwidth in GPGPUs. In: Proceedings of the International Conference on Computer Design (ICCD), pp 407\u2013412","DOI":"10.1109\/ICCD.2012.6378671"},{"key":"2694_CR6","doi-asserted-by":"crossref","unstructured":"Jang H, Kim J, Gratz P, Yum KH, Kim EJ (2015) Bandwidth-efficient on-chip interconnect designs for GPGPUs. In: Proceedings of the Design Automation Conference (DAC), pp 9:1\u20139:6","DOI":"10.1145\/2744769.2744803"},{"key":"2694_CR7","doi-asserted-by":"crossref","unstructured":"Zhao X, Ma S, Li C, Eeckhout L, Wang Z (2016) A heterogeneous low-cost and low-latency ring-chain network for GPGPUs. In: Proceedings of the International Conference on Computer Design (ICCD), pp 472\u2013479","DOI":"10.1109\/ICCD.2016.7753329"},{"key":"2694_CR8","doi-asserted-by":"crossref","unstructured":"Adriaens JT, Compton K, Kim NS, Schulte MJ (2012) The case for GPGPU spatial multitasking. In: Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA), pp 1\u201312","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"2694_CR9","unstructured":"Nvidia (2017) NVIDIA Tesla V100 GPU architecture the world\u2019s most advanced data center GPU. White paper. http:\/\/www.nvidia.com\/object\/volta-architecture-whitepaper.html"},{"key":"2694_CR10","doi-asserted-by":"crossref","unstructured":"Jog A, Kayiran O, Kesten T, Pattnaik A, Bolotin E, Chatterjee N, Keckler SW, Kandemir MT, Das CR (2015) Anatomy of GPU memory system for multi-application execution. In: Proceedings of the 2015 International Symposium on Memory Systems, MEMSYS","DOI":"10.1145\/2818950.2818979"},{"key":"2694_CR11","doi-asserted-by":"crossref","unstructured":"Park JJK, Park Y, Mahlke S (2015) Chimera: collaborative preemption for multitasking on a shared GPU. In: Proceedings of the International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), pp 593\u2013606","DOI":"10.1145\/2694344.2694346"},{"key":"2694_CR12","doi-asserted-by":"crossref","unstructured":"Wang B, Yu W, Sun X-H, Wang X (2015) DaCache: memory divergence-aware GPU cache management. In: Proceedings of the International Conference on Supercomputing (ICS), pp 89\u201398","DOI":"10.1145\/2751205.2751239"},{"key":"2694_CR13","doi-asserted-by":"crossref","unstructured":"Sethia A, Jamshidi DA, Mahlke S (2015) Mascar: speeding up GPU warps by reducing memory pitstops. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 174\u2013185","DOI":"10.1109\/HPCA.2015.7056031"},{"key":"2694_CR14","doi-asserted-by":"crossref","unstructured":"Abts D, Enright\u00a0Jerger ND, Kim J, Gibson D, Lipasti MH (2009) Achieving predictable performance through better memory controller placement in many-core CMPs. In: Proceedings of the International Symposium on Computer Architecture, pp 451\u2013461","DOI":"10.1145\/1555815.1555810"},{"key":"2694_CR15","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-031-01755-1","volume-title":"On-chip networks","author":"N\u00a0E Jerger","year":"2017","unstructured":"Jerger N\u00a0E, Krishna T, Peh L (2017) On-chip networks, 2nd edn. Morgan & Claypool Publishers, Williston","edition":"2"},{"key":"2694_CR16","doi-asserted-by":"crossref","unstructured":"Tanasic I, Gelado I, Cabezas J, Ramirez A, Navarro N, Valero M (2014) Enabling preemptive multiprogramming on GPUs. In: Proceeding of the International Symposium on Computer Architecture (ISCA), pp 193\u2013204","DOI":"10.1145\/2678373.2665702"},{"key":"2694_CR17","unstructured":"Rezazad M, Sarbazi-azad H (2005) The effect of virtual channel organization on the performance of interconnection networks. In: Proceedings of the International Parallel and Distributed Processing Symposium (IPDPS)"},{"key":"2694_CR18","doi-asserted-by":"crossref","unstructured":"Lee J, Kim H (2012) TAP: a TLP-aware cache management policy for a CPU-GPU heterogeneous architecture. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 1\u201312","DOI":"10.1109\/HPCA.2012.6168947"},{"key":"2694_CR19","doi-asserted-by":"crossref","unstructured":"Grauer-Gray S, Xu L, Searles R, Ayalasomayajula S, Cavazos J (2012) Auto-tuning a high-level language targeted to GPU codes. In: Innovative Parallel Computing (InPar), pp 1\u201310","DOI":"10.1109\/InPar.2012.6339595"},{"key":"2694_CR20","doi-asserted-by":"crossref","unstructured":"He B, Fang W, Luo Q, Govindaraju NK, Wang T (2008) Mars: a MapReduce framework on graphics processors. In: Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT), pp 260\u2013269","DOI":"10.1145\/1454115.1454152"},{"key":"2694_CR21","doi-asserted-by":"crossref","unstructured":"Che S, Boyer M, Meng J, Tarjan D, Sheaffer JW, Lee S-H, Skadron K (2009) Rodinia: a benchmark suite for heterogeneous computing. In: Proceedings of the International Symposium on Workload Characterization (IISWC), pp 44\u201354","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"2694_CR22","unstructured":"NVIDIA CUDA SDK Code Samples. https:\/\/developer.nvidia.com\/cuda-downloads"},{"key":"2694_CR23","doi-asserted-by":"crossref","unstructured":"Bakhoda A, Yuan GL, Fung WWL, Wong H, Aamodt TM (2009) Analyzing CUDA workloads using a detailed GPU simulator. In: Proceeding of the International Symposium on Performance Analysis of Systems and Software (ISPASS), pp 163\u2013174","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"2694_CR24","unstructured":"Stratton JA, Rodrigues C, Sung I-J, Obeid N, Chang L-W, Anssari N, Liu GD, Hwu WMW (2012) Parboil: a revised benchmark suite for scientific and commercial throughput computing. Technical report"},{"key":"2694_CR25","doi-asserted-by":"crossref","unstructured":"Wang Z, Yang J, Melhem R, Childers B, Zhang Y, Guo M (2016) Simultaneous multikernel GPU: multi-tasking throughput processors via fine-grained sharing. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 358\u2013369","DOI":"10.1109\/HPCA.2016.7446078"},{"key":"2694_CR26","doi-asserted-by":"crossref","unstructured":"Xu Q, Jeon H, Kim K, Ro WW, Annavaram M (2016) Warped-slicer: efficient intra-SM slicing through dynamic resource partitioning for GPU multiprogramming. In: Proceedings of the International Symposium on Computer Architecture (ISCA), pp 230\u2013242","DOI":"10.1145\/3007787.3001161"},{"key":"2694_CR27","doi-asserted-by":"crossref","unstructured":"Zhao X, Wang Z, Eeckhout L (2018) Classification-driven search for effective SM partitioning in GPU multitasking. In: Proceedings of the International Conference on Supercomputing (ICS)","DOI":"10.1145\/3205289.3205311"},{"issue":"3","key":"2694_CR28","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1109\/MM.2008.44","volume":"28","author":"S Eyerman","year":"2008","unstructured":"Eyerman S, Eeckhout L (2008) System-level performance metrics for multiprogram workloads. IEEE Micro 28(3):42\u201353","journal-title":"IEEE Micro"},{"key":"2694_CR29","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1111\/j.1467-8659.1986.tb00296.x","volume":"5","author":"HR Arabnia","year":"1986","unstructured":"Arabnia HR, Oliver MA (1986) Fast operations on raster images with SIMD machine architectures. Comput Graph Forum 5:179\u2013188","journal-title":"Comput Graph Forum"},{"key":"2694_CR30","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1111\/j.1467-8659.1987.tb00340.x","volume":"6","author":"HR Arabnia","year":"1987","unstructured":"Arabnia HR, Oliver MA (1987) Arbitrary rotation of raster images with SIMD machine architectures. Comput Graph Forum 6:3\u201311","journal-title":"Comput Graph Forum"},{"issue":"5","key":"2694_CR31","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1093\/comjnl\/30.5.425","volume":"30","author":"HR Arabnia","year":"1987","unstructured":"Arabnia HR, Oliver MA (1987) A transputer network for the arbitrary rotation of digitised images. Comput J 30(5):425\u2013432","journal-title":"Comput J"},{"key":"2694_CR32","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1111\/j.1467-8659.1989.tb00448.x","volume":"8","author":"HR Arabnia","year":"1989","unstructured":"Arabnia HR, Oliver MA (1989) A transputer network for fast operations on digitised images. Comput Graph Forum 8:3\u201311","journal-title":"Comput Graph Forum"},{"issue":"2","key":"2694_CR33","doi-asserted-by":"publisher","first-page":"188","DOI":"10.1016\/0743-7315(90)90028-N","volume":"10","author":"HR Arabnia","year":"1990","unstructured":"Arabnia HR (1990) A parallel algorithm for the arbitrary rotation of digitized images using process-and-data-decomposition approach. J Parallel Distrib Comput 10(2):188\u2013192","journal-title":"J Parallel Distrib Comput"},{"issue":"8","key":"2694_CR34","doi-asserted-by":"publisher","first-page":"707","DOI":"10.1016\/S0140-3664(96)01104-8","volume":"19","author":"HR Arabnia","year":"1996","unstructured":"Arabnia HR (1996) Distributed stereo-correlation algorithm. Comput Commun 19(8):707\u2013711","journal-title":"Comput Commun"},{"key":"2694_CR35","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1007\/BF00130109","volume":"10","author":"HR Arabnia","year":"1996","unstructured":"Arabnia HR, Bhandarkar SM (1996) Parallel stereocorrelation on a reconfigurable multi-ring network. J Supercomput 10:243\u2013269","journal-title":"J Supercomput"},{"issue":"1\u20132","key":"2694_CR36","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1023\/A:1019119117297","volume":"10","author":"HR Arabnia","year":"1998","unstructured":"Arabnia HR, Taha TR (1998) A parallel numerical algorithm on a reconfigurable multi-ring network. Telecommun Syst 10(1\u20132):185\u2013202","journal-title":"Telecommun Syst"},{"key":"2694_CR37","doi-asserted-by":"crossref","unstructured":"Ziabari AK, Abell\u00e1n JL, Ma Y, Joshi A, Kaeli D (2015) Asymmetric NoC architectures for GPU systems. In: Proceedings of the International Symposium on Networks-on-Chip (NoCs), pp 25:1\u201325:8","DOI":"10.1145\/2786572.2786596"},{"key":"2694_CR38","doi-asserted-by":"crossref","unstructured":"Zhao X, Ma S, Liu Y, Eeckhout L, Wang Z (2016) A low-cost conflict-free NoC for GPGPUs. In: Proceedings of the Design Automation Conference (DAC), pp 34:1\u201334:6","DOI":"10.1145\/2897937.2897963"},{"key":"2694_CR39","doi-asserted-by":"crossref","unstructured":"Cheng X, Zhao Y, Zhao H, Xie Y (2018) Packet pump: overcoming network bottleneck in on-chip interconnects for GPGPUs. In: Proceedings of the Design Automation Conference (DAC), pp 84:1\u201384:6","DOI":"10.1109\/DAC.2018.8465889"},{"key":"2694_CR40","doi-asserted-by":"crossref","unstructured":"Aguilera P, Morrow K, Kim NS (2014) Fair share: allocation of GPU resources for both performance and fairness. In: The 32nd IEEE International Conference on Computer Design, ICCD","DOI":"10.1109\/ICCD.2014.6974717"},{"key":"2694_CR41","doi-asserted-by":"crossref","unstructured":"Wang H, Luo F, Ibrahim M, Kayiran O, Jog A (2018) Efficient and fair multi-programming in GPUs via effective bandwidth management. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 247\u2013258","DOI":"10.1109\/HPCA.2018.00030"},{"key":"2694_CR42","doi-asserted-by":"crossref","unstructured":"Ausavarungnirun R, Landgraf J, Miller V, Ghose S, Gandhi J, Rossbach CJ, Mutlu O (2017) Mosaic: a GPU memory manager with application-transparent support for multiple page sizes. In: Proceedings of the International Symposium on Microarchitecture (MICRO), pp 136\u2013150","DOI":"10.1145\/3123939.3123975"},{"key":"2694_CR43","doi-asserted-by":"crossref","unstructured":"Dai H, Lin Z, Li C, Zhao C, Wang F, Zheng N, Zhou H (2018) Accelerate GPU concurrent kernel execution by mitigating memory pipeline stalls. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 208\u2013220","DOI":"10.1109\/HPCA.2018.00027"},{"key":"2694_CR44","doi-asserted-by":"crossref","unstructured":"Liu Y, Yu Z, Eeckhout L, Reddi VJ, Luo Y, Wang X, Wang Z, Xu C (2016) Barrier-aware warp scheduling for throughput processors. In: Proceedings of the International Conference on Supercomputing (ICS), pp 42:1\u201342:12","DOI":"10.1145\/2925426.2926267"},{"key":"2694_CR45","doi-asserted-by":"crossref","unstructured":"Jog A, Kayiran O, Mishra AK, andemir MT, Mutlu O, Iyer R, Das CR (2013) Orchestrated scheduling and prefetching for GPGPUs. In: ACM SIGARCH Computer Architecture News, vol 41, pp 332\u2013343. ACM","DOI":"10.1145\/2508148.2485951"},{"key":"2694_CR46","doi-asserted-by":"crossref","unstructured":"Wang B, Zhu Y, Yu W (2016) OAWS: memory occlusion aware warp scheduling. In: Proceedings of the International Conference on Parallel Architecture and Compilation Techniques (PACT), pp 45\u201355","DOI":"10.1145\/2967938.2967947"},{"key":"2694_CR47","doi-asserted-by":"crossref","unstructured":"Rogers TG, O\u2019Connor M, Aamodt TM (2012) Cache-conscious wavefront scheduling. In: Proceedings of the International Symposium on Microarchitecture (MICRO), pp 72\u201383","DOI":"10.1109\/MICRO.2012.16"},{"key":"2694_CR48","unstructured":"Lee S-Y, Arunkumar A, Wu C-J (2015) CAWA: coordinated warp scheduling and cache prioritization for critical warp acceleration of GPGPU workloads. In: Proceedings of the International Symposium on Computer Architecture (ISCA), pp 515\u2013527"},{"key":"2694_CR49","doi-asserted-by":"crossref","unstructured":"Xie X, Liang Y, Wang Y, Sun G, Wang T (2015) Coordinated static and dynamic cache bypassing for GPUs. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 76\u201388","DOI":"10.1109\/HPCA.2015.7056023"},{"key":"2694_CR50","doi-asserted-by":"crossref","unstructured":"Jia W, Shaw KA, Martonosi M (2014) MRPB: memory request prioritization for massively parallel processors. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 272\u2013283","DOI":"10.1109\/HPCA.2014.6835938"},{"key":"2694_CR51","doi-asserted-by":"crossref","unstructured":"Jeon H, Ravi GS, Kim NS, Annavaram M (2015) GPU register file virtualization. In: Proceedings of the International Symposium on Microarchitecture (MICRO), pp 420\u2013432","DOI":"10.1145\/2830772.2830784"},{"key":"2694_CR52","doi-asserted-by":"crossref","unstructured":"Abdel-Majeed M, Annavaram M (2013) Warped register file: a power efficient register file for GPGPUs. In: Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp 412\u2013423","DOI":"10.1109\/HPCA.2013.6522337"},{"key":"2694_CR53","doi-asserted-by":"crossref","unstructured":"Jing N, Shen Y, Lu Y, Ganapathy S, Mao Z, Guo M, Canal R, Liang X (2013) An energy-efficient and scalable eDRAM-based register file architecture for GPGPU. In: Proceedings of the International Symposium on Computer Architecture (ISCA), pp 344\u2013355","DOI":"10.1145\/2508148.2485952"},{"key":"2694_CR54","unstructured":"Yoon M\u00a0K, Kim K, Lee S, Ro WW, Annavaram M (2016) Virtual thread: maximizing thread-level parallelism beyond GPU scheduling limit. In: Proceedings of the International Symposium on Computer Architecture (ISCA), pp 609\u2013621"},{"key":"2694_CR55","doi-asserted-by":"crossref","unstructured":"Vijaykumar N, Hsieh K, Pekhimenko G, Khan S, Shrestha A, Ghose S, Jog A, Gibbons PB, Mutlu O (2016) Zorua: a holistic approach to resource virtualization in GPUs. In: Proceedings of the International Symposium on Microarchitecture (MICRO), pp 1\u201314","DOI":"10.1109\/MICRO.2016.7783718"},{"key":"2694_CR56","doi-asserted-by":"crossref","unstructured":"Arunkumar A, Bolotin E, Cho B, Milic U, Ebrahimi E, Villa O, Jaleel A, Wu C-J, Nellans D (2017) MCM-GPU: multi-chip-module GPUs for continued performance scalability. In: Proceedings of the International Symposium on Computer Architecture (ISCA), pp 320\u2013332","DOI":"10.1145\/3140659.3080231"},{"key":"2694_CR57","doi-asserted-by":"crossref","unstructured":"Milic U, Villa O, Bolotin E, Arunkumar A, Ebrahimi E, Jaleel A, Ramirez A, Nellans D (2017) Beyond the socket: NUMA-aware GPUs. In: Proceedings of the International Symposium on Microarchitecture (MICRO), pp 123\u2013135","DOI":"10.1145\/3123939.3124534"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-018-2694-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-018-2694-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-018-2694-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,10]],"date-time":"2022-09-10T23:23:09Z","timestamp":1662852189000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-018-2694-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,1,25]]},"references-count":57,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2019,8]]}},"alternative-id":["2694"],"URL":"https:\/\/doi.org\/10.1007\/s11227-018-2694-x","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2019,1,25]]},"assertion":[{"value":"25 January 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}