@inproceedings{auerbach_compiler_2012,
    address = {San Francisco, California},
    title = {A compiler and runtime for heterogeneous computing},
    isbn = {978-1-4503-1199-1},
    url = {https://dl.acm.org/doi/10.1145/2228360.2228411},
    doi = {10.1145/2228360.2228411},
    abstract = {Heterogeneous systems show a lot of promise for extracting high performance by combining the benefits of conventional architectures with specialized accelerators in the form of graphics processors (GPUs) and reconfigurable hardware (FPGAs). Extracting this performance often entails programming in disparate languages and models, making it hard for a programmer to work equally well on all aspects of an application. Further, relatively little attention is paid to co-execution—the problem of orchestrating program execution using multiple distinct computational elements that work seamlessly together.},
    language = {en},
    urldate = {2024-07-16},
    booktitle = {Proceedings of the 49th {Annual} {Design} {Automation} {Conference}},
    publisher = {ACM},
    author = {Auerbach, Joshua and Bacon, David F. and Burcea, Ioana and Cheng, Perry and Fink, Stephen J. and Rabbah, Rodric and Shukla, Sunil},
    month = jun,
    year = {2012},
    pages = {271--276},
    file = {Auerbach et al. - 2012 - A compiler and runtime for heterogeneous computing.pdf:/home/ricardo/Zotero/storage/LCRKBKYC/Auerbach et al. - 2012 - A compiler and runtime for heterogeneous computing.pdf:application/pdf},
}

@article{y_y_2014,
    title = {Research and progress on heterogeneous parallel programming models [异构并行编程模型研究与进展]},
    volume = {25},
    issn = {1000-9825},
    url = {https://kns.cnki.net/kcms2/article/abstract?v=Dm4VI7mKrXMfvAZUNMUgX8reCA9i2gYJadV_oeNwrIXov3W3N3cznGwXoHcCBEa4U5IUycTU9RRAyeLGki8bNkCldPuZc4yQ0E68KW7fvo9-mj97g39uJA==&uniplatform=NZKPT&language=gb},
    doi = {10.13328/j.cnki.jos.004608},
    abstract = {In recent years, heterogeneous system hardware has developed rapidly. To address the resulting programming and execution-efficiency problems, heterogeneous parallel programming models have been widely used and studied. This paper surveys the latest research on heterogeneous parallel programming models from two perspectives, programming interfaces and compiler/runtime support systems, which provide solutions to the technical challenges posed by heterogeneous architectures and upper-level applications. Finally, in light of the current state of research and the development of heterogeneous systems, future directions for heterogeneous parallel programming models are proposed.},
    language = {Chinese},
    number = {7},
    journal = {Journal of Software (软件学报)},
    author = {Liu, Ying and Lü, Fang and Wang, Lei and Chen, Li and Cui, Huimin and Feng, Xiaobing},
    year = {2014},
    keywords = {GPU, heterogeneous parallel programming model, heterogeneous system, programming interface, compilation, runtime system},
    pages = {1459--1475},
    file = {异构并行编程模型研究与进展_刘颖:/home/ricardo/Zotero/storage/GJDAISVR/异构并行编程模型研究与进展_刘颖.pdf:application/pdf},
}

@article{cai_coala_2024,
    title = {{COALA}: {A} {Compiler}-{Assisted} {Adaptive} {Library} {Routines} {Allocation} {Framework} for {Heterogeneous} {Systems}},
    volume = {73},
    copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
    issn = {0018-9340, 1557-9956, 2326-3814},
    shorttitle = {{COALA}},
    url = {https://ieeexplore.ieee.org/document/10495065/},
    doi = {10.1109/TC.2024.3385269},
    language = {en},
    number = {7},
    urldate = {2024-10-14},
    journal = {IEEE Transactions on Computers},
    author = {Cai, Qinyun and Tan, Guanghua and Yang, Wangdong and He, Xianhao and Yan, Yuwei and Li, Keqin and Li, Kenli},
    month = jul,
    year = {2024},
    pages = {1724--1737},
    file = {PDF:/home/ricardo/Zotero/storage/LVBVKYIS/Cai et al. - 2024 - COALA A Compiler-Assisted Adaptive Library Routines Allocation Framework for Heterogeneous Systems.pdf:application/pdf},
}

@article{dubach_compiling_nodate,
    title = {Compiling a high-level language for {GPUs}: (via language support for architectures and compilers)},
    abstract = {Languages such as OpenCL and CUDA offer a standard interface for general-purpose programming of GPUs. However, with these languages, programmers must explicitly manage numerous low-level details involving communication and synchronization. This burden makes programming GPUs difficult and error-prone, rendering these powerful devices inaccessible to most programmers.},
    language = {en},
    author = {Dubach, Christophe and Cheng, Perry and Rabbah, Rodric and Bacon, David F and Fink, Stephen J},
    file = {PDF:/home/ricardo/Zotero/storage/NXNGV5KB/Dubach et al. - Compiling a high-level language for GPUs (via language support for architectures and compilers).pdf:application/pdf},
}

@article{auerbach_lime_nodate,
    title = {Lime: a {Java}-compatible and synthesizable language for heterogeneous architectures},
    abstract = {The halt in clock frequency scaling has forced architects and language designers to look elsewhere for continued improvements in performance. We believe that extracting maximum performance will require compilation to highly heterogeneous architectures that include reconfigurable hardware.},
    language = {en},
    author = {Auerbach, Joshua and Bacon, David F and Cheng, Perry and Rabbah, Rodric},
    file = {PDF:/home/ricardo/Zotero/storage/F7TKF8C2/Auerbach et al. - Lime a Java-compatible and synthesizable language for heterogeneous architectures.pdf:application/pdf},
}

@article{besard_effective_2019,
    title = {Effective {Extensible} {Programming}: {Unleashing} {Julia} on {GPUs}},
    volume = {30},
    copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
    issn = {1045-9219, 1558-2183, 2161-9883},
    shorttitle = {Effective {Extensible} {Programming}},
    url = {https://ieeexplore.ieee.org/document/8471188/},
    doi = {10.1109/TPDS.2018.2872064},
    abstract = {GPUs and other accelerators are popular devices for accelerating compute-intensive, parallelizable applications. However, programming these devices is a difficult task. Writing efficient device code is challenging, and is typically done in a low-level programming language. High-level languages are rarely supported, or do not integrate with the rest of the high-level language ecosystem. To overcome this, we propose compiler infrastructure to efficiently add support for new hardware or environments to an existing programming language. We evaluate our approach by adding support for NVIDIA GPUs to the Julia programming language. By integrating with the existing compiler, we significantly lower the cost to implement and maintain the new compiler, and facilitate reuse of existing application code. Moreover, use of the high-level Julia programming language enables new and dynamic approaches for GPU programming. This greatly improves programmer productivity, while maintaining application performance similar to that of the official NVIDIA CUDA toolkit.},
    language = {en},
    number = {4},
    urldate = {2024-10-20},
    journal = {IEEE Transactions on Parallel and Distributed Systems},
    author = {Besard, Tim and Foket, Christophe and De Sutter, Bjorn},
    month = apr,
    year = {2019},
    pages = {827--841},
    file = {PDF:/home/ricardo/Zotero/storage/7VH3HSRD/Besard et al. - 2019 - Effective Extensible Programming Unleashing Julia on GPUs.pdf:application/pdf},
}

@article{faingnaert_flexible_2022,
    title = {Flexible {Performant} {GEMM} {Kernels} on {GPUs}},
    volume = {33},
    copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
    issn = {1045-9219, 1558-2183, 2161-9883},
    url = {https://ieeexplore.ieee.org/document/9655458/},
    doi = {10.1109/TPDS.2021.3136457},
    abstract = {General Matrix Multiplication or GEMM kernels take centre place in high performance computing and machine learning. Recent NVIDIA GPUs include GEMM accelerators, such as NVIDIA’s Tensor Cores. Their exploitation is hampered by the two-language problem: it requires either low-level programming which implies low programmer productivity or using libraries that only offer a limited set of components. Because rephrasing algorithms in terms of established components often introduces overhead, the libraries’ lack of flexibility limits the freedom to explore new algorithms. Researchers using GEMMs can hence not enjoy programming productivity, high performance, and research flexibility at once. In this paper we solve this problem. We present three sets of abstractions and interfaces to program GEMMs within the scientific Julia programming language. The interfaces and abstractions are co-designed for researchers’ needs and Julia’s features to achieve sufficient separation of concerns and flexibility to easily extend basic GEMMs in many different ways without paying a performance price. Comparing our GEMMs to state-of-the-art libraries cuBLAS and CUTLASS, we demonstrate that our performance is in the same ballpark of the libraries, and in some cases even exceeds it, without having to write a single line of code in CUDA C++ or assembly, and without facing flexibility limitations.},
    language = {en},
    number = {9},
    urldate = {2024-10-20},
    journal = {IEEE Transactions on Parallel and Distributed Systems},
    author = {Faingnaert, Thomas and Besard, Tim and De Sutter, Bjorn},
    month = sep,
    year = {2022},
    pages = {2230--2248},
    file = {PDF:/home/ricardo/Zotero/storage/JUHZTABS/Faingnaert et al. - 2022 - Flexible Performant GEMM Kernels on GPUs.pdf:application/pdf},
}

@inproceedings{tiotto_experiences_2024,
    address = {Edinburgh, United Kingdom},
    title = {Experiences {Building} an {MLIR}-{Based} {SYCL} {Compiler}},
    copyright = {https://doi.org/10.15223/policy-029},
    isbn = {9798350395099},
    url = {https://ieeexplore.ieee.org/document/10444866/},
    doi = {10.1109/CGO57630.2024.10444866},
    abstract = {Similar to other programming models, compilers for SYCL, the open programming model for heterogeneous computing based on C++, would benefit from access to higher-level intermediate representations. The loss of high-level structure and semantics caused by premature lowering to low-level intermediate representations and the inability to reason about host and device code simultaneously present major challenges for SYCL compilers. The MLIR compiler framework, through its dialect mechanism, allows to model domain-specific, high-level intermediate representations and provides the necessary facilities to address these challenges.},
    language = {en},
    urldate = {2024-10-29},
    booktitle = {2024 {IEEE}/{ACM} {International} {Symposium} on {Code} {Generation} and {Optimization} ({CGO})},
    publisher = {IEEE},
    author = {Tiotto, Ettore and Pérez, Víctor and Tsang, Whitney and Sommer, Lukas and Oppermann, Julian and Lomüller, Victor and Goli, Mehdi and Brodman, James},
    month = mar,
    year = {2024},
    pages = {399--410},
    file = {PDF:/home/ricardo/Zotero/storage/LJBFS32J/Tiotto et al. - 2024 - Experiences Building an MLIR-Based SYCL Compiler.pdf:application/pdf},
}

@article{perez_user-driven_2023,
    title = {User-driven {Online} {Kernel} {Fusion} for {SYCL}},
    volume = {20},
    issn = {1544-3566, 1544-3973},
    url = {https://dl.acm.org/doi/10.1145/3571284},
    doi = {10.1145/3571284},
    abstract = {Heterogeneous programming models are becoming increasingly popular to support the ever-evolving hardware architectures, especially for new and emerging specialized accelerators optimizing specific tasks. While such programs provide performance portability of the existing applications across various heterogeneous architectures to some extent, short-running device kernels can affect an application performance due to overheads of data transfer, synchronization, and kernel launch. While in applications with one or two short-running kernels the overhead can be negligible, it can be noticeable when these short-running kernels dominate the overall number of kernels in an application, as it is the case in graph-based neural network models, where there are several small memory-bound nodes alongside few large compute-bound nodes.

To reduce the overhead, combining several kernels into a single, more optimized kernel is an active area of research. However, this task can be time-consuming and error-prone given the huge set of potential combinations. This can push programmers to seek a tradeoff between (a) task-specific kernels with low overhead but hard to maintain and (b) smaller modular kernels with higher overhead but easier to maintain. While there are DSL-based approaches, such as those provided for machine learning frameworks, which offer the possibility of such a fusion, they are limited to a particular domain and exploit specific knowledge of that domain and, as a consequence, are hard to port elsewhere. This study explores the feasibility of a user-driven kernel fusion through an extension to the SYCL API to address the automation of kernel fusion. The proposed solution requires programmers to define the subgraph regions that are potentially suitable for fusion without any modification to the kernel code or the function signature. We evaluate the performance benefit of our approach on common neural networks and study the performance improvement in detail.},
    language = {en},
    number = {2},
    urldate = {2024-10-29},
    journal = {ACM Transactions on Architecture and Code Optimization},
    author = {Pérez, Víctor and Sommer, Lukas and Lomüller, Victor and Narasimhan, Kumudha and Goli, Mehdi},
    month = jun,
    year = {2023},
    pages = {1--25},
    file = {PDF:/home/ricardo/Zotero/storage/MRYW3TTN/Pérez et al. - 2023 - User-driven Online Kernel Fusion for SYCL.pdf:application/pdf},
}

@incollection{hutchison_accull_2012,
    address = {Berlin, Heidelberg},
    title = {{accULL}: {An} {OpenACC} {Implementation} with {CUDA} and {OpenCL} {Support}},
    volume = {7484},
    isbn = {978-3-642-32819-0 978-3-642-32820-6},
    shorttitle = {{accULL}},
    url = {http://link.springer.com/10.1007/978-3-642-32820-6_86},
    abstract = {The irruption in the HPC scene of hardware accelerators, like GPUs, has made available unprecedented performance to developers. However, even expert developers may not be ready to exploit the new complex processor hierarchies. We need to find a way to leverage the programming effort in these devices at programming language level, otherwise, developers will spend most of their time focusing on device-specific code instead of implementing algorithmic enhancements. The recent advent of the OpenACC standard for heterogeneous computing represents an effort in this direction. This initiative, combined with future releases of the OpenMP standard, will converge into a fully heterogeneous framework that will cope with the programming requirements of future computer architectures. In this work we present accULL, a novel implementation of the OpenACC standard, based on the combination of a source to source compiler and a runtime library. To our knowledge, our approach is the first providing support for both OpenCL and CUDA platforms under this new standard.},
    language = {en},
    urldate = {2024-11-06},
    booktitle = {Euro-{Par} 2012 {Parallel} {Processing}},
    publisher = {Springer Berlin Heidelberg},
    author = {Reyes, Ruymán and López-Rodríguez, Iván and Fumero, Juan J. and De Sande, Francisco},
    editor = {Kaklamanis, Christos and Papatheodorou, Theodore and Spirakis, Paul G.},
    year = {2012},
    doi = {10.1007/978-3-642-32820-6_86},
    note = {Series Title: Lecture Notes in Computer Science},
    pages = {871--882},
    file = {PDF:/home/ricardo/Zotero/storage/I3TR6EWF/Reyes et al. - 2012 - accULL An OpenACC Implementation with CUDA and OpenCL Support.pdf:application/pdf},
}

@incollection{malawski_sycl-bench_2020,
    address = {Cham},
    title = {{SYCL}-{Bench}: {A} {Versatile} {Cross}-{Platform} {Benchmark} {Suite} for {Heterogeneous} {Computing}},
    volume = {12247},
    isbn = {978-3-030-57674-5 978-3-030-57675-2},
    shorttitle = {{SYCL}-{Bench}},
    url = {https://link.springer.com/10.1007/978-3-030-57675-2_39},
    abstract = {The SYCL standard promises to enable high productivity in heterogeneous programming of a broad range of parallel devices, including multicore CPUs, GPUs, and FPGAs. Its modern and expressive C++ API design, as well as flexible task graph execution model give rise to ample optimization opportunities at run-time, such as the overlapping of data transfers and kernel execution. However, it is not clear which of the existing SYCL implementations perform such scheduling optimizations, and to what extent. Furthermore, SYCL’s high level of abstraction may raise concerns about sacrificing performance for ease of use. Benchmarks are required to accurately assess the performance behavior of high-level programming models such as SYCL. To this end, we present SYCL-Bench, a versatile benchmark suite for device characterization and runtime benchmarking, written in SYCL. We experimentally demonstrate the effectiveness of SYCL-Bench by performing device characterization of the NVIDIA TITAN X GPU, and by evaluating the efficiency of the hipSYCL and ComputeCpp SYCL implementations.},
    language = {en},
    urldate = {2024-11-11},
    booktitle = {Euro-{Par} 2020: {Parallel} {Processing}},
    publisher = {Springer International Publishing},
    author = {Lal, Sohan and Alpay, Aksel and Salzmann, Philip and Cosenza, Biagio and Hirsch, Alexander and Stawinoga, Nicolai and Thoman, Peter and Fahringer, Thomas and Heuveline, Vincent},
    editor = {Malawski, Maciej and Rzadca, Krzysztof},
    year = {2020},
    doi = {10.1007/978-3-030-57675-2_39},
    note = {Series Title: Lecture Notes in Computer Science},
    pages = {629--644},
    file = {PDF:/home/ricardo/Zotero/storage/7YQEHBJJ/Lal et al. - 2020 - SYCL-Bench A Versatile Cross-Platform Benchmark Suite for Heterogeneous Computing.pdf:application/pdf},
}

@inproceedings{dagli_shared_2024,
    address = {Edinburgh, United Kingdom},
    title = {Shared {Memory}-contention-aware {Concurrent} {DNN} {Execution} for {Diversely} {Heterogeneous} {System}-on-{Chips}},
    isbn = {9798400704352},
    url = {https://dl.acm.org/doi/10.1145/3627535.3638502},
    doi = {10.1145/3627535.3638502},
    abstract = {Two distinguishing features of state-of-the-art mobile and autonomous systems are: 1) There are often multiple workloads, mainly deep neural network (DNN) inference, running concurrently and continuously. 2) They operate on shared memory System-on-Chips (SoC) that embed heterogeneous accelerators tailored for specific operations. State-of-the-art systems lack efficient performance and resource management techniques necessary to either maximize total system throughput or minimize end-to-end workload latency. In this work, we propose HaX-CoNN, a novel scheme that characterizes and maps layers in concurrently executing DNN inference workloads to a diverse set of accelerators within an SoC. Our scheme uniquely takes per-layer execution characteristics, shared memory (SM) contention, and inter-accelerator transitions into account to find optimal schedules. We evaluate HaX-CoNN on NVIDIA Orin, NVIDIA Xavier, and Qualcomm Snapdragon 865 SoCs. Our experimental results indicate that HaX-CoNN can minimize memory contention by up to 45\% and improve total latency and throughput by up to 32\% and 29\%, respectively, compared to the state-of-the-art.},
    language = {en},
    urldate = {2024-11-18},
    booktitle = {Proceedings of the 29th {ACM} {SIGPLAN} {Annual} {Symposium} on {Principles} and {Practice} of {Parallel} {Programming}},
    publisher = {ACM},
    author = {Dagli, Ismet and Belviranli, Mehmet E.},
    month = mar,
    year = {2024},
    pages = {243--256},
    file = {PDF:/home/ricardo/Zotero/storage/4UPZN9QQ/Dagli and Belviranli - 2024 - Shared Memory-contention-aware Concurrent DNN Execution for Diversely Heterogeneous System-on-Chips.pdf:application/pdf},
}

@article{zhou_deeptm_2024,
    title = {{DeepTM}: {Efficient} {Tensor} {Management} in {Heterogeneous} {Memory} for {DNN} {Training}},
    volume = {35},
    copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
    issn = {1045-9219, 1558-2183, 2161-9883},
    shorttitle = {{DeepTM}},
    url = {https://ieeexplore.ieee.org/document/10606082/},
    doi = {10.1109/TPDS.2024.3431910},
    abstract = {Deep Neural Networks (DNNs) have gained widespread adoption in diverse fields, including image classification, object detection, and natural language processing. However, training large-scale DNN models often encounters significant memory bottlenecks, which ask for efficient management of extensive tensors. Heterogeneous memory system, which combines persistent memory (PM) modules with traditional DRAM, offers an economically viable solution to address tensor management challenges during DNN training. However, existing memory management methods on heterogeneous memory systems often lead to low PM access efficiency, low bandwidth utilization, and incomplete analysis of model characteristics. To overcome these hurdles, we introduce an efficient tensor management approach, DeepTM, tailored for heterogeneous memory to alleviate memory bottlenecks during DNN training. DeepTM employs page-level tensor aggregation to enhance PM read and write performance and executes contiguous page migration to increase memory bandwidth. Through an analysis of tensor access patterns and model characteristics, we quantify the overall performance and transform the performance optimization problem into the framework of Integer Linear Programming. Additionally, we achieve tensor heat recognition by dynamically adjusting the weights of four key tensor characteristics and develop a global optimization strategy using Deep Reinforcement Learning. To validate the efficacy of our approach, we implement and evaluate DeepTM, utilizing the TensorFlow framework running on a PM-based heterogeneous memory system. The experimental results demonstrate that DeepTM achieves performance improvements of up to 36\% and 49\% compared to the current state-of-the-art memory management strategies AutoTM and Sentinel, respectively.},
    language = {en},
    number = {11},
    urldate = {2024-11-18},
    journal = {IEEE Transactions on Parallel and Distributed Systems},
    author = {Zhou, Haoran and Rang, Wei and Chen, Hongyang and Zhou, Xiaobo and Cheng, Dazhao},
    month = nov,
    year = {2024},
    pages = {1920--1935},
    file = {PDF:/home/ricardo/Zotero/storage/QFNGXW66/Zhou et al. - 2024 - DeepTM Efficient Tensor Management in Heterogeneous Memory for DNN Training.pdf:application/pdf},
}

@article{yao_memory-constraint-aware_nodate,
    title = {A {Memory}-{Constraint}-{Aware} {List} {Scheduling} {Algorithm} for {Memory}-{Constraint} {Heterogeneous} {Muti}-{Processor} {System}},
    abstract = {An effective scheduling algorithm is vital for the execution efficiency of applications on Heterogeneous Muti-Processor System (HMPS), especially Memory-Constraint Heterogeneous Muti-Processor System (MCHMPS). Stringent local and external memory constraints have significant impact on the execution performance of applications executed on MCHMPS, predictability is also a critical factor for task scheduling on MCHMPS. Therefore, a novel list scheduling algorithm termed Memory-constraint-aware Improved Predict Priority and Optimistic Processor Selection Scheduling (MIPPOSS), essentially a heuristic search optimization algorithm, is proposed in this paper. In MIPPOSS, a predictive approach is applied for task prioritization and processor selection, and a novel memory-constraint-aware approach is employed in the processor selection phase. MIPPOSS has polynomial complexity and produces better results for application scheduling on target architecture. Randomly generated DAGs and 3 real-world applications experiments, including Cybershake, LIGO, and Montage, show that MIPPOSS outperforms the other five competing algorithms by a large margin.},
    language = {en},
    author = {Yao, Yu and Song, Yukun and Huang, Ying and Ni, Wei and Zhang, Duoli},
    file = {PDF:/home/ricardo/Zotero/storage/7UL7SXWV/Yao et al. - A Memory-Constraint-Aware List Scheduling Algorithm for Memory-Constraint Heterogeneous Muti-Process.pdf:application/pdf},
}