# This BibTeX file is provided as an example/test case for the bb4dw DokuWiki plugin.
# This file intentionally uses various BibTeX formatting and styles!
@TECHREPORT{Cann89,
AUTHOR = {D.C. Cann},
EMAIL = {},
TITLE = {{C}ompilation {T}echniques for {H}igh {P}erformance {A}pplicative
{C}omputation},
INSTITUTION = {Lawrence Livermore National Laboratory},
ADDRESS = {LLNL, Livermore California},
TYPE = {},
NUMBER = {CS-89-108},
YEAR = 1989,
NOTE = {},
FTP = {},
KEYWORDS = {},
CONTENTS = {A comprehensive introduction to SISAL's internal structure.
Includes a detailed description of the enhanced "update in place"
mechanism used by SISAL.},
TOPICS = {Sisal}
}
@INPROCEEDINGS{IPDPS2018, author={T. {Macht} and C. {Grelck}}, booktitle={2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, title={SAC Goes Cluster: Fully Implicit Distributed Computing}, year={2019}, volume={}, number={}, pages={996-1006},}
@ARTICLE{ ScholzJFP03,
AUTHOR = {Sven-Bodo Scholz},
TITLE = {{Single} {Assignment} {C} ---
Efficient Support for High-Level Array Operations in a Functional Setting},
JOURNAL = {Journal of Functional Programming},
YEAR = {2003},
VOLUME = {13},
NUMBER = {6},
PAGES = {1005--1059},
NOTE = {},
CONTENTS = {Canonical reference for SAC},
DOI = {10.1017/S0956796802004458},
ABSTRACT = {This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as Apl. We use Single Assignment C (SaC), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notable of these is With Loop Folding, an optimization technique for eliminating intermediate arrays.},
CATEGORY = {Journal},
TOPICS = {SAC}
}
@ARTICLE{ GrelSchoIJPP06,
AUTHOR = {Clemens Grelck and Sven-Bodo Scholz},
TITLE = {{SAC}: A Functional Array Language for Efficient Multithreaded Execution},
JOURNAL = {International Journal of Parallel Programming},
YEAR = 2006,
VOLUME = {34},
NUMBER = {4},
PAGES = {383--427},
NOTE = {},
CONTENTS = {[ISSN: 0885-7458 (Paper) 1573-7640 (Online)]},
DOI = {10.1007/s10766-006-0018-x},
ABSTRACT = {We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.},
TOPICS = {SAC},
CATEGORY = {Journal},
AFFIL = {ctca}
}
@misc{Sakharnykh2017,
author = "{Nikolay Sakharnykh}",
title = "{Maximizing Unified Memory Performance in CUDA}",
year = "2017",
howpublished = "\url{https://devblogs.nvidia.com/maximizing-unified-memory-performance-cuda/}",
note = "[Online; 29-May-2019]"
}
@misc{cuda10.1doc,
author = "{NVIDIA Corporation}",
title = "{CUDA Toolkit Documentation v10.1.168}",
year = "2019",
howpublished = "\url{https://web.archive.org/web/20190523173815/https://docs.nvidia.com/cuda/archive/10.1/}",
note = "[WayBack Machine; 02-Nov-2019]"
}
@misc{cudaConcurrency2011,
author = "{Steve Rennich}",
title = "{CUDA C/C++ Streams and Concurrency}",
year = "2011",
howpublished = "\url{http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf}",
note = "[Online; 03-Nov-2019]"
}
@misc{cudaUnifiedMem2018,
author = "{Nikolay Sakharnykh}",
Title = "{Everything You Need To Know About Unified Memory}",
year = "2018",
howpublished = "\url{http://on-demand.gputechconf.com/gtc/2018/presentation/s8430-everything-you-need-to-know-about-unified-memory.pdf}",
note = "[Online; 03-Nov-2019]"
}
@article{HARTMANN2019304,
title = "GPUart - An application-based limited preemptive GPU real-time scheduler for embedded systems",
journal = "Journal of Systems Architecture",
volume = "97",
pages = "304---319",
year = "2019",
issn = "1383-7621",
doi = "https://doi.org/10.1016/j.sysarc.2018.10.005",
author = "Christoph Hartmann and Ulrich Margull",
keywords = "Real-time scheduling, Limited preemption, Graphics processing unit (GPU), GPU resource management, Embedded systems, Automotive",
abstract = "Emerging technologies like autonomous driving entail computational intense software solutions. More and more companies accelerate their embedded applications by General Purpose Computing on the Graphics Processing Unit (GPGPU), in order to overcome those computational demands. Unfortunately, Graphics Processing Units (GPUs) severely lack real-time capability, for example controllable preemption support, which limits their applicability in the embedded domain. We therefore present GPUart, a framework for GPU real-time scheduling. GPUart focuses on embedded systems and requires neither hardware nor driver stack extensions. We propose a software-only approach for preemption, based on the fixed preemption point strategy. In contrast to prior work, GPUart enables preemption inside a thread block by adding fixed preemption points. We further propose a portable high-level resource management concept to enable gang scheduling on GPUs. GPUart can schedule GPU workload either under the Gang-Earliest Deadline First (EDF) or Gang-Fixed Task Priority (FTP) policy. A case-study on Nvidia Tegra X1, using real-world engine management applications from Audi AG and Continental Automotive GmbH, shows that only up to 0.28% additional global memory is required to enable interruptible thread blocks. GPUart reduces the worst observed response times by a factor of up to 221, leading to response times without deadline misses."
}
@article{grelck_2005,
author={Grelck, Clemens},
title={Shared memory multiprocessor support for functional array processing in SAC},
volume={15},
doi={10.1017/S0956796805005538},
number={3},
journal={Journal of Functional Programming},
publisher={Cambridge University Press},
year={2005},
pages={353–401}
}
@InProceedings{ScholzIFL1997,
author = {Sven-Bodo Scholz},
title = {With-loop-folding in Sac --- Condensing Consecutive Array Operations},
booktitle = {Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers},
year = {1998},
editor = {Chris Clack and Tony Davie and Kevin Hammond},
volume = {1467},
series = {Lecture Notes in Computer Science},
pages = {72--92},
publisher = {Springer},
abstract = {This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.},
category = {core,design,opt},
doi = {10.1007/BFb0055425},
isbn = {978-3-540-64849-9},
pubaddress = {Berlin, Heidelberg, Germany},
topics = {SAC,Avoiding Temporaries,Implementation of Arrays},
url = {wlf-st-andrews-97.pdf},
}
@inproceedings{jingGPU2011,
author = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
title = {Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler},
booktitle = {6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA},
year = {2011},
pages = {15--24},
publisher = {ACM Press},
doi = {10.1145/1926354.1926359},
}
@inproceedings{jingGPU2009,
author = {Jing Guo and
Jeyarajan Thiyagalingam and
Sven{-}Bodo Scholz},
editor = {Zolt{\'{a}}n Horv{\'{a}}th and
Vikt{\'{o}}ria Zs{\'{o}}k and
Peter Achten and
Pieter W. M. Koopman},
title = {Towards Compiling {SAC} to {CUDA}},
booktitle = {Proceedings of the Tenth Symposium on Trends in Functional Programming,
{TFP} 2009, Kom{\'{a}}rno, Slovakia, June 2-4, 2009},
series = {Trends in Functional Programming},
volume = {10},
pages = {33--48},
publisher = {Intellect},
year = {2009},
timestamp = {Tue, 04 Jun 2013 08:01:28 +0200},
biburl = {https://dblp.org/rec/conf/sfp/GuoTS09.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{emr2018,
author = {Vie\ss{}mann, Hans-Nikolai and \v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
title = {Extended Memory Reuse: An Optimisation for Reducing Memory Allocations},
year = {2018},
isbn = {9781450371438},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/3310232.3310242},
booktitle = {Proceedings of the 30th Symposium on Implementation and Application of Functional Languages},
pages = {107–118},
numpages = {12},
keywords = {memory management, compiler optimisation, reference counting},
location = {Lowell, MA, USA},
series = {IFL 2018}
}
@inproceedings{Halide,
author = {Ragan-Kelley, Jonathan and Barnes, Connelly and Adams, Andrew and Paris, Sylvain and Durand, Fr\'{e}do and Amarasinghe, Saman},
title = {Halide: A Language and Compiler for Optimizing Parallelism, Locality, and Recomputation in Image Processing Pipelines},
year = {2013},
isbn = {9781450320146},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/2491956.2462176},
abstract = {Image processing pipelines combine the challenges of stencil computations and stream programs. They are composed of large graphs of different stencil stages, as well as complex reductions, and stages with global or data-dependent access patterns. Because of their complex structure, the performance difference between a naive implementation of a pipeline and an optimized one is often an order of magnitude. Efficient implementations require optimization of both parallelism and locality, but due to the nature of stencils, there is a fundamental tension between parallelism, locality, and introducing redundant recomputation of shared values. We present a systematic model of the tradeoff space fundamental to stencil pipelines, a schedule representation which describes concrete points in this space for each stage in an image processing pipeline, and an optimizing compiler for the Halide image processing language that synthesizes high performance implementations from a Halide algorithm and a schedule. Combining this compiler with stochastic search over the space of schedules enables terse, composable programs to achieve state-of-the-art performance on a wide range of real image processing pipelines, and across different hardware architectures, including multicores with SIMD, and heterogeneous CPU+GPU execution. From simple Halide programs written in a few hours, we demonstrate performance up to 5x faster than hand-tuned C, intrinsics, and CUDA implementations optimized by experts over weeks or months, for image processing applications beyond the reach of past automatic compilers.},
booktitle = {Proceedings of the 34th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {519–530},
numpages = {12},
keywords = {redundant computation, gpu, compiler, vectorization, image processing, parallelism, autotuning, optimization, domain specific language, locality},
location = {Seattle, Washington, USA},
series = {PLDI '13}
}
@inproceedings{Futhark,
author = {Henriksen, Troels and Serup, Niels G. W. and Elsman, Martin and Henglein, Fritz and Oancea, Cosmin E.},
title = {Futhark: Purely Functional GPU-Programming with Nested Parallelism and in-Place Array Updates},
year = {2017},
isbn = {9781450349888},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/3062341.3062354},
abstract = { Futhark is a purely functional data-parallel array language that offers a machine-neutral programming model and an optimising compiler that generates OpenCL code for GPUs. This paper presents the design and implementation of three key features of Futhark that seek a suitable middle ground with imperative approaches. First, in order to express efficient code inside the parallel constructs, we introduce a simple type system for in-place updates that ensures referential transparency and supports equational reasoning. Second, we furnish Futhark with parallel operators capable of expressing efficient strength-reduced code, along with their fusion rules. Third, we present a flattening transformation aimed at enhancing the degree of parallelism that (i) builds on loop interchange and distribution but uses higher-order reasoning rather than array-dependence analysis, and (ii) still allows further locality-of-reference optimisations. Finally, an evaluation on 16 benchmarks demonstrates the impact of the language and compiler features and shows application-level performance competitive with hand-written GPU code. },
booktitle = {Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {556–571},
numpages = {16},
keywords = {GPGPU, compilers, functional language, parallel},
location = {Barcelona, Spain},
series = {PLDI 2017}
}
@InProceedings{GrelSchoTrojIFL03,
author = {Clemens Grelck and Sven-Bodo Scholz and Kai Trojahner},
title = {With-loop Scalarization: Merging Nested Array Operations},
booktitle = {Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers},
year = {2004},
editor = {Phil Trinder and Greg Michaelson},
volume = {3145},
series = {Lecture Notes in Computer Science},
publisher = {Springer},
category = {design, opt},
doi = {10.1007/978-3-540-27861-0_8},
pubaddress = {Berlin, Heidelberg, Germany},
topics = {SAC,Avoiding Temporaries},
url = {WLSMNAO.pdf},
}
@inproceedings{HiCUDA,
author = {Han, Tianyi David and Abdelrahman, Tarek S.},
title = {{HiCUDA: A High-Level Directive-Based Language for GPU Programming}},
year = {2009},
isbn = {9781605585178},
publisher = {ACM},
address = {New York, NY, USA},
doi = {10.1145/1513895.1513902},
abstract = {The Compute Unified Device Architecture (CUDA) has become a de facto standard for programming NVIDIA GPUs. However, CUDA places on the programmer the burden of packaging GPU code in separate functions, of explicitly managing data transfer between the host memory and various components of the GPU memory, and of manually optimizing the utilization of the GPU memory. Practical experience shows that the programmer needs to make significant code changes, which are often tedious and error-prone, before getting an optimized program. We have designed hiCUDA, a high-level directive-based language for CUDA programming. It allows programmers to perform these tedious tasks in a simpler manner, and directly to the sequential code. Nonetheless, it supports the same programming paradigm already familiar to CUDA programmers. We have prototyped a source-to-source compiler that translates a hiCUDA program to a CUDA program. Experiments using five standard CUDA benchmarks show that the simplicity and flexibility hiCUDA provides come at no expense to performance.},
booktitle = {Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units},
pages = {52–61},
numpages = {10},
keywords = {GPGPU, data parallel programming, CUDA},
location = {Washington, D.C., USA},
series = {GPGPU-2}
}
@INPROCEEDINGS{LIFTIR,
author={M. {Steuwer} and T. {Remmelg} and C. {Dubach}},
booktitle={2017 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)},
title={{LIFT: A functional data-parallel IR for high-performance GPU code generation}},
year={2017},
volume={},
number={},
pages={74-85},
doi={10.1109/CGO.2017.7863730}
}
@article{fitzgerald1996,
author = {Fitzgerald, Steven M. and Oldehoeft, Rodney R.},
title = {Update-in-place Analysis for True Multidimensional Arrays},
journal = {Sci. Program.},
issue_date = {Summer 1996},
volume = {5},
number = {2},
month = Jul,
year = {1996},
issn = {1058-9244},
pages = {147--160},
numpages = {14},
doi = {10.1155/1996/493673},
acmid = {226640},
publisher = {IOS Press},
address = {Amsterdam, The Netherlands, The Netherlands},
}
@inproceedings{Guo2014impact,
author = {Guo, Jing and Bernecky, Robert and
Thiyagalingam, Jeyarajan and Scholz, Sven-Bodo},
title = {Polyhedral Methods for Improving Parallel Update-in-Place},
booktitle = {Proceedings of the 4th International Workshop on Polyhedral Compilation Techniques},
editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
year = 2014,
month = Jan,
address = {Vienna, Austria}
}
@INPROCEEDINGS{chien2019,
author={S. {Chien} and I. {Peng} and S. {Markidis}},
booktitle={2019 IEEE/ACM Workshop on Memory Centric High Performance Computing (MCHPC)},
title={Performance Evaluation of Advanced Features in CUDA Unified Memory},
year={2019},
volume={},
number={},
pages={50-57},
doi={10.1109/MCHPC49590.2019.00014}
}
@InProceedings{dyntaskgpu2013,
author={Chatterjee, Sanjay and Grossman, Max and Sb{\^i}rlea, Alina and Sarkar, Vivek},
editor={Rajopadhye, Sanjay and Mills Strout, Michelle},
title={{Dynamic Task Parallelism with a GPU Work-Stealing Runtime System}},
booktitle={Languages and Compilers for Parallel Computing},
year={2013},
publisher={Springer Berlin Heidelberg},
address={Berlin, Heidelberg},
pages={203--217},
abstract={NVIDIA's Compute Unified Device Architecture (CUDA) enabled GPUs become accessible to mainstream programming. Abundance of simple computational cores and high memory bandwidth make GPUs ideal candidates for data parallel applications. However, its potential for executing applications that combine task and data parallelism has not been explored in detail. CUDA does not provide a viable interface for creating dynamic tasks and handling load balancing issues. Any support for such has to be orchestrated entirely by the CUDA programmer today.},
isbn={978-3-642-36036-7},
doi={10.1007/978-3-642-36036-7_14}
}
@article{async2012,
title = {{Performance models for asynchronous data transfers on consumer Graphics Processing Units}},
journal = {Journal of Parallel and Distributed Computing},
volume = {72},
number = {9},
pages = {1117--1126},
year = {2012},
note = {Accelerators for High-Performance Computing},
issn = {0743-7315},
doi = {10.1016/j.jpdc.2011.07.011},
author = {Juan Gómez-Luna and José María González-Linares and José Ignacio Benavides and Nicolás Guil},
keywords = {GPU, CUDA, Asynchronous transfers, Streams, Overlapping of communication and computation},
abstract = {Graphics Processing Units (GPU) have impressively arisen as general-purpose coprocessors in high performance computing applications, since the launch of the Compute Unified Device Architecture (CUDA). However, they present an inherent performance bottleneck in the fact that communication between two separate address spaces (the main memory of the CPU and the memory of the GPU) is unavoidable. The CUDA Application Programming Interface (API) provides asynchronous transfers and streams, which permit a staged execution, as a way to overlap communication and computation. Nevertheless, a precise manner to estimate the possible improvement due to overlapping does not exist, neither a rule to determine the optimal number of stages or streams in which computation should be divided. In this work, we present a methodology that is applied to model the performance of asynchronous data transfers of CUDA streams on different GPU architectures. Thus, we illustrate this methodology by deriving expressions of performance for two different consumer graphic architectures belonging to the more recent generations. These models permit programmers to estimate the optimal number of streams in which the computation on the GPU should be broken up, in order to obtain the highest performance improvements. Finally, we have checked the suitability of our performance models with three applications based on codes from the CUDA Software Development Kit (SDK) with successful results.}
}
@InProceedings{autocuda2012,
author={Jung, Hanwoong and Yi, Youngmin and Ha, Soonhoi},
editor={Wyrzykowski, Roman and Dongarra, Jack and Karczewski, Konrad and Wa{\'{s}}niewski, Jerzy},
title={{Automatic CUDA Code Synthesis Framework for Multicore CPU and GPU Architectures}},
booktitle={Parallel Processing and Applied Mathematics},
year={2012},
publisher={Springer Berlin Heidelberg},
address={Berlin, Heidelberg},
pages={579--588},
abstract={Recently, general purpose GPU (GPGPU) programming has spread rapidly after CUDA was first introduced to write parallel programs in high-level languages for NVIDIA GPUs. While a GPU exploits data parallelism very effectively, task-level parallelism is exploited as a multi-threaded program on a multicore CPU. For such a heterogeneous platform that consists of a multicore CPU and GPU, we propose an automatic code synthesis framework that takes a process network model specification as input and generates a multithreaded CUDA code. With the model based specification, one can explicitly specify both function-level and loop-level parallelism in an application and explore the wide design space in mapping of function blocks and selecting the communication methods between CPU and GPU. The proposed technique is complementary to other high-level methods of CUDA programming.},
isbn={978-3-642-31464-3},
doi={10.1007/978-3-642-31464-3_59}
}
@INPROCEEDINGS{uintah2012,
author={Q. {Meng} and A. {Humphrey} and M. {Berzins}},
booktitle={2012 SC Companion: High Performance Computing, Networking Storage and Analysis},
title={The {Uintah} Framework: A Unified Heterogeneous Task Scheduling and Runtime System},
year={2012},
volume={},
number={},
pages={2441-2448},
abstract={The development of a new unified, multi-threaded runtime system for the execution of asynchronous tasks on heterogeneous systems is described in this work. These asynchronous tasks arise from the Uintah framework, which was developed to provide an environment for solving a broad class of fluid-structure interaction problems on structured adaptive grids. Uintah has a clear separation between its MPI-free user-coded tasks and its runtime system that ensures these tasks execute efficiently. This separation also allows for complete isolation of the application developer from the complexities involved with the parallelism Uintah provides. While we have designed scalable runtime systems for large CPU core counts, the emergence of heterogeneous systems, with additional on-node accelerators and co-processors presents additional design challenges in terms of effectively utilizing all computational resources on-node and managing multiple levels of parallelism. Our work addresses these challenges for Uintah by the development of new hybrid runtime system and Unified multi-threaded MPI task scheduler, enabling Uintah to fully exploit current and emerging architectures with support for asynchronous, out-of-order scheduling of both CPU and GPU computational tasks. This design coupled with an approach that uses MPI to communicate between nodes, a shared memory model on-node and the use of novel lock-free data structures, has made it possible for Uintah to achieve excellent scalability for challenging fluid-structure problems using adaptive mesh refinement on as many as 256K cores on the DoE Jaguar XK6 system. This design has also demonstrated an ability to run capability jobs on the heterogeneous systems, Keeneland and TitanDev. In this work, the evolution of Uintah and its runtime system is examined in the context of our new Unified multi-threaded scheduler design. The performance of the Unified scheduler is also tested against previous Uintah scheduler and runtime designs over a range of processor core and GPU counts.},
keywords={application program interfaces;data structures;message passing;multi-threading;processor scheduling;shared memory systems;Keeneland;TitanDev;DoE Jaguar XK6 system;adaptive mesh refinement;lock-free data structures;shared memory model on-node;asynchronous out-of-order scheduling;GPU computational tasks;hybrid runtime system;unified multithreaded MPI task scheduler design;coprocessors;on-node accelerators;CPU core counts;MPI-free user-coded tasks;structured adaptive grids;fluid-structure interaction problems;heterogeneous systems;asynchronous task execution;multithreaded runtime system;unified heterogeneous task scheduling;Uintah framework;Graphics processing units;Instruction sets;Runtime;Data warehouses;Computer architecture;Master-slave;Parallel processing},
doi={10.1109/SCC.2012.6674233},
ISSN={},
month=Nov,
}
@article{kim2013,
author = {Kim, Yooseong and Shrivastava, Aviral},
title = {Memory Performance Estimation of CUDA Programs},
year = {2013},
issue_date = {September 2013},
publisher = {ACM},
address = {New York, NY, USA},
volume = {13},
number = {2},
issn = {1539-9087},
doi = {10.1145/2514641.2514648},
abstract = {CUDA has successfully popularized GPU computing, and GPGPU applications are now used in various embedded systems. The CUDA programming model provides a simple interface to program on GPUs, but tuning GPGPU applications for high performance is still quite challenging. Programmers need to consider numerous architectural details, and small changes in source code, especially on the memory access pattern, can affect performance significantly. This makes it very difficult to optimize CUDA programs. This article presents CuMAPz, which is a tool to analyze and compare the memory performance of CUDA programs. CuMAPz can help programmers explore different ways of using shared and global memories, and optimize their program for efficient memory behavior. CuMAPz models several memory-performance-related factors: data reuse, global memory access coalescing, global memory latency hiding, shared memory bank conflict, channel skew, and branch divergence. Experimental results show that CuMAPz can accurately estimate performance with correlation coefficient of 0.96. By using CuMAPz to explore the memory access design space, we could improve the performance of our benchmarks by 30% more than the previous approach [Hong and Kim 2010].},
journal = {ACM Trans. Embed. Comput. Syst.},
month = Sep,
articleno = {21},
numpages = {22},
keywords = {CUDA, memory performance, program optimization, GPGPU, performance estimation}
}
@article{choi2020,
author = {Choi, Jake and You, Hojun and Kim, Chongam and Yeom, Heon Young and Kim, Yoonhee},
title = {{Comparing unified, pinned, and host/device memory allocations for memory-intensive workloads on Tegra SoC}},
journal = {Concurrency and Computation: Practice and Experience},
keywords = {benchmark, CFD, CUDA, GPU, memory, pinned, Rodinia, unified},
doi = {10.1002/cpe.6018},
year = {2020},
month = Sep,
abstract = {Edge computing focuses on processing near the source of the data. Edge computing devices using the Tegra SoC architecture provide a physically distinct GPU memory architecture. In order to take advantage of this architecture, different modes of memory allocation need to be considered. Different GPU memory allocation techniques yield different results in memory usage and execution times of identical applications on Tegra devices. In this article, we implement several GPU application benchmarks, including our custom CFD code with unified, pinned, and normal host/device memory allocation modes. We evaluate and compare the memory usage and execution time of such workloads on edge computing Tegra system-on-chips (SoC) equipped with integrated GPUs using a shared memory architecture, and non-SoC machines with discrete GPUs equipped with distinct VRAM. We discover that utilizing normal memory allocation methods on SoCs actually use double the required memory because of unnecessary device memory copies, despite being physically shared with host memory. We show that GPU application memory usage can be reduced up to 50\%, and that even performance improvements can occur just by replacing normal memory allocation and memory copy methods with managed unified memory or pinned memory allocation.}
}
@InProceedings{GrelSchoIFL02,
author = {Clemens Grelck and Sven-Bodo Scholz},
title = {Axis Control in Sac},
booktitle = {Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers},
year = {2003},
editor = {Ricardo Pe{\~n}a and Thomas Arts},
volume = {2670},
series = {Lecture Notes in Computer Science},
pages = {182--198},
publisher = {Springer},
abstract = {High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.},
category = {core, opt},
pubaddress = {Berlin, Heidelberg, Germany},
sourceurl = {http://www.isp.mu-luebeck.de/~grelck/publications/axis-control-madrid-02.ps.gz},
topics = {SAC},
}