From b58bc24940818c8fc1442c5f3c6cb2f871b5b7dc Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 8 Jul 2024 16:47:18 -0700 Subject: [PATCH 01/10] advanced tutorial --- .../00-Kernel-Fusion/00-Kernel-Fusion.cpp | 10 +++ .../00-Kernel-Fusion/CMakeLists.txt | 11 +++ .../01-GPU-Threads/01-GPU-Threads.cpp | 79 +++++++++++++++++++ .../01-GPU-Threads/CMakeLists.txt | 11 +++ Advanced_Tutorial/CMakeLists.txt | 10 +++ CMakeLists.txt | 1 + 6 files changed, 122 insertions(+) create mode 100644 Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp create mode 100644 Advanced_Tutorial/00-Kernel-Fusion/CMakeLists.txt create mode 100644 Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp create mode 100644 Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt create mode 100644 Advanced_Tutorial/CMakeLists.txt diff --git a/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp b/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp new file mode 100644 index 0000000..f3a0a2d --- /dev/null +++ b/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp @@ -0,0 +1,10 @@ +#include + +int main(int argc, char *argv[]) +{ + + std::cout<<"TO DO"< + +int main(int argc, char *argv[]) +{ + + + auto& rm = umpire::ResourceManager::getInstance(); + unsigned char *cnt{nullptr}; + auto allocator = rm.getAllocator("UM"); + auto pool = rm.makeAllocator("qpool", allocator); + cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); + + using device_launch = RAJA::cuda_launch_t; + using launch_policy = RAJA::LaunchPolicy; + + //Example 1. 
Global Indexing: + //GPU programming models such as CUDA and HIP follow a thread/block(team) programming model + //in which a predefined compute grid + { + const int N_x = 10000; + const int N_y = 20000; + const int block_sz = 256; + const int n_blocks_x = (N_x + block_sz) / block_sz + 1; + const int n_blocks_y = (N_y + block_sz) / block_sz + 1; + + using loop_pol_x = RAJA::LoopPolicy; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + + //populate + + + }); + }); + + + }); + + } + + + + + + + + + + //Iteration Space: + { + const int n_blocks = 50000; + const int block_sz = 64; + + RAJA::launch + ( RAJA::LaunchParams(RAJA::Teams(n_blocks), + RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + + RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) { + + }); + + }); + } + + + + + + + + return 0; +} diff --git a/Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt b/Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt new file mode 100644 index 0000000..7179d6a --- /dev/null +++ b/Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt @@ -0,0 +1,11 @@ +############################################################################### +# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_executable( + NAME 01-GPU-Threads + SOURCES 01-GPU-Threads.cpp + DEPENDS_ON cuda umpire RAJA) diff --git a/Advanced_Tutorial/CMakeLists.txt b/Advanced_Tutorial/CMakeLists.txt new file mode 100644 index 0000000..9bfb736 --- /dev/null +++ b/Advanced_Tutorial/CMakeLists.txt @@ -0,0 +1,10 @@ +############################################################################### +# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +add_subdirectory(00-Kernel-Fusion) + +add_subdirectory(01-GPU-Threads) diff --git a/CMakeLists.txt b/CMakeLists.txt index 49d50ed..8d0cb19 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,3 +25,4 @@ endif() add_subdirectory(tpl) add_subdirectory(Intro_Tutorial) add_subdirectory(Intermediate_Tutorial) +add_subdirectory(Advanced_Tutorial) From 8226b8e9c87881a41aba36abbf2964aefda56ebe Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 10 Jul 2024 09:36:17 -0700 Subject: [PATCH 02/10] overview of gpu threads --- .../01-GPU-Threads/01-GPU-Threads.cpp | 174 ++++++++++++++---- 1 file changed, 140 insertions(+), 34 deletions(-) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index 5790add..06a53b8 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -1,21 +1,68 @@ +#include "RAJA/RAJA.hpp" + #include -int main(int argc, char *argv[]) +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) { - - auto& rm = umpire::ResourceManager::getInstance(); - unsigned char *cnt{nullptr}; - auto allocator = rm.getAllocator("UM"); - auto pool = 
rm.makeAllocator("qpool", allocator); - cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); - - using device_launch = RAJA::cuda_launch_t; - using launch_policy = RAJA::LaunchPolicy; - - //Example 1. Global Indexing: - //GPU programming models such as CUDA and HIP follow a thread/block(team) programming model - //in which a predefined compute grid + // GPU programming models such as CUDA and HIP peform computation + // on a predefined grid composed of threads and blocks. + // RAJA provides policies enabling users to utilize different + // thread enumeration strategies. The two main strategies + // are Block/Thread local enumeration and Global enumeration. + // + // + // Under a block and thread enumeration, + // threads have block local coordinate enumeration. + // While blocks are enumerated with respect + // to their location within the compute grid. + // The illustration below shows 2 x 2 compute grid + // wherein each block has 3 x 2 threads. Current + // programing models support up to three-dimensional + // block and thread configurations. + // + // + // Block (0,0) Block (1,0) + // [0,0][0,1][0,2] [0,0][0,1][0,2] + // [1,0][1,1][1,2] [1,0][1,1][1,2] + // + // Block (0,1) Block (1,1) + // [0,0][0,1][0,2] [0,0][0,1][0,2] + // [1,0][1,1][1,2] [1,0][1,1][1,2] + // + // Under the global enumeration each thread + // is a assigned a unique thread id based on + // on the dimension (2D illustrated here). + // The utility here comes when the iteration + // space is amendable to tiles in which blocks + // can be assigned to a tile and threads are + // assigned to work within a tile. + // + // [0,0][0,1][0,2] [0,3][0,4][0,5] + // [1,0][1,1][1,2] [1,3][1,4][1,5] + // + // [2,0][2,1][2,2] [2,3][2,4][2,5] + // [3,0][3,1][3,2] [3,3][3,4][3,5] + // + + // Short note on RAJA nomenclature: + // As RAJA serves as an abstraction layer + // the RAJA::launch API uses the terms + // teams and threads. 
Teams are analogous + // to blocks in CUDA/HIP nomenclature + // and workgroups in the SYCL programming model. + // Threads are analogous to threads within CUDA/HIP + // and work-items within the SYCL programming model. + +#if defined(RAJA_ENABLE_CUDA) + + // The examples below showcase commonly used GPU policies. + // For the HIP and SYCL programming models, we offer analogous policies. + + using launch_policy = RAJA::LaunchPolicy>; + + // Example 1. Global Indexing + // Main use case: Perfectly nested loops with large iteration spaces. { const int N_x = 10000; const int N_y = 20000; @@ -23,57 +70,116 @@ int main(int argc, char *argv[]) const int n_blocks_x = (N_x + block_sz) / block_sz + 1; const int n_blocks_y = (N_y + block_sz) / block_sz + 1; - using loop_pol_x = RAJA::LoopPolicy; + using global_pol_y = RAJA::LoopPolicy; + using global_pol_x = RAJA::LoopPolicy; - RAJA::launch + RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + + //Do something - //populate - - }); }); - }); } + // Example 2. 
Block and thread direct polcies + // Ideal for when iteration space can broken up into tiles + // Teams can be assigned to a tile and threads can perform + // computations within the tile + + // The example below employs the direct version for block + // and thread policies, the underlying assumption is that + // the loops are within the range of the grid and block sizes + // In CUDA the direct loops are expressed as: + // + // const int i = threadIdx.x; + // if(i < N) { //kernel } + // + { + const int n_blocks = 50000; + const int block_sz = 64; + using outer_pol = RAJA::LoopPolicy; + using inner_pol = RAJA::LoopPolicy; + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { - //Iteration Space: - { - const int n_blocks = 50000; - const int block_sz = 64; + RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { - RAJA::launch - ( RAJA::LaunchParams(RAJA::Teams(n_blocks), - RAJA::Threads(block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + //Do something + }); + }); - RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) { - }); + }); - }); } - + // Example 3. Block and thread loop polcies + // Similar to the example above but using a thread loop + // policy. The utility of the thread loop policy rises when + // we consider multiple thread loops with varying iteration sizes. + + // If a RAJA loop iteration space is beyond the configured number + // of threads in a team. The thread loop policies will perform a team + // stride loop to span the whole range. 
+ // In CUDA the block stride loop is expressed as + // + // for(int i=threadIdx.x; i; + using inner_pol = RAJA::LoopPolicy; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { + + //Iteration space is same as number of blocks per thread + //We could also use direct policy here + RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { + //Do something here + }); //inner loop + + + //Iteration space is *more* than number of blocks per thread + RAJA::loop(ctx, RAJA::RangeSegment(0, 2*block_sz), [&] (int tx) { + //Do something here + }); //inner loop + + }); //outer loop + + }); + + } +#else + std::cout<<"Please compile with CUDA"< Date: Wed, 10 Jul 2024 09:37:23 -0700 Subject: [PATCH 03/10] move intro to before main --- Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index 06a53b8..a4aceaf 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -1,10 +1,5 @@ #include "RAJA/RAJA.hpp" -#include - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) -{ - // GPU programming models such as CUDA and HIP peform computation // on a predefined grid composed of threads and blocks. // RAJA provides policies enabling users to utilize different @@ -54,6 +49,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // Threads are analogous to threads within CUDA/HIP // and work-items within the SYCL programming model. +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) +{ + #if defined(RAJA_ENABLE_CUDA) // The examples below showcase commonly used GPU policies. 
From 93ce2fbd22c19d28401aebd657e0747fa61177e5 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 10 Jul 2024 10:59:10 -0700 Subject: [PATCH 04/10] add resource example --- .../02-CAMP-resource/02-CAMP-resource.cpp | 108 ++++++++++++++++++ .../02-CAMP-resource/CMakeLists.txt | 11 ++ Advanced_Tutorial/CMakeLists.txt | 2 +- 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp create mode 100644 Advanced_Tutorial/02-CAMP-resource/CMakeLists.txt diff --git a/Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp b/Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp new file mode 100644 index 0000000..35e21e3 --- /dev/null +++ b/Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp @@ -0,0 +1,108 @@ +#include "RAJA/RAJA.hpp" + +// In a serial GPU programing model kernels are executed +// in a sequential order based on the order in which the +// the kernels are being launched. +// +// GPU programming models such as CUDA/HIP and SYCL +// have the capability of performing device operations +// concurrently. The RAJA portability suite exposes +// concurrent kernel execution through the use of the +// CAMP::resources. +// +// A CAMP::resources corresponds to device stream in +// which we may guarantee that device operations will +// be executed in sequential order. Different streams, +// however; may operate concurrently. +// + +// +// RAJA::resources by default is configured to not be +// the device's default stream. Historicaly the default +// stream serves as a synchronizing stream. No other +// operations can begin until all issued operations on +// the default stream are completed. + +// In modern versions of CUDA the behavior of the default +// stream can be changed to be non-synchronizing. 
+ + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) +{ + +#if defined(RAJA_ENABLE_CUDA) + + + constexpr int N = 10; + constexpr int M = 1000000; + + //Master resource to orchestrate between memory transfers + RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; + RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; + int* d_array = def_cuda_res.allocate(N*M); + int* h_array = def_host_res.allocate(N*M); + + RAJA::RangeSegment one_range(0, 1); + RAJA::RangeSegment m_range(0, M); + RAJA::RangeSegment n_range(0, N); + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_pol_x = RAJA::LoopPolicy; + + using inner_pol_x = RAJA::LoopPolicy; + + + for(int i=0; i(res_cuda, + RAJA::LaunchParams(RAJA::Teams(64), + RAJA::Threads(1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, m_range, [&] (int j) { + RAJA::loop(ctx, one_range, [&] (int k) { + + d_array[i*M + j] = i * M + j; + + }); + }); + + }); + + //perform synchronization between different streams + def_cuda_res.wait_for(&e); + } + + //Master resource to perform the memory copy + //All other streams have been synchronized with respect to def_cuda_res + def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); + + int ec_count = 0; + RAJA::forall( RAJA::RangeSegment(0, N*M), + [=, &ec_count](int i){ + if (h_array[i] != i) ec_count++; + } + ); + + std::cout << " Result -- "; + if (ec_count > 0) + std::cout << "FAIL : error count = " << ec_count << "\n"; + else + std::cout << "PASS!\n"; + + +#else + std::cout<<"Please compile with CUDA"< Date: Mon, 29 Jul 2024 00:05:42 -0700 Subject: [PATCH 05/10] rename files --- .../01-GPU-Threads/01-GPU-Threads.cpp | 65 +++++++++---------- .../02-raja-resource.cpp} | 4 +- .../CMakeLists.txt | 4 +- Advanced_Tutorial/CMakeLists.txt | 2 +- 4 files changed, 37 insertions(+), 38 deletions(-) rename Advanced_Tutorial/{02-CAMP-resource/02-CAMP-resource.cpp => 02-RAJA-Resource/02-raja-resource.cpp} 
(97%) rename Advanced_Tutorial/{02-CAMP-resource => 02-RAJA-Resource}/CMakeLists.txt (87%) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index a4aceaf..d168a34 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -57,38 +57,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // The examples below showcase commonly used GPU policies. // For the HIP and SYCL programming models, we offer analogous policies. - using launch_policy = RAJA::LaunchPolicy>; + contexpr bool async = false; //asynchronous kernel execution - // Example 1. Global Indexing - // Main use case: Perfectly nested loops with large iteration spaces. - { - const int N_x = 10000; - const int N_y = 20000; - const int block_sz = 256; - const int n_blocks_x = (N_x + block_sz) / block_sz + 1; - const int n_blocks_y = (N_y + block_sz) / block_sz + 1; - - using global_pol_y = RAJA::LoopPolicy; - using global_pol_x = RAJA::LoopPolicy; - - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + using launch_policy = RAJA::LaunchPolicy>; - RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { - - //Do something - - }); - }); - - }); - - } - - - // Example 2. Block and thread direct polcies + // Example 1a. Block and thread direct polcies // Ideal for when iteration space can broken up into tiles // Teams can be assigned to a tile and threads can perform // computations within the tile @@ -102,7 +75,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // const int i = threadIdx.x; // if(i < N) { //kernel } // - { const int n_blocks = 50000; const int block_sz = 64; @@ -129,8 +101,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) } - - // Example 3. 
Block and thread loop polcies + // Example 1b. Block and thread loop polcies // Similar to the example above but using a thread loop // policy. The utility of the thread loop policy rises when // we consider multiple thread loops with varying iteration sizes. @@ -174,6 +145,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) } + // Example 1c. Global Indexing + // Main use case: Perfectly nested loops with large iteration spaces. + { + const int N_x = 10000; + const int N_y = 20000; + const int block_sz = 256; + const int n_blocks_x = (N_x + block_sz) / block_sz + 1; + const int n_blocks_y = (N_y + block_sz) / block_sz + 1; + + using global_pol_y = RAJA::LoopPolicy; + using global_pol_x = RAJA::LoopPolicy; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + + //Do something + + }); + }); + + }); + + } + #else std::cout<<"Please compile with CUDA"< Date: Mon, 29 Jul 2024 00:06:41 -0700 Subject: [PATCH 06/10] clean up pass --- .../02-raja-resource.cpp | 0 .../{02-RAJA-Resource => 02-RAJA-Resources}/CMakeLists.txt | 0 Advanced_Tutorial/CMakeLists.txt | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) rename Advanced_Tutorial/{02-RAJA-Resource => 02-RAJA-Resources}/02-raja-resource.cpp (100%) rename Advanced_Tutorial/{02-RAJA-Resource => 02-RAJA-Resources}/CMakeLists.txt (100%) diff --git a/Advanced_Tutorial/02-RAJA-Resource/02-raja-resource.cpp b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp similarity index 100% rename from Advanced_Tutorial/02-RAJA-Resource/02-raja-resource.cpp rename to Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp diff --git a/Advanced_Tutorial/02-RAJA-Resource/CMakeLists.txt b/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt similarity index 100% rename from 
Advanced_Tutorial/02-RAJA-Resource/CMakeLists.txt rename to Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt diff --git a/Advanced_Tutorial/CMakeLists.txt b/Advanced_Tutorial/CMakeLists.txt index 1e2b292..6154a39 100644 --- a/Advanced_Tutorial/CMakeLists.txt +++ b/Advanced_Tutorial/CMakeLists.txt @@ -7,4 +7,4 @@ add_subdirectory(00-Kernel-Fusion) add_subdirectory(01-GPU-Threads) -add_subdirectory(02-RAJA-Resource) +add_subdirectory(02-RAJA-Resources) From 2762405aa3ec85e3d4e11ea364e5434c6a49f0f4 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 29 Jul 2024 00:15:13 -0700 Subject: [PATCH 07/10] rename files --- .../{02-raja-resource.cpp => 02-raja-resources.cpp} | 0 Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename Advanced_Tutorial/02-RAJA-Resources/{02-raja-resource.cpp => 02-raja-resources.cpp} (100%) diff --git a/Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp similarity index 100% rename from Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp rename to Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp diff --git a/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt b/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt index d1c9506..c26166c 100644 --- a/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt +++ b/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt @@ -6,6 +6,6 @@ ############################################################################### blt_add_executable( - NAME 02-raja-resource - SOURCES 02-raja-resource.cpp + NAME 02-raja-resources + SOURCES 02-raja-resources.cpp DEPENDS_ON cuda umpire RAJA) From b5badcf88859d923f4ecde7ee883701dc13abb14 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 29 Jul 2024 00:22:49 -0700 Subject: [PATCH 08/10] clean up example --- Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git 
a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index d168a34..0289b6e 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -87,16 +87,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { - //Do something + //loop body }); }); - }); } @@ -130,13 +127,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) //Iteration space is same as number of blocks per thread //We could also use direct policy here RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { - //Do something here + //loop body }); //inner loop //Iteration space is *more* than number of blocks per thread RAJA::loop(ctx, RAJA::RangeSegment(0, 2*block_sz), [&] (int tx) { - //Do something here + //loop body }); //inner loop }); //outer loop @@ -164,7 +161,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { - //Do something + //loop body }); }); From cbd3548935ed7410a07976fe6842af4563f56f6a Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 29 Jul 2024 01:42:06 -0700 Subject: [PATCH 09/10] turn on nvtx and take a clean up pass --- Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp | 2 +- Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp | 7 +++---- tpl/CMakeLists.txt | 1 + 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index 0289b6e..5f8ebb2 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -57,7 +57,7 @@ int 
main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // The examples below showcase commonly used GPU policies. // For the HIP and SYCL programming models, we offer analogous policies. - contexpr bool async = false; //asynchronous kernel execution + constexpr bool async = false; //asynchronous kernel execution using launch_policy = RAJA::LaunchPolicy>; diff --git a/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp index f69c777..2ee0708 100644 --- a/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp +++ b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp @@ -45,7 +45,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); - RAJA::RangeSegment n_range(0, N); using launch_policy = RAJA::LaunchPolicy>; @@ -65,11 +64,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) RAJA::resources::Event e = RAJA::launch(res_cuda, RAJA::LaunchParams(RAJA::Teams(64), - RAJA::Threads(1)), + RAJA::Threads(1)), "RAJA resource example", [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, m_range, [&] (int j) { - RAJA::loop(ctx, one_range, [&] (int k) { + RAJA::loop(ctx, m_range, [&] (int j) { + RAJA::loop(ctx, one_range, [&] (int k) { d_array[i*M + j] = i * M + j; diff --git a/tpl/CMakeLists.txt b/tpl/CMakeLists.txt index 19efb48..fc8fed4 100644 --- a/tpl/CMakeLists.txt +++ b/tpl/CMakeLists.txt @@ -4,6 +4,7 @@ # set (RAJA_ENABLE_EXERCISES Off CACHE BOOL "") # set (RAJA_ENABLE_DOCUMENTATION Off CACHE BOOL "") # set (RAJA_ENABLE_BENCHMARKS Off CACHE BOOL "") + set (RAJA_ENABLE_NV_TOOLS_EXT ON CACHE BOOL "") # # set (UMPIRE_ENABLE_TESTS Off CACHE BOOL "") # set (UMPIRE_ENABLE_EXAMPLES Off CACHE BOOL "") From 667e5fceffa68ccee03c8521d3c7716930cf50ed Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 29 Jul 2024 09:28:08 -0700 Subject: [PATCH 10/10] adding Umpire pool coalescing 
heuristic lesson --- .../03-Umpire-Coalescing-Heuristics.cpp | 128 ++++++++++++++++++ .../CMakeLists.txt | 11 ++ .../03-Umpire-Coalescing-Heuristics/README.md | 16 +++ 3 files changed, 155 insertions(+) create mode 100644 Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp create mode 100644 Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt create mode 100644 Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp new file mode 100644 index 0000000..a6f4080 --- /dev/null +++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp @@ -0,0 +1,128 @@ +#include "umpire/Allocator.hpp" +#include "umpire/ResourceManager.hpp" +#include "umpire/strategy/QuickPool.hpp" +#include "umpire/util/Macros.hpp" +#include "umpire/util/wrap_allocator.hpp" + +int main(int, char**) +{ + //Create the instance of the Resource Manager and use it to create an allocator using the DEVICE memory resource. + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator("DEVICE"); + + /* + * Set up Percent Releasable and Blocks Releasable heuristics. The Percent Releasable heuristic + * will coalesce the pool when some percentage of bytes in the pool is releasable (i.e. free). + * The Blocks Releasable heuristic will coalesce the pool when a certain number of blocks in + * the pool is releasable (i.e. free). Each heuristic function takes a parameter that specifies + * either the percentage or the number of blocks, depending on which heuristic it is. + * Below, do the following: + * 1. Create a Percent Releasable heuristic function that will coalesce when the entire pool is releasable. + * 2. Create a Percent Releasable heuristic function that will coalesce when 75% of the pool is releasable. + * 3. 
Create a Blocks Releasable heuristic function that will coalesce when 3 blocks of the pool are releasable. + * 4. Create a Blocks Releasable heuristic function that will coalesce when 5 blocks of the pool are releasable. + */ + auto pr75_hwm_heuristic = umpire::strategy::QuickPool::percent_releasable_hwm(75); + auto pr100_heuristic = umpire::strategy::QuickPool::percent_releasable(100); + auto br3_heuristic = umpire::strategy::QuickPool::blocks_releasable(3); + auto br5_hwm_heuristic = umpire::strategy::QuickPool::blocks_releasable_hwm(5); + + //Note: if no heuristic function is set for a pool, the default heuristic function is Percent Releasable set to 100%. + //This should work decently well for many cases, but with particular allocation patterns, it may not be aggressive + //enough. + + /* + * Below, create a separate QuickPool for each heuristic function. The pools should have a parameter to set the + * size of the first block in the pool, a parameter to set the size of the next blocks in the pool, the alignment, and + * finally the heuristic function. + * + * By passing the specific heuristic function to the constructor of the pool, we are ensuring that every time the + * pool must be coalesced, it uses the exact heuristic function we set above. + */ + auto pool1 = rm.makeAllocator<umpire::strategy::QuickPool>("pool1", allocator, 1024ul, 1024ul, 16, pr75_hwm_heuristic); + auto pool2 = rm.makeAllocator<umpire::strategy::QuickPool>("pool2", allocator, 1024ul, 1024ul, 16, pr100_heuristic); + auto pool3 = rm.makeAllocator<umpire::strategy::QuickPool>("pool3", allocator, 1024ul, 1024ul, 16, br3_heuristic); + auto pool4 = rm.makeAllocator<umpire::strategy::QuickPool>("pool4", allocator, 1024ul, 1024ul, 16, br5_hwm_heuristic); + + //Note: below we are using the allocator's Unwrap utility to expose the QuickPool class underneath. We will use + //this to query pool stats below. It is not a requirement, but can be useful for debugging. 
+ auto quick_pool1 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool1); + auto quick_pool2 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool2); + auto quick_pool3 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool3); + auto quick_pool4 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool4); + + //Allocate 4 arrays of void pointers + void *a[4], *b[4], *c[4], *d[4]; + + //Allocate 1024 bytes in each element of each array + for (int i = 0; i < 4; ++i) { + a[i] = pool1.allocate(1024); + b[i] = pool2.allocate(1024); + c[i] = pool3.allocate(1024); + d[i] = pool4.allocate(1024); + } + + //Only deallocate one element of the array so that one block is freed up. + pool1.deallocate(a[1]); + pool2.deallocate(b[1]); + pool3.deallocate(c[1]); + pool4.deallocate(d[1]); + + //Allocate larger amounts of bytes in its place. This will cause the pool to rearrange blocks under the hood. + a[1] = pool1.allocate(4096); + b[1] = pool2.allocate(4096); + c[1] = pool3.allocate(4096); + d[1] = pool4.allocate(4096); + + //Next, deallocate another element of the array that is different from above. + pool1.deallocate(a[2]); + pool2.deallocate(b[2]); + pool3.deallocate(c[2]); + pool4.deallocate(d[2]); + + //Allocate a smaller amount of bytes in its place. This will cause the pool to rearrange blocks under the hood. + a[2] = pool1.allocate(64); + b[2] = pool2.allocate(64); + c[2] = pool3.allocate(64); + d[2] = pool4.allocate(64); + + //As we deallocate from each pool, print out stats. Each pool should behave differently under the hood because + //of the different coalescing heuristic functions used. + for (int i = 0; i < 4; ++i) { + pool1.deallocate(a[i]); + std::cout << "Pool1 has " << pool1.getActualSize() << " bytes of memory. " + << pool1.getCurrentSize() << " bytes are used. " << quick_pool1->getBlocksInPool() + << " blocks are in the pool. " << quick_pool1->getReleasableSize() << " bytes are releaseable. 
" + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + for (int i = 0; i < 4; ++i) { + pool2.deallocate(b[i]); + std::cout << "Pool2 has " << pool2.getActualSize() << " bytes of memory. " + << pool2.getCurrentSize() << " bytes are used. " << quick_pool2->getBlocksInPool() + << " blocks are in the pool. " << quick_pool2->getReleasableSize() << " bytes are releaseable. " + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + for (int i = 0; i < 4; ++i) { + pool3.deallocate(c[i]); + std::cout << "Pool3 has " << pool3.getActualSize() << " bytes of memory. " + << pool3.getCurrentSize() << " bytes are used. " << quick_pool3->getBlocksInPool() + << " blocks are in the pool. " << quick_pool3->getReleasableSize() << " bytes are releaseable. " + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + for (int i = 0; i < 4; ++i) { + pool4.deallocate(d[i]); + std::cout << "Pool4 has " << pool4.getActualSize() << " bytes of memory. " + << pool4.getCurrentSize() << " bytes are used. " << quick_pool4->getBlocksInPool() + << " blocks are in the pool. " << quick_pool4->getReleasableSize() << " bytes are releaseable. " + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + return 0; +} diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt new file mode 100644 index 0000000..847e0c5 --- /dev/null +++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt @@ -0,0 +1,11 @@ +############################################################################### +# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_executable( + NAME 03-Umpire-Coalescing-Heuristics + SOURCES 03-Umpire-Coalescing-Heuristics.cpp + DEPENDS_ON cuda umpire) diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md new file mode 100644 index 0000000..a467bed --- /dev/null +++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md @@ -0,0 +1,16 @@ +========================================================= +Advanced Tutorial - Using Coalescing Heuristics in Umpire +========================================================= + +Look for the `TODO` comments in the source code. Here you will have to create different +Umpire pools with their corresponding coalescing heuristic functions. + +A complete description of the different coalescing heuristic functions is available +in the online Umpire documentation: https://umpire.readthedocs.io/ + +Once you are ready, uncomment the COMPILE define on top of the file and do + +``` +$ make 03-Umpire-Coalescing-Heuristics +$ ./bin/03-Umpire-Coalescing-Heuristics +```