diff --git a/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp b/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp
new file mode 100644
index 0000000..f3a0a2d
--- /dev/null
+++ b/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp
@@ -0,0 +1,10 @@
+#include <iostream>
+
+int main(int argc, char *argv[])
+{
+
+  std::cout << "TO DO" << std::endl;
+
+  return 0;
+}
+
diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp
new file mode 100644
--- /dev/null
+++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp
+#include <iostream>
+#include "RAJA/RAJA.hpp"
+
+int main(int argc, char *argv[])
+{
+#if defined(RAJA_ENABLE_CUDA)
+
+  using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;
+
+  // Example 1a. Block and thread direct policies
+  // Ideal for when the iteration space can be broken up into tiles:
+  // teams are assigned to tiles, and threads perform the
+  // computations within each tile.
+
+  // The example below employs the direct version of the block
+  // and thread policies; the underlying assumption is that
+  // the loops fit within the configured grid and block sizes.
+
+  // In CUDA a direct loop is expressed as:
+  //
+  //   const int i = threadIdx.x;
+  //   if(i < N) { /* kernel body */ }
+  //
+  {
+    const int n_blocks = 50000;
+    const int block_sz = 64;
+
+    using outer_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
+    using inner_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
+
+    RAJA::launch<launch_policy>
+      (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)),
+       [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
+
+        RAJA::loop<outer_pol>(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) {
+          RAJA::loop<inner_pol>(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) {
+
+            //loop body
+
+          });
+        });
+
+    });
+
+  }
+
+  // Example 1b. Block and thread loop policies
+  // Similar to the example above, but using a thread *loop*
+  // policy. Thread loop policies become useful when a kernel
+  // contains multiple thread loops with varying iteration sizes.
+
+  // If a RAJA loop's iteration space exceeds the configured number
+  // of threads in a team, the thread loop policy performs a
+  // team-stride loop to span the whole range.
+
+  // In CUDA the equivalent block-stride loop is expressed as:
+  //
+  //   for(int i = threadIdx.x; i < N; i += blockDim.x) { /* kernel body */ }
+  //
+  {
+    const int n_blocks = 50000;
+    const int block_sz = 64;
+
+    using outer_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
+    using inner_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
+
+    RAJA::launch<launch_policy>
+      (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)),
+       [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
+
+        RAJA::loop<outer_pol>(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) {
+
+          //Iteration space matches the number of threads per team;
+          //we could also use a direct policy here
+          RAJA::loop<inner_pol>(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) {
+            //loop body
+          }); //inner loop
+
+          //Iteration space is *larger* than the number of threads per team;
+          //the loop policy strides through the full range
+          RAJA::loop<inner_pol>(ctx, RAJA::RangeSegment(0, 2*block_sz), [&] (int tx) {
+            //loop body
+          }); //inner loop
+
+        }); //outer loop
+
+    });
+
+  }
+
+  // Example 1c. Global Indexing
+  // Main use case: perfectly nested loops with large iteration spaces.
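+  // In CUDA, direct global-thread indexing is expressed as follows
+  // (a sketch, assuming a 2D grid of 2D thread blocks):
+  //
+  //   const int gx = blockIdx.x * blockDim.x + threadIdx.x;
+  //   const int gy = blockIdx.y * blockDim.y + threadIdx.y;
+  //   if(gx < N_x && gy < N_y) { /* kernel body */ }
+  //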
+  {
+    const int N_x = 10000;
+    const int N_y = 20000;
+    //Use 16 x 16 = 256 threads per team so the y extent is also
+    //covered by threads, and round the team counts up
+    const int block_sz = 16;
+    const int n_blocks_x = (N_x + block_sz - 1) / block_sz;
+    const int n_blocks_y = (N_y + block_sz - 1) / block_sz;
+
+    using global_pol_y = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>;
+    using global_pol_x = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;
+
+    RAJA::launch<launch_policy>
+      (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz, block_sz)),
+       [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
+
+        RAJA::loop<global_pol_y>(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) {
+          RAJA::loop<global_pol_x>(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) {
+
+            //loop body
+
+          });
+        });
+
+    });
+
+  }
+
+#else
+
+  std::cout << "Please compile with CUDA" << std::endl;
+
+#endif
+
+  return 0;
+}
diff --git a/Advanced_Tutorial/02-RAJA-Resources/02-RAJA-Resources.cpp b/Advanced_Tutorial/02-RAJA-Resources/02-RAJA-Resources.cpp
new file mode 100644
--- /dev/null
+++ b/Advanced_Tutorial/02-RAJA-Resources/02-RAJA-Resources.cpp
+#include <iostream>
+#include "RAJA/RAJA.hpp"
+
+int main(int argc, char *argv[])
+{
+#if defined(RAJA_ENABLE_CUDA)
+
+  //N streams, each filling M entries (values here are illustrative;
+  //M matches the RAJA::Teams(64) configuration below)
+  const int N = 4;
+  const int M = 64;
+
+  //Default resources; def_cuda_res acts as the master stream below
+  RAJA::resources::Cuda def_cuda_res;
+  RAJA::resources::Host def_host_res;
+
+  int* d_array = def_cuda_res.allocate<int>(N*M);
+  int* h_array = def_host_res.allocate<int>(N*M);
+
+  RAJA::RangeSegment one_range(0, 1);
+  RAJA::RangeSegment m_range(0, M);
+
+  //Asynchronous launch so work on different streams can overlap
+  using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
+
+  using outer_pol_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
+
+  using inner_pol_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
+
+
+  for(int i=0; i<N; ++i) {
+
+    //Each iteration gets its own CUDA resource, i.e. its own stream
+    RAJA::resources::Cuda res_cuda;
+
+    RAJA::resources::Event e = RAJA::launch<launch_policy>(res_cuda,
+        RAJA::LaunchParams(RAJA::Teams(64),
+        RAJA::Threads(1)), "RAJA resource example",
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+        RAJA::loop<outer_pol_x>(ctx, m_range, [&] (int j) {
+          RAJA::loop<inner_pol_x>(ctx, one_range, [&] (int k) {
+
+            d_array[i*M + j] = i * M + j;
+
+          });
+        });
+
+    });
+
+    //Synchronize the streams: the master resource waits on this stream's event
+    def_cuda_res.wait_for(&e);
+  }
+
+  //The master resource performs the memory copy;
+  //all other streams have been synchronized with respect to def_cuda_res
+  def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M);
+
+  int ec_count = 0;
+  RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(0, N*M),
+    [=, &ec_count](int i){
+      if (h_array[i] != i) ec_count++;
+    }
+  );
+
+  std::cout << " Result -- ";
+  if (ec_count > 0)
+    std::cout << "FAIL : error count = " << ec_count << "\n";
+  else
+    std::cout << "PASS!\n";
+
+#else
+  std::cout << "Please compile with CUDA" << std::endl;
+#endif
+
+  return 0;
+}
diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp
new file mode 100644
--- /dev/null
+++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp
+#include <iostream>
+
+#include "umpire/ResourceManager.hpp"
+#include "umpire/Allocator.hpp"
+#include "umpire/strategy/QuickPool.hpp"
+#include "umpire/util/wrap_allocator.hpp"
+
+int main(int argc, char *argv[])
+{
+  auto& rm = umpire::ResourceManager::getInstance();
+  auto allocator = rm.getAllocator("UM");
+
+  //TODO: create a coalescing heuristic for each pool. Below we use the
+  //four QuickPool heuristics: percent-releasable at 75% (high-water-mark
+  //variant) and 100%, and blocks-releasable at 3 and 5 blocks.
+  auto pr75_hwm_heuristic = umpire::strategy::QuickPool::percent_releasable_hwm(75);
+  auto pr100_heuristic = umpire::strategy::QuickPool::percent_releasable(100);
+  auto br3_heuristic = umpire::strategy::QuickPool::blocks_releasable(3);
+  auto br5_hwm_heuristic = umpire::strategy::QuickPool::blocks_releasable_hwm(5);
+
+  auto pool1 = rm.makeAllocator<umpire::strategy::QuickPool>("pool1", allocator, 1024ul, 1024ul, 16, pr75_hwm_heuristic);
+  auto pool2 = rm.makeAllocator<umpire::strategy::QuickPool>("pool2", allocator, 1024ul, 1024ul, 16, pr100_heuristic);
+  auto pool3 = rm.makeAllocator<umpire::strategy::QuickPool>("pool3", allocator, 1024ul, 1024ul, 16, br3_heuristic);
+  auto pool4 = rm.makeAllocator<umpire::strategy::QuickPool>("pool4", allocator, 1024ul, 1024ul, 16, br5_hwm_heuristic);
+
+  //Note: below we are using the unwrap utility to expose the QuickPool class underneath
+  //each allocator. We will use this to query pool stats below. It is not a
+  //requirement, but it can be useful for debugging.
+  auto quick_pool1 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool1);
+  auto quick_pool2 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool2);
+  auto quick_pool3 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool3);
+  auto quick_pool4 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool4);
+
+  //Four arrays of void pointers
+  void *a[4], *b[4], *c[4], *d[4];
+
+  //Allocate 1024 bytes for each element of each array
+  for (int i = 0; i < 4; ++i) {
+    a[i] = pool1.allocate(1024);
+    b[i] = pool2.allocate(1024);
+    c[i] = pool3.allocate(1024);
+    d[i] = pool4.allocate(1024);
+  }
+
+  //Deallocate only one element of each array so that one block is freed up.
+  pool1.deallocate(a[1]);
+  pool2.deallocate(b[1]);
+  pool3.deallocate(c[1]);
+  pool4.deallocate(d[1]);
+
+  //Allocate a larger number of bytes in its place. This causes the pool to rearrange blocks under the hood.
+  a[1] = pool1.allocate(4096);
+  b[1] = pool2.allocate(4096);
+  c[1] = pool3.allocate(4096);
+  d[1] = pool4.allocate(4096);
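+
+  //A rough sketch of how the heuristics above behave (see the Umpire
+  //documentation for the full details):
+  // - percent_releasable(P) signals a coalesce once at least P percent of the
+  //   pool's bytes sit in releasable (free) blocks; P = 100 waits until the
+  //   entire pool is releasable.
+  // - blocks_releasable(B) signals a coalesce once at least B blocks are free.
+  // - The *_hwm variants coalesce toward the pool's high water mark rather
+  //   than its current actual size.
+  //A coalesce can also be triggered by hand through the unwrapped pool, e.g.
+  //
+  //   quick_pool1->coalesce();
+  //
+  //which is left out here so that the heuristics drive the behavior.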
+
+  //Next, deallocate another element of each array, different from the one above.
+  pool1.deallocate(a[2]);
+  pool2.deallocate(b[2]);
+  pool3.deallocate(c[2]);
+  pool4.deallocate(d[2]);
+
+  //Allocate a smaller number of bytes in its place. This causes the pool to rearrange blocks under the hood.
+  a[2] = pool1.allocate(64);
+  b[2] = pool2.allocate(64);
+  c[2] = pool3.allocate(64);
+  d[2] = pool4.allocate(64);
+
+  //As we deallocate from each pool, print out its stats. Each pool should behave
+  //differently under the hood because of the different coalescing heuristic functions used.
+  for (int i = 0; i < 4; ++i) {
+    pool1.deallocate(a[i]);
+    std::cout << "Pool1 has " << pool1.getActualSize() << " bytes of memory. "
+              << pool1.getCurrentSize() << " bytes are used. " << quick_pool1->getBlocksInPool()
+              << " blocks are in the pool. " << quick_pool1->getReleasableSize() << " bytes are releasable. "
+              << std::endl;
+  }
+  std::cout << "----------------------------------" << std::endl;
+
+  for (int i = 0; i < 4; ++i) {
+    pool2.deallocate(b[i]);
+    std::cout << "Pool2 has " << pool2.getActualSize() << " bytes of memory. "
+              << pool2.getCurrentSize() << " bytes are used. " << quick_pool2->getBlocksInPool()
+              << " blocks are in the pool. " << quick_pool2->getReleasableSize() << " bytes are releasable. "
+              << std::endl;
+  }
+  std::cout << "----------------------------------" << std::endl;
+
+  for (int i = 0; i < 4; ++i) {
+    pool3.deallocate(c[i]);
+    std::cout << "Pool3 has " << pool3.getActualSize() << " bytes of memory. "
+              << pool3.getCurrentSize() << " bytes are used. " << quick_pool3->getBlocksInPool()
+              << " blocks are in the pool. " << quick_pool3->getReleasableSize() << " bytes are releasable. "
+              << std::endl;
+  }
+  std::cout << "----------------------------------" << std::endl;
+
+  for (int i = 0; i < 4; ++i) {
+    pool4.deallocate(d[i]);
+    std::cout << "Pool4 has " << pool4.getActualSize() << " bytes of memory. "
+              << pool4.getCurrentSize() << " bytes are used. " << quick_pool4->getBlocksInPool()
+              << " blocks are in the pool. " << quick_pool4->getReleasableSize() << " bytes are releasable. "
+              << std::endl;
+  }
+  std::cout << "----------------------------------" << std::endl;
+
+  return 0;
+}
diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt
new file mode 100644
index 0000000..847e0c5
--- /dev/null
+++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt
@@ -0,0 +1,11 @@
+###############################################################################
+# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+blt_add_executable(
+  NAME 03-Umpire-Coalescing-Heuristics
+  SOURCES 03-Umpire-Coalescing-Heuristics.cpp
+  DEPENDS_ON cuda umpire)
diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md
new file mode 100644
index 0000000..a467bed
--- /dev/null
+++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md
@@ -0,0 +1,26 @@
+=========================================================
+Advanced Tutorial - Using Coalescing Heuristics in Umpire
+=========================================================
+
+Look for the `TODO` comments in the source code. Here you will have to create different
+Umpire pools with their corresponding coalescing heuristic functions.
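+
+For example, a pool that coalesces once all of its bytes are releasable might be
+created like this (a sketch; the pool name and size parameters are illustrative):
+
+```cpp
+auto& rm = umpire::ResourceManager::getInstance();
+auto heuristic = umpire::strategy::QuickPool::percent_releasable(100);
+auto pool = rm.makeAllocator<umpire::strategy::QuickPool>(
+    "example_pool", rm.getAllocator("UM"), 1024ul, 1024ul, 16, heuristic);
+```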
+
+A complete description of the different coalescing heuristic functions is available
+in the online Umpire documentation: https://umpire.readthedocs.io/
+
+Once you are ready, uncomment the COMPILE define at the top of the file and run
+
+```
+$ make 03-Umpire-Coalescing-Heuristics
+$ ./bin/03-Umpire-Coalescing-Heuristics
+```
diff --git a/Advanced_Tutorial/CMakeLists.txt b/Advanced_Tutorial/CMakeLists.txt
new file mode 100644
index 0000000..6154a39
--- /dev/null
+++ b/Advanced_Tutorial/CMakeLists.txt
@@ -0,0 +1,11 @@
+###############################################################################
+# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+add_subdirectory(00-Kernel-Fusion)
+add_subdirectory(01-GPU-Threads)
+add_subdirectory(02-RAJA-Resources)
+add_subdirectory(03-Umpire-Coalescing-Heuristics)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49d50ed..8d0cb19 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,3 +25,4 @@ endif()
 add_subdirectory(tpl)
 add_subdirectory(Intro_Tutorial)
 add_subdirectory(Intermediate_Tutorial)
+add_subdirectory(Advanced_Tutorial)
diff --git a/tpl/CMakeLists.txt b/tpl/CMakeLists.txt
index 19efb48..fc8fed4 100644
--- a/tpl/CMakeLists.txt
+++ b/tpl/CMakeLists.txt
@@ -4,6 +4,7 @@
 # set (RAJA_ENABLE_EXERCISES Off CACHE BOOL "")
 # set (RAJA_ENABLE_DOCUMENTATION Off CACHE BOOL "")
 # set (RAJA_ENABLE_BENCHMARKS Off CACHE BOOL "")
+ set (RAJA_ENABLE_NV_TOOLS_EXT ON CACHE BOOL "")
 #
 # set (UMPIRE_ENABLE_TESTS Off CACHE BOOL "")
 # set (UMPIRE_ENABLE_EXAMPLES Off CACHE BOOL "")