From b58bc24940818c8fc1442c5f3c6cb2f871b5b7dc Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 8 Jul 2024 16:47:18 -0700 Subject: [PATCH 01/10] advanced tutorial --- .../00-Kernel-Fusion/00-Kernel-Fusion.cpp | 10 +++ .../00-Kernel-Fusion/CMakeLists.txt | 11 +++ .../01-GPU-Threads/01-GPU-Threads.cpp | 79 +++++++++++++++++++ .../01-GPU-Threads/CMakeLists.txt | 11 +++ Advanced_Tutorial/CMakeLists.txt | 10 +++ CMakeLists.txt | 1 + 6 files changed, 122 insertions(+) create mode 100644 Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp create mode 100644 Advanced_Tutorial/00-Kernel-Fusion/CMakeLists.txt create mode 100644 Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp create mode 100644 Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt create mode 100644 Advanced_Tutorial/CMakeLists.txt diff --git a/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp b/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp new file mode 100644 index 0000000..f3a0a2d --- /dev/null +++ b/Advanced_Tutorial/00-Kernel-Fusion/00-Kernel-Fusion.cpp @@ -0,0 +1,10 @@ +#include + +int main(int argc, char *argv[]) +{ + + std::cout<<"TO DO"< + +int main(int argc, char *argv[]) +{ + + + auto& rm = umpire::ResourceManager::getInstance(); + unsigned char *cnt{nullptr}; + auto allocator = rm.getAllocator("UM"); + auto pool = rm.makeAllocator("qpool", allocator); + cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); + + using device_launch = RAJA::cuda_launch_t; + using launch_policy = RAJA::LaunchPolicy; + + //Example 1. 
Global Indexing: + //GPU programming models such as CUDA and HIP follow a thread/block(team) programming model + //in which a predefined compute grid + { + const int N_x = 10000; + const int N_y = 20000; + const int block_sz = 256; + const int n_blocks_x = (N_x + block_sz) / block_sz + 1; + const int n_blocks_y = (N_y + block_sz) / block_sz + 1; + + using loop_pol_x = RAJA::LoopPolicy; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + + //populate + + + }); + }); + + + }); + + } + + + + + + + + + + //Iteration Space: + { + const int n_blocks = 50000; + const int block_sz = 64; + + RAJA::launch + ( RAJA::LaunchParams(RAJA::Teams(n_blocks), + RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + + RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) { + + }); + + }); + } + + + + + + + + return 0; +} diff --git a/Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt b/Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt new file mode 100644 index 0000000..7179d6a --- /dev/null +++ b/Advanced_Tutorial/01-GPU-Threads/CMakeLists.txt @@ -0,0 +1,11 @@ +############################################################################### +# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_executable( + NAME 01-GPU-Threads + SOURCES 01-GPU-Threads.cpp + DEPENDS_ON cuda umpire RAJA) diff --git a/Advanced_Tutorial/CMakeLists.txt b/Advanced_Tutorial/CMakeLists.txt new file mode 100644 index 0000000..9bfb736 --- /dev/null +++ b/Advanced_Tutorial/CMakeLists.txt @@ -0,0 +1,10 @@ +############################################################################### +# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +add_subdirectory(00-Kernel-Fusion) + +add_subdirectory(01-GPU-Threads) diff --git a/CMakeLists.txt b/CMakeLists.txt index 49d50ed..8d0cb19 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,3 +25,4 @@ endif() add_subdirectory(tpl) add_subdirectory(Intro_Tutorial) add_subdirectory(Intermediate_Tutorial) +add_subdirectory(Advanced_Tutorial) From 8226b8e9c87881a41aba36abbf2964aefda56ebe Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 10 Jul 2024 09:36:17 -0700 Subject: [PATCH 02/10] overview of gpu threads --- .../01-GPU-Threads/01-GPU-Threads.cpp | 174 ++++++++++++++---- 1 file changed, 140 insertions(+), 34 deletions(-) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index 5790add..06a53b8 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -1,21 +1,68 @@ +#include "RAJA/RAJA.hpp" + #include -int main(int argc, char *argv[]) +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) { - - auto& rm = umpire::ResourceManager::getInstance(); - unsigned char *cnt{nullptr}; - auto allocator = rm.getAllocator("UM"); - auto pool = 
rm.makeAllocator("qpool", allocator); - cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); - - using device_launch = RAJA::cuda_launch_t; - using launch_policy = RAJA::LaunchPolicy; - - //Example 1. Global Indexing: - //GPU programming models such as CUDA and HIP follow a thread/block(team) programming model - //in which a predefined compute grid + // GPU programming models such as CUDA and HIP peform computation + // on a predefined grid composed of threads and blocks. + // RAJA provides policies enabling users to utilize different + // thread enumeration strategies. The two main strategies + // are Block/Thread local enumeration and Global enumeration. + // + // + // Under a block and thread enumeration, + // threads have block local coordinate enumeration. + // While blocks are enumerated with respect + // to their location within the compute grid. + // The illustration below shows 2 x 2 compute grid + // wherein each block has 3 x 2 threads. Current + // programing models support up to three-dimensional + // block and thread configurations. + // + // + // Block (0,0) Block (1,0) + // [0,0][0,1][0,2] [0,0][0,1][0,2] + // [1,0][1,1][1,2] [1,0][1,1][1,2] + // + // Block (0,1) Block (1,1) + // [0,0][0,1][0,2] [0,0][0,1][0,2] + // [1,0][1,1][1,2] [1,0][1,1][1,2] + // + // Under the global enumeration each thread + // is a assigned a unique thread id based on + // on the dimension (2D illustrated here). + // The utility here comes when the iteration + // space is amendable to tiles in which blocks + // can be assigned to a tile and threads are + // assigned to work within a tile. + // + // [0,0][0,1][0,2] [0,3][0,4][0,5] + // [1,0][1,1][1,2] [1,3][1,4][1,5] + // + // [2,0][2,1][2,2] [2,3][2,4][2,5] + // [3,0][3,1][3,2] [3,3][3,4][3,5] + // + + // Short note on RAJA nomenclature: + // As RAJA serves as an abstraction layer + // the RAJA::launch API uses the terms + // teams and threads. 
Teams are analogous + // to blocks in CUDA/HIP nomenclature + // and workgroups in the SYCL programming model. + // Threads are analogous to threads within CUDA/HIP + // and work-items within the SYCL programming model. + +#if defined(RAJA_ENABLE_CUDA) + + // The examples below showcase commonly used GPU policies. + // For the HIP and SYCL programming models, we offer analogous policies. + + using launch_policy = RAJA::LaunchPolicy>; + + // Example 1. Global Indexing + // Main use case: Perfectly nested loops with large iteration spaces. { const int N_x = 10000; const int N_y = 20000; @@ -23,57 +70,116 @@ int main(int argc, char *argv[]) const int n_blocks_x = (N_x + block_sz) / block_sz + 1; const int n_blocks_y = (N_y + block_sz) / block_sz + 1; - using loop_pol_x = RAJA::LoopPolicy; + using global_pol_y = RAJA::LoopPolicy; + using global_pol_x = RAJA::LoopPolicy; - RAJA::launch + RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + + //Do something - //populate - - }); }); - }); } + // Example 2. 
Block and thread direct polcies + // Ideal for when iteration space can broken up into tiles + // Teams can be assigned to a tile and threads can perform + // computations within the tile + + // The example below employs the direct version for block + // and thread policies, the underlying assumption is that + // the loops are within the range of the grid and block sizes + // In CUDA the direct loops are expressed as: + // + // const int i = threadIdx.x; + // if(i < N) { //kernel } + // + { + const int n_blocks = 50000; + const int block_sz = 64; + using outer_pol = RAJA::LoopPolicy; + using inner_pol = RAJA::LoopPolicy; + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { - //Iteration Space: - { - const int n_blocks = 50000; - const int block_sz = 64; + RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { - RAJA::launch - ( RAJA::LaunchParams(RAJA::Teams(n_blocks), - RAJA::Threads(block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + //Do something + }); + }); - RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) { - }); + }); - }); } - + // Example 3. Block and thread loop polcies + // Similar to the example above but using a thread loop + // policy. The utility of the thread loop policy rises when + // we consider multiple thread loops with varying iteration sizes. + + // If a RAJA loop iteration space is beyond the configured number + // of threads in a team. The thread loop policies will perform a team + // stride loop to span the whole range. 
+ // In CUDA the block stride loop is expressed as + // + // for(int i=threadIdx.x; i; + using inner_pol = RAJA::LoopPolicy; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { + + //Iteration space is same as number of blocks per thread + //We could also use direct policy here + RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { + //Do something here + }); //inner loop + + + //Iteration space is *more* than number of blocks per thread + RAJA::loop(ctx, RAJA::RangeSegment(0, 2*block_sz), [&] (int tx) { + //Do something here + }); //inner loop + + }); //outer loop + + }); + + } +#else + std::cout<<"Please compile with CUDA"< Date: Wed, 10 Jul 2024 09:37:23 -0700 Subject: [PATCH 03/10] move intro to before main --- Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index 06a53b8..a4aceaf 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -1,10 +1,5 @@ #include "RAJA/RAJA.hpp" -#include - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) -{ - // GPU programming models such as CUDA and HIP peform computation // on a predefined grid composed of threads and blocks. // RAJA provides policies enabling users to utilize different @@ -54,6 +49,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // Threads are analogous to threads within CUDA/HIP // and work-items within the SYCL programming model. +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) +{ + #if defined(RAJA_ENABLE_CUDA) // The examples below showcase commonly used GPU policies. 
From 93ce2fbd22c19d28401aebd657e0747fa61177e5 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 10 Jul 2024 10:59:10 -0700 Subject: [PATCH 04/10] add resource example --- .../02-CAMP-resource/02-CAMP-resource.cpp | 108 ++++++++++++++++++ .../02-CAMP-resource/CMakeLists.txt | 11 ++ Advanced_Tutorial/CMakeLists.txt | 2 +- 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp create mode 100644 Advanced_Tutorial/02-CAMP-resource/CMakeLists.txt diff --git a/Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp b/Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp new file mode 100644 index 0000000..35e21e3 --- /dev/null +++ b/Advanced_Tutorial/02-CAMP-resource/02-CAMP-resource.cpp @@ -0,0 +1,108 @@ +#include "RAJA/RAJA.hpp" + +// In a serial GPU programing model kernels are executed +// in a sequential order based on the order in which the +// the kernels are being launched. +// +// GPU programming models such as CUDA/HIP and SYCL +// have the capability of performing device operations +// concurrently. The RAJA portability suite exposes +// concurrent kernel execution through the use of the +// CAMP::resources. +// +// A CAMP::resources corresponds to device stream in +// which we may guarantee that device operations will +// be executed in sequential order. Different streams, +// however; may operate concurrently. +// + +// +// RAJA::resources by default is configured to not be +// the device's default stream. Historicaly the default +// stream serves as a synchronizing stream. No other +// operations can begin until all issued operations on +// the default stream are completed. + +// In modern versions of CUDA the behavior of the default +// stream can be changed to be non-synchronizing. 
+ + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) +{ + +#if defined(RAJA_ENABLE_CUDA) + + + constexpr int N = 10; + constexpr int M = 1000000; + + //Master resource to orchestrate between memory transfers + RAJA::resources::Cuda def_cuda_res{RAJA::resources::Cuda::get_default()}; + RAJA::resources::Host def_host_res{RAJA::resources::Host::get_default()}; + int* d_array = def_cuda_res.allocate(N*M); + int* h_array = def_host_res.allocate(N*M); + + RAJA::RangeSegment one_range(0, 1); + RAJA::RangeSegment m_range(0, M); + RAJA::RangeSegment n_range(0, N); + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_pol_x = RAJA::LoopPolicy; + + using inner_pol_x = RAJA::LoopPolicy; + + + for(int i=0; i(res_cuda, + RAJA::LaunchParams(RAJA::Teams(64), + RAJA::Threads(1)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, m_range, [&] (int j) { + RAJA::loop(ctx, one_range, [&] (int k) { + + d_array[i*M + j] = i * M + j; + + }); + }); + + }); + + //perform synchronization between different streams + def_cuda_res.wait_for(&e); + } + + //Master resource to perform the memory copy + //All other streams have been synchronized with respect to def_cuda_res + def_cuda_res.memcpy(h_array, d_array, sizeof(int) * N * M); + + int ec_count = 0; + RAJA::forall( RAJA::RangeSegment(0, N*M), + [=, &ec_count](int i){ + if (h_array[i] != i) ec_count++; + } + ); + + std::cout << " Result -- "; + if (ec_count > 0) + std::cout << "FAIL : error count = " << ec_count << "\n"; + else + std::cout << "PASS!\n"; + + +#else + std::cout<<"Please compile with CUDA"< Date: Mon, 29 Jul 2024 00:05:42 -0700 Subject: [PATCH 05/10] rename files --- .../01-GPU-Threads/01-GPU-Threads.cpp | 65 +++++++++---------- .../02-raja-resource.cpp} | 4 +- .../CMakeLists.txt | 4 +- Advanced_Tutorial/CMakeLists.txt | 2 +- 4 files changed, 37 insertions(+), 38 deletions(-) rename Advanced_Tutorial/{02-CAMP-resource/02-CAMP-resource.cpp => 02-RAJA-Resource/02-raja-resource.cpp} 
(97%) rename Advanced_Tutorial/{02-CAMP-resource => 02-RAJA-Resource}/CMakeLists.txt (87%) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index a4aceaf..d168a34 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -57,38 +57,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // The examples below showcase commonly used GPU policies. // For the HIP and SYCL programming models, we offer analogous policies. - using launch_policy = RAJA::LaunchPolicy>; + contexpr bool async = false; //asynchronous kernel execution - // Example 1. Global Indexing - // Main use case: Perfectly nested loops with large iteration spaces. - { - const int N_x = 10000; - const int N_y = 20000; - const int block_sz = 256; - const int n_blocks_x = (N_x + block_sz) / block_sz + 1; - const int n_blocks_y = (N_y + block_sz) / block_sz + 1; - - using global_pol_y = RAJA::LoopPolicy; - using global_pol_x = RAJA::LoopPolicy; - - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + using launch_policy = RAJA::LaunchPolicy>; - RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { - - //Do something - - }); - }); - - }); - - } - - - // Example 2. Block and thread direct polcies + // Example 1a. Block and thread direct polcies // Ideal for when iteration space can broken up into tiles // Teams can be assigned to a tile and threads can perform // computations within the tile @@ -102,7 +75,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // const int i = threadIdx.x; // if(i < N) { //kernel } // - { const int n_blocks = 50000; const int block_sz = 64; @@ -129,8 +101,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) } - - // Example 3. 
Block and thread loop polcies + // Example 1b. Block and thread loop polcies // Similar to the example above but using a thread loop // policy. The utility of the thread loop policy rises when // we consider multiple thread loops with varying iteration sizes. @@ -174,6 +145,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) } + // Example 1c. Global Indexing + // Main use case: Perfectly nested loops with large iteration spaces. + { + const int N_x = 10000; + const int N_y = 20000; + const int block_sz = 256; + const int n_blocks_x = (N_x + block_sz) / block_sz + 1; + const int n_blocks_y = (N_y + block_sz) / block_sz + 1; + + using global_pol_y = RAJA::LoopPolicy; + using global_pol_x = RAJA::LoopPolicy; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + + //Do something + + }); + }); + + }); + + } + #else std::cout<<"Please compile with CUDA"< Date: Mon, 29 Jul 2024 00:06:41 -0700 Subject: [PATCH 06/10] clean up pass --- .../02-raja-resource.cpp | 0 .../{02-RAJA-Resource => 02-RAJA-Resources}/CMakeLists.txt | 0 Advanced_Tutorial/CMakeLists.txt | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) rename Advanced_Tutorial/{02-RAJA-Resource => 02-RAJA-Resources}/02-raja-resource.cpp (100%) rename Advanced_Tutorial/{02-RAJA-Resource => 02-RAJA-Resources}/CMakeLists.txt (100%) diff --git a/Advanced_Tutorial/02-RAJA-Resource/02-raja-resource.cpp b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp similarity index 100% rename from Advanced_Tutorial/02-RAJA-Resource/02-raja-resource.cpp rename to Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp diff --git a/Advanced_Tutorial/02-RAJA-Resource/CMakeLists.txt b/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt similarity index 100% rename from 
Advanced_Tutorial/02-RAJA-Resource/CMakeLists.txt rename to Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt diff --git a/Advanced_Tutorial/CMakeLists.txt b/Advanced_Tutorial/CMakeLists.txt index 1e2b292..6154a39 100644 --- a/Advanced_Tutorial/CMakeLists.txt +++ b/Advanced_Tutorial/CMakeLists.txt @@ -7,4 +7,4 @@ add_subdirectory(00-Kernel-Fusion) add_subdirectory(01-GPU-Threads) -add_subdirectory(02-RAJA-Resource) +add_subdirectory(02-RAJA-Resources) From 2762405aa3ec85e3d4e11ea364e5434c6a49f0f4 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 29 Jul 2024 00:15:13 -0700 Subject: [PATCH 07/10] rename files --- .../{02-raja-resource.cpp => 02-raja-resources.cpp} | 0 Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename Advanced_Tutorial/02-RAJA-Resources/{02-raja-resource.cpp => 02-raja-resources.cpp} (100%) diff --git a/Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp similarity index 100% rename from Advanced_Tutorial/02-RAJA-Resources/02-raja-resource.cpp rename to Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp diff --git a/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt b/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt index d1c9506..c26166c 100644 --- a/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt +++ b/Advanced_Tutorial/02-RAJA-Resources/CMakeLists.txt @@ -6,6 +6,6 @@ ############################################################################### blt_add_executable( - NAME 02-raja-resource - SOURCES 02-raja-resource.cpp + NAME 02-raja-resources + SOURCES 02-raja-resources.cpp DEPENDS_ON cuda umpire RAJA) From b5badcf88859d923f4ecde7ee883701dc13abb14 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 29 Jul 2024 00:22:49 -0700 Subject: [PATCH 08/10] clean up example --- Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git 
a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index d168a34..0289b6e 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -87,16 +87,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { - //Do something + //loop body }); }); - }); } @@ -130,13 +127,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) //Iteration space is same as number of blocks per thread //We could also use direct policy here RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { - //Do something here + //loop body }); //inner loop //Iteration space is *more* than number of blocks per thread RAJA::loop(ctx, RAJA::RangeSegment(0, 2*block_sz), [&] (int tx) { - //Do something here + //loop body }); //inner loop }); //outer loop @@ -164,7 +161,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { - //Do something + //loop body }); }); From cbd3548935ed7410a07976fe6842af4563f56f6a Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 29 Jul 2024 01:42:06 -0700 Subject: [PATCH 09/10] turn on nvtx and take a clean up pass --- Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp | 2 +- Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp | 7 +++---- tpl/CMakeLists.txt | 1 + 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index 0289b6e..5f8ebb2 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -57,7 +57,7 @@ int 
main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) // The examples below showcase commonly used GPU policies. // For the HIP and SYCL programming models, we offer analogous policies. - contexpr bool async = false; //asynchronous kernel execution + constexpr bool async = false; //asynchronous kernel execution using launch_policy = RAJA::LaunchPolicy>; diff --git a/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp index f69c777..2ee0708 100644 --- a/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp +++ b/Advanced_Tutorial/02-RAJA-Resources/02-raja-resources.cpp @@ -45,7 +45,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) RAJA::RangeSegment one_range(0, 1); RAJA::RangeSegment m_range(0, M); - RAJA::RangeSegment n_range(0, N); using launch_policy = RAJA::LaunchPolicy>; @@ -65,11 +64,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) RAJA::resources::Event e = RAJA::launch(res_cuda, RAJA::LaunchParams(RAJA::Teams(64), - RAJA::Threads(1)), + RAJA::Threads(1)), "RAJA resource example", [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, m_range, [&] (int j) { - RAJA::loop(ctx, one_range, [&] (int k) { + RAJA::loop(ctx, m_range, [&] (int j) { + RAJA::loop(ctx, one_range, [&] (int k) { d_array[i*M + j] = i * M + j; diff --git a/tpl/CMakeLists.txt b/tpl/CMakeLists.txt index 19efb48..fc8fed4 100644 --- a/tpl/CMakeLists.txt +++ b/tpl/CMakeLists.txt @@ -4,6 +4,7 @@ # set (RAJA_ENABLE_EXERCISES Off CACHE BOOL "") # set (RAJA_ENABLE_DOCUMENTATION Off CACHE BOOL "") # set (RAJA_ENABLE_BENCHMARKS Off CACHE BOOL "") + set (RAJA_ENABLE_NV_TOOLS_EXT ON CACHE BOOL "") # # set (UMPIRE_ENABLE_TESTS Off CACHE BOOL "") # set (UMPIRE_ENABLE_EXAMPLES Off CACHE BOOL "") From 667e5fceffa68ccee03c8521d3c7716930cf50ed Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 29 Jul 2024 09:28:08 -0700 Subject: [PATCH 10/10] adding Umpire pool coalescing 
heuristic lesson --- .../03-Umpire-Coalescing-Heuristics.cpp | 128 ++++++++++++++++++ .../CMakeLists.txt | 11 ++ .../03-Umpire-Coalescing-Heuristics/README.md | 16 +++ 3 files changed, 155 insertions(+) create mode 100644 Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp create mode 100644 Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt create mode 100644 Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp new file mode 100644 index 0000000..a6f4080 --- /dev/null +++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/03-Umpire-Coalescing-Heuristics.cpp @@ -0,0 +1,128 @@ +#include "umpire/Allocator.hpp" +#include "umpire/ResourceManager.hpp" +#include "umpire/strategy/QuickPool.hpp" +#include "umpire/util/Macros.hpp" +#include "umpire/util/wrap_allocator.hpp" + +int main(int, char**) +{ + //Create the instance of the Resource Manager and use it to create an allocator using the DEVICE memory resource. + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator("DEVICE"); + + /* + * Set up Percent Releasable and Blocks Releasable heuristics. The Percent Releasable heuristic + * will coalesce the pool when some percentage of bytes in the pool is releasable (i.e. free). + * The Blocks Releasable heuristic will coalesce the pool when a certain number of blocks in + * the pool is releasable (i.e. free). Each heuristic function takes a parameter that specifies + * either the percentage or the number of blocks, depending on which heuristic it is. + * Below, do the following: + * 1. Create a Percent Releasable heuristic function that will coalesce when the entire pool is releasable. + * 2. Create a Percent Releasable heuristic function that will coalesce when 75% of the pool is releasable. + * 3. 
Create a Blocks Releasable heuristic function that will coalesce when 3 blocks of the pool are releasable. + * 4. Create a Blocks Releasable heuristic function that will coalesce when 5 blocks of the pool are releasable. + */ + auto pr75_hwm_heuristic = umpire::strategy::QuickPool::percent_releasable_hwm(75); + auto pr100_heuristic = umpire::strategy::QuickPool::percent_releasable(100); + auto br3_heuristic = umpire::strategy::QuickPool::blocks_releasable(3); + auto br5_hwm_heuristic = umpire::strategy::QuickPool::blocks_releasable_hwm(5); + + //Note: if no heuristic function is set for a pool, the default heuristic function is Percent Releasable set to 100%. + //This should work decently well for many cases, but with particular allocation patterns, it may not be aggressive + //enough. + + /* + * Below, create a separate QuickPool for each heuristic function. The pools should have a parameter to set the + * size of the first block in the pool, a parameter to set the size of the next blocks in the pool, the alignment, and + * finally the heuristic function. + * + * By passing the specific heuristic function to the constructor of the pool, we are ensuring that every time the + * pool must be coalesced, it uses the exact heuristic function we set above. + */ + auto pool1 = rm.makeAllocator<umpire::strategy::QuickPool>("pool1", allocator, 1024ul, 1024ul, 16, pr75_hwm_heuristic); + auto pool2 = rm.makeAllocator<umpire::strategy::QuickPool>("pool2", allocator, 1024ul, 1024ul, 16, pr100_heuristic); + auto pool3 = rm.makeAllocator<umpire::strategy::QuickPool>("pool3", allocator, 1024ul, 1024ul, 16, br3_heuristic); + auto pool4 = rm.makeAllocator<umpire::strategy::QuickPool>("pool4", allocator, 1024ul, 1024ul, 16, br5_hwm_heuristic); + + //Note: below we are using the allocator's Unwrap utility to expose the QuickPool class underneath. We will use + //this to query pool stats below. It is not a requirement, but can be useful for debugging. 
+ auto quick_pool1 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool1); + auto quick_pool2 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool2); + auto quick_pool3 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool3); + auto quick_pool4 = umpire::util::unwrap_allocator<umpire::strategy::QuickPool>(pool4); + + //Allocate 4 arrays of void pointers + void *a[4], *b[4], *c[4], *d[4]; + + //Allocate 1024 bytes in each element of each array + for (int i = 0; i < 4; ++i) { + a[i] = pool1.allocate(1024); + b[i] = pool2.allocate(1024); + c[i] = pool3.allocate(1024); + d[i] = pool4.allocate(1024); + } + + //Only deallocate one element of the array so that one block is freed up. + pool1.deallocate(a[1]); + pool2.deallocate(b[1]); + pool3.deallocate(c[1]); + pool4.deallocate(d[1]); + + //Allocate larger amounts of bytes in its place. This will cause the pool to rearrange blocks under the hood. + a[1] = pool1.allocate(4096); + b[1] = pool2.allocate(4096); + c[1] = pool3.allocate(4096); + d[1] = pool4.allocate(4096); + + //Next, deallocate another element of the array that is different from above. + pool1.deallocate(a[2]); + pool2.deallocate(b[2]); + pool3.deallocate(c[2]); + pool4.deallocate(d[2]); + + //Allocate a smaller amount of bytes in its place. This will cause the pool to rearrange blocks under the hood. + a[2] = pool1.allocate(64); + b[2] = pool2.allocate(64); + c[2] = pool3.allocate(64); + d[2] = pool4.allocate(64); + + //As we deallocate from each pool, print out stats. Each pool should behave differently under the hood because + //of the different coalescing heuristic functions used. + for (int i = 0; i < 4; ++i) { + pool1.deallocate(a[i]); + std::cout << "Pool1 has " << pool1.getActualSize() << " bytes of memory. " + << pool1.getCurrentSize() << " bytes are used. " << quick_pool1->getBlocksInPool() + << " blocks are in the pool. " << quick_pool1->getReleasableSize() << " bytes are releaseable. 
" + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + for (int i = 0; i < 4; ++i) { + pool2.deallocate(b[i]); + std::cout << "Pool2 has " << pool2.getActualSize() << " bytes of memory. " + << pool2.getCurrentSize() << " bytes are used. " << quick_pool2->getBlocksInPool() + << " blocks are in the pool. " << quick_pool2->getReleasableSize() << " bytes are releaseable. " + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + for (int i = 0; i < 4; ++i) { + pool3.deallocate(c[i]); + std::cout << "Pool3 has " << pool3.getActualSize() << " bytes of memory. " + << pool3.getCurrentSize() << " bytes are used. " << quick_pool3->getBlocksInPool() + << " blocks are in the pool. " << quick_pool3->getReleasableSize() << " bytes are releaseable. " + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + for (int i = 0; i < 4; ++i) { + pool4.deallocate(d[i]); + std::cout << "Pool4 has " << pool4.getActualSize() << " bytes of memory. " + << pool4.getCurrentSize() << " bytes are used. " << quick_pool4->getBlocksInPool() + << " blocks are in the pool. " << quick_pool4->getReleasableSize() << " bytes are releaseable. " + << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + return 0; +} diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt new file mode 100644 index 0000000..847e0c5 --- /dev/null +++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/CMakeLists.txt @@ -0,0 +1,11 @@ +############################################################################### +# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_executable( + NAME 03-Umpire-Coalescing-Heuristics + SOURCES 03-Umpire-Coalescing-Heuristics.cpp + DEPENDS_ON cuda umpire) diff --git a/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md new file mode 100644 index 0000000..a467bed --- /dev/null +++ b/Advanced_Tutorial/03-Umpire-Coalescing-Heuristics/README.md @@ -0,0 +1,16 @@ +========================================================= +Advanced Tutorial - Using Coalescing Heuristics in Umpire +========================================================= + +Look for the `TODO` comments in the source code. Here you will have to create different +Umpire pools with their corresponding coalescing heuristic functions. + +A complete description of the different coalescing heuristic functions is available +in the online Umpire documentation: https://umpire.readthedocs.io/ + +Once you are ready, uncomment the COMPILE define on top of the file and do + +``` +$ make 03-Umpire-Coalescing-Heuristics +$ ./bin/03-Umpire-Coalescing-Heuristics +```