llvm
diff --git a/.github/workflows/build_mlir_python_bindings_wheel.yml b/.github/workflows/build_mlir_python_bindings_wheel.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/build_test_release_eudsl.yml b/.github/workflows/build_test_release_eudsl.yml
Lines changed: 3 additions & 1 deletion
diff --git a/.github/workflows/build_test_release_eudsl_python_extras.yml b/.github/workflows/build_test_release_eudsl_python_extras.yml
Lines changed: 23 additions & 0 deletions
diff --git a/projects/eudsl-python-extras/examples/cuda_e2e.ipynb b/projects/eudsl-python-extras/examples/cuda_e2e.ipynb
Lines changed: 4 additions & 4 deletions
diff --git a/projects/eudsl-python-extras/examples/cuda_matmul_opt.py b/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
Lines changed: 40 additions & 40 deletions
diff --git a/projects/eudsl-python-extras/examples/flash_attention.py b/projects/eudsl-python-extras/examples/flash_attention.py
Lines changed: 2 additions & 2 deletions
@@ -436,7 +436,7 @@ jobs:
           name: build_artifact_python_bindings-ubuntu-wasm
 
       - name: Release current commit
-        if: (github.event_name == 'push' && github.ref_name == 'main') || github.event_name == 'workflow_dispatch'
+        if: (github.event_name == 'push' && github.ref_name == 'main') || (github.event_name == 'workflow_dispatch' && inputs.release)
         uses: ncipollo/release-action@v1.12.0
         with:
           artifacts: "wheelhouse/mlir_python_bindings*.whl"
 
@@ -440,8 +440,10 @@ jobs:
 
   release-eudsl:
 
-    if: (github.event_name == 'push' && github.ref_name == 'main') || github.event_name == 'workflow_dispatch'
+    if: (github.event_name == 'push' && github.ref_name == 'main') || (github.event_name == 'workflow_dispatch' && inputs.release)
+
     needs: [build-eudsl]
+
     runs-on: "ubuntu-22.04"
 
     permissions:
 
@@ -207,7 +207,30 @@ jobs:
           if [[ $(python -c "print(__import__('sys').version_info >= (3, 13))") == "True" ]]; then
             python projects/eudsl-python-extras/examples/cuda_matmul_opt.py
           fi
+
+      - name: Test jupyter notebooks
+        # Skipped on Windows because the sed patch below fails there with: sed: can't read C:\hostedtoolcache\windows\Python\3.12.10\x64/jupyter_client/runapp.py: No such file or directory
+        if: matrix.os != 'windows'
+        shell: bash
+        env:
+          BRANCH: ${{ github.head_ref || github.ref_name }}
+        run: |
+          
+          pip install -q jupyter
+          
+          sed -i.bak 's/OUTPUT_TIMEOUT = 10/OUTPUT_TIMEOUT = 1000/g' \
+            $(python -c 'import site; print(site.getsitepackages()[0])')/jupyter_client/runapp.py
+          
+          jupyter execute projects/eudsl-python-extras/examples/mlir_python_extras.ipynb --output=mlir_python_extras_output
+          cat projects/eudsl-python-extras/examples/mlir_python_extras_output.ipynb | jq '.cells[].outputs | select(length > 0) | .[0] | .text'
+          jupyter execute projects/eudsl-python-extras/examples/vectorization_e2e.ipynb --output=vectorization_e2e_output
+          cat projects/eudsl-python-extras/examples/vectorization_e2e_output.ipynb | jq '.cells[].outputs | select(length > 0) | .[0] | .text'
           
+          # TODO(max): build wheels with nv targets
+          # if [ ${{ matrix.os }} == 'ubuntu' ]; then
+          #   jupyter execute projects/eudsl-python-extras/examples/cuda_e2e.ipynb --output=cuda_e2e_output
+          #   cat projects/eudsl-python-extras/examples/cuda_e2e_output.ipynb | jq '.cells[].outputs | select(length > 0) | .[0] | .text'
+          # fi
 
   release-eudsl-python-extras:
 
 
@@ -103,10 +103,10 @@
     "from mlir import _mlir_libs\n",
     "from mlir.extras.ast.canonicalize import canonicalize\n",
     "from mlir.extras.context import RAIIMLIRContext, ExplicitlyManagedModule\n",
-    "from mlir.extras.dialects.ext import arith, memref, scf, gpu\n",
-    "from mlir.extras.dialects.ext import linalg\n",
-    "from mlir.extras.dialects.ext import transform\n",
-    "from mlir.extras.dialects.ext.func import func\n",
+    "from mlir.extras.dialects import arith, memref, scf, gpu\n",
+    "from mlir.extras.dialects import linalg\n",
+    "from mlir.extras.dialects import transform\n",
+    "from mlir.extras.dialects.func import func\n",
     "from mlir.extras.runtime.passes import Pipeline, run_pipeline\n",
     "from mlir.extras.runtime.refbackend import LLVMJITBackend\n",
     "from mlir.extras.util import find_ops\n",
 
@@ -11,17 +11,17 @@
     mlir_mod_ctx,
     MLIRContext,
 )
-from mlir.extras.dialects.ext import arith, memref, gpu, scf, linalg, vector, nvgpu
-from mlir.extras.dialects.ext.gpu import (
+from mlir.extras.dialects import arith, memref, gpu, scf, linalg, vector, nvgpu
+from mlir.extras.dialects.gpu import (
     block_idx,
     thread_idx,
     block_dim,
     get_compile_object_bytes,
     smem_space,
 )
-from mlir.extras.dialects.ext.llvm import llvm_ptr_t
-from mlir.extras.dialects.ext.memref import S
-from mlir.extras.dialects.ext.scf import range_
+from mlir.extras.dialects.llvm import llvm_ptr_t
+from mlir.extras.dialects.memref import S
+from mlir.extras.dialects.scf import range_
 from mlir.extras.runtime.passes import Pipeline, run_pipeline
 
 # noinspection PyUnresolvedReferences
@@ -139,9 +139,9 @@ def sgemm_naive[
     K,
     N,
     dtype,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     one = arith.constant(1.0, type=dtype)
     tmp = arith.constant(0, type=dtype)
@@ -167,9 +167,9 @@ def sgemm_naive_row_order[
     K,
     N,
     dtype,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     one = arith.constant(1.0, type=dtype)
     tmp = arith.constant(0, type=dtype)
@@ -193,10 +193,10 @@ def sgemm_coalesce[
     K,
     N,
     dtype,
-    BLOCK_SIZE: 32,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    BLOCK_SIZE = 32,
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
 
     tid = gpu.thread_id()
@@ -259,10 +259,10 @@ def sgemm_coalesce_transpose_B[
     K,
     N,
     dtype,
-    BLOCK_SIZE: 32,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    BLOCK_SIZE = 32,
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
 
     tid = gpu.thread_id()
@@ -288,10 +288,10 @@ def sgemm_shared_mem_block[
     K,
     N,
     dtype,
-    BLOCK_SIZE: 32,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    BLOCK_SIZE = 32,
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     # allocate buffer for current block in fast shared mem
     # shared mem is shared between all threads in a block
@@ -394,9 +394,9 @@ def sgemm_shared_mem_1d_block_tiling[
     BN,
     BK,
     TM,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     base = gpu.dynamic_shared_memory()
     A_shared = memref.view(base, (BM, BK), dtype=dtype)
@@ -455,9 +455,9 @@ def sgemm_shared_mem_2d_block_tiling[
     BK,
     TM,
     TN,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     base = gpu.dynamic_shared_memory()
     A_shared = memref.view(base, (BM, BK), dtype=dtype)
@@ -542,9 +542,9 @@ def sgemm_shared_mem_2d_block_tiling_vectorize[
     BK,
     TM,
     TN,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     VECTOR_WIDTH = 4
     DTYPE_WIDTH = dtype.width // 8
@@ -656,9 +656,9 @@ def sgemm_warp_tiling[
     TM,
     TN,
     NUM_THREADS,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     VECTOR_WIDTH = 4
     DTYPE_WIDTH = dtype.width // 8
@@ -820,11 +820,11 @@ def sgemm_tensor_core[
     M,
     K,
     N,
-    A_t: T.memref(M, K, T.f16()),
-    B_t: T.memref(K, N, T.f16()),
-    C_t: T.memref(M, N, T.f32()),
-    a_tma_t: llvm_ptr_t(),
-    b_tma_t: llvm_ptr_t(),
+    A_t = T.memref(M, K, T.f16()),
+    B_t = T.memref(K, N, T.f16()),
+    C_t = T.memref(M, N, T.f32()),
+    a_tma_t = llvm_ptr_t(),
+    b_tma_t = llvm_ptr_t(),
 ](A: A_t, B: B_t, C: C_t, a_tma: a_tma_t, b_tma: b_tma_t):
     a_tma = builtin.unrealized_conversion_cast(
         [
 
@@ -6,11 +6,11 @@
 
 from mlir.extras.ast.canonicalize import canonicalize
 from mlir.extras.context import RAIIMLIRContextModule
-from mlir.extras.dialects.ext import memref, scf, arith, gpu, llvm
+from mlir.extras.dialects import memref, scf, arith, gpu, llvm
 from mlir.dialects import math
 
 # noinspection PyUnresolvedReferences
-from mlir.extras.dialects.ext.gpu import (
+from mlir.extras.dialects.gpu import (
     block_idx,
     thread_idx,
     grid_dim,