[Mosaic GPU] Revert deletion of the legacy FFI API.

bchetioui · Google-ML-Automation · commit d5f91dc32932 · 2025-11-30T03:40:54.000-08:00
Reverts 9138c20.

PiperOrigin-RevId: 838381988
diff --git a/jax/_src/internal_test_util/export_back_compat_test_data/pallas/mosaic_gpu_add_one.py b/jax/_src/internal_test_util/export_back_compat_test_data/pallas/mosaic_gpu_add_one.py
diff --git a/jaxlib/mosaic/gpu/BUILD b/jaxlib/mosaic/gpu/BUILD
@@ -330,6 +330,8 @@ cc_library(
         "@xla//xla/backends/gpu:ffi",
         "@xla//xla/ffi",
         "@xla//xla/ffi:ffi_api",
+        "@xla//xla/service:custom_call_status",
+        "@xla//xla/service:custom_call_target_registry",
         "@xla//xla/service/gpu/llvm_gpu_backend:nvptx_libdevice_path",
         "@xla//xla/service/llvm_ir:llvm_command_line_options",
         "@xla//xla/stream_executor/cuda:assemble_compilation_provider",
diff --git a/jaxlib/mosaic/gpu/custom_call.cc b/jaxlib/mosaic/gpu/custom_call.cc
@@ -115,6 +115,8 @@ limitations under the License.
 #include "xla/executable_run_options.h"
 #include "xla/ffi/ffi.h"
 #include "xla/ffi/ffi_api.h"
+#include "xla/service/custom_call_status.h"
+#include "xla/service/custom_call_target_registry.h"
 #include "xla/service/gpu/llvm_gpu_backend/nvptx_libdevice_path.h"
 #include "xla/service/llvm_ir/llvm_command_line_options.h"
 #include "xla/stream_executor/cuda/assemble_compilation_provider.h"
@@ -639,6 +641,40 @@ absl::StatusOr<CompiledKernel*> CachedCompileAndInit(CacheKey key,
   return &cache.kernels.at(key);
 }
 
+void MosaicGPUCustomCall(void* stream, void** buffers, char* opaque,
+                         size_t opaque_len, XlaCustomCallStatus* status) {
+  // Forward-compatible version using the legacy FFI API: compiles (or fetches from cache) the kernel described by opaque and launches it.
+  if (reinterpret_cast<uintptr_t>(opaque) % alignof(KernelHash)) {  // opaque is read as a KernelHash below.
+    fprintf(stderr, "Misaligned opaque pointer\n");
+    abort();
+  }
+  auto hash = *reinterpret_cast<KernelHash*>(opaque);  // NOTE(review): opaque_len is never read, so this read is not bounds-checked.
+  CUcontext ctx;
+  if (cuCtxGetCurrent(&ctx) != CUDA_SUCCESS) {
+    fprintf(stderr, "Failed to get current CUDA context\n");
+    abort();
+  }
+  CacheKey key(hash, reinterpret_cast<uintptr_t>(ctx));  // Cache key: kernel hash plus the current CUDA context.
+  auto compiled_kernel = CachedCompileAndInit(key, opaque + sizeof(KernelHash));
+  if (!compiled_kernel.ok()) {  // Unlike the checks above, this failure is reported through status rather than abort().
+    XlaCustomCallStatusSetFailure(status,
+                                  compiled_kernel.status().message().data(),
+                                  compiled_kernel.status().message().size());
+    return;
+  }
+  auto ctx_kernel_comm = (*compiled_kernel)->GetHostLaunch();
+  bool is_comm_used = std::get<2>(ctx_kernel_comm);
+  void* args[4] = {&std::get<0>(ctx_kernel_comm), &stream, &buffers};  // Only three slots are set; the fourth is null.
+  if (is_comm_used) {  // Issue an NVSHMEM barrier on this stream before the launch.
+    mosaic::gpu::NvshmemApi::Default().barrier_all_on_stream(
+        reinterpret_cast<cudaStream_t>(stream));
+  }
+  std::get<1>(ctx_kernel_comm)(args);  // Call tuple element 1 of GetHostLaunch() with the packed argument array.
+}
+
+XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM("mosaic_gpu", &MosaicGPUCustomCall,
+                                         "CUDA");  // Registers the legacy handler as the "mosaic_gpu" target for "CUDA".
+
 absl::Status MosaicGpuExecute(gpuStream_t stream, ffi::RemainingArgs inputs,
                               ffi::RemainingRets results,
                               std::string_view kernel_hash,
diff --git a/tests/pallas/export_back_compat_pallas_test.py b/tests/pallas/export_back_compat_pallas_test.py
@@ -81,8 +81,8 @@ def test_mosaic_gpu_add_one(self):
     def add_one(x_ref, o_ref):
       o_ref[...] = x_ref[...] + 1
 
-    data = self.load_testdata(mosaic_gpu_add_one.data_2025_11_27)
-    self.run_one_test(add_one, data)
+    data = self.load_testdata(mosaic_gpu_add_one.data_2025_04_22)
+    self.run_one_test(add_one, data, expect_current_custom_calls=["mosaic_gpu_v2"])
 
   def test_mosaic_gpu_kernel_add_one(self):
     if not jtu.is_cuda_compute_capability_at_least("9.0"):