diff --git a/.github/workflows/docker.jetson.6.2.0.yml b/.github/workflows/docker.jetson.6.2.0.yml index 6ed5b1d899..317c70aa0e 100644 --- a/.github/workflows/docker.jetson.6.2.0.yml +++ b/.github/workflows/docker.jetson.6.2.0.yml @@ -12,9 +12,14 @@ on: type: boolean description: "Do you want to push image after build?" default: false + custom_tag: + type: string + description: "Custom tag to use for the image (overrides VERSION)" + default: "" env: VERSION: "0.0.0" # Default version, will be overwritten + BASE_IMAGE: "roboflow/roboflow-inference-server-jetson-6.2.0" jobs: docker: @@ -35,6 +40,15 @@ jobs: uses: actions/checkout@v4 - name: Read version from file run: echo "VERSION=$(DISABLE_VERSION_CHECK=true python ./inference/core/version.py)" >> $GITHUB_ENV + - name: Determine Image Tags + id: tags + uses: ./.github/actions/determine-tags + with: + custom_tag: ${{ github.event.inputs.custom_tag }} + version: ${{ env.VERSION }} + base_image: ${{ env.BASE_IMAGE }} + force_push: ${{ github.event.inputs.force_push }} + token: ${{ secrets.GITHUB_TOKEN }} - name: Set up Depot CLI uses: depot/setup-action@v1 - name: Build and Push @@ -42,6 +56,6 @@ jobs: with: push: ${{ github.event_name == 'release' || (github.event.inputs.force_push == 'true')}} project: grl7ffzxd7 - tags: roboflow/roboflow-inference-server-jetson-6.2.0:latest,roboflow/roboflow-inference-server-jetson-6.2.0:${{ env.VERSION}} + tags: ${{ steps.tags.outputs.image_tags }} platforms: linux/arm64 file: ./docker/dockerfiles/Dockerfile.onnx.jetson.6.2.0 diff --git a/docker/dockerfiles/Dockerfile.onnx.jetson.6.2.0 b/docker/dockerfiles/Dockerfile.onnx.jetson.6.2.0 index 11e6b8b63b..108461ab60 100644 --- a/docker/dockerfiles/Dockerfile.onnx.jetson.6.2.0 +++ b/docker/dockerfiles/Dockerfile.onnx.jetson.6.2.0 @@ -3,9 +3,9 @@ FROM nvcr.io/nvidia/l4t-jetpack:r36.4.0 AS builder ARG DEBIAN_FRONTEND=noninteractive -ARG CMAKE_VERSION=3.31.10 -ARG PYTORCH_VERSION=2.8.0 -ARG TORCHVISION_VERSION=0.23.0 +ARG CMAKE_VERSION=4.2.0 +ARG PYTORCH_VERSION=2.6.0 +ARG TORCHVISION_VERSION=0.21.0 ARG OPENCV_VERSION=4.10.0 ARG ONNXRUNTIME_VERSION=1.20.0 ENV LANG=en_US.UTF-8 @@ -94,6 +94,7 @@ RUN git clone --recursive --branch v${PYTORCH_VERSION} https://github.com/pytorc export PYTORCH_BUILD_VERSION=${PYTORCH_VERSION} PYTORCH_BUILD_NUMBER=1 && \ export CMAKE_BUILD_TYPE=Release BUILD_SHARED_LIBS=ON USE_PRIORITIZED_TEXT_FOR_LD=1 && \ export MAX_JOBS=12 && \ + export CMAKE_POLICY_VERSION_MINIMUM=3.5 && \ python3 setup.py bdist_wheel && \ python3 -m pip install dist/torch-*.whl @@ -113,6 +114,15 @@ ENV CUDA_HOME=/usr/local/cuda \ PATH=/usr/local/cuda/bin:$PATH \ LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +# Install Tensorrt +RUN apt remove -y 'libnvinfer*' 'libnvonnxparsers*' 'libnvparsers*' 'libnvinfer-plugin*' 'python3-libnvinfer*' 'tensorrt*' +WORKDIR /build/tensorrt-10.x +RUN wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/local_repo/nv-tensorrt-local-tegra-repo-ubuntu2204-10.7.0-cuda-12.6_1.0-1_arm64.deb && \ + dpkg -i nv-tensorrt-local-tegra-repo-ubuntu2204-10.7.0-cuda-12.6_1.0-1_arm64.deb && \ + cp /var/nv-tensorrt-local-tegra-repo-ubuntu2204-10.7.0-cuda-12.6/nv-tensorrt-local-tegra-C50F04B9-keyring.gpg /usr/share/keyrings/ && \ + apt-get update && \ + apt-get install -y tensorrt + # Build onnxruntime-gpu from source with TensorRT support WORKDIR /build/onnxruntime RUN git clone --recursive --branch v${ONNXRUNTIME_VERSION} https://github.com/microsoft/onnxruntime.git && \ @@ -178,6 +188,7 @@ RUN uv pip 
install --system --break-system-packages --index-strategy unsafe-best -r requirements.sdk.http.txt \ -r requirements.easyocr.txt \ -r requirements.jetson.txt \ + "pycuda>=2025.0.0,<2026.0.0" \ "setuptools<=75.5.0" \ packaging \ && rm -rf ~/.cache/uv @@ -205,7 +216,6 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python && \ RUN cd /usr/local/lib/python3.10/dist-packages && \ find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true && \ rm -rf debugpy* jupyterlab* jupyter_* notebook* ipython* ipykernel* || true && \ - rm -rf torch/bin torch/include || true && \ rm -rf onnx/backend/test onnx/test || true && \ rm -rf scipy/*/tests pandas/tests || true && \ rm -rf */examples */benchmarks */docs || true && \ @@ -285,6 +295,12 @@ RUN ldconfig # Copy Python packages COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages +COPY --from=builder /usr/lib/python3.10/dist-packages/tensorrt /usr/local/lib/python3.10/dist-packages/tensorrt +COPY --from=builder /usr/lib/python3.10/dist-packages/tensorrt-10.7.0.dist-info /usr/local/lib/python3.10/dist-packages/tensorrt-10.7.0.dist-info +COPY --from=builder /usr/lib/python3.10/dist-packages/tensorrt_dispatch /usr/local/lib/python3.10/dist-packages/tensorrt_dispatch +COPY --from=builder /usr/lib/python3.10/dist-packages/tensorrt_dispatch-10.7.0.dist-info /usr/local/lib/python3.10/dist-packages/tensorrt_dispatch-10.7.0.dist-info +COPY --from=builder /usr/lib/python3.10/dist-packages/tensorrt_lean /usr/local/lib/python3.10/dist-packages/tensorrt_lean +COPY --from=builder /usr/lib/python3.10/dist-packages/tensorrt_lean-10.7.0.dist-info /usr/local/lib/python3.10/dist-packages/tensorrt_lean-10.7.0.dist-info COPY --from=builder /usr/local/bin/inference /usr/local/bin/inference ENV PYTHONPATH=/usr/local/lib/python3.10/dist-packages:$PYTHONPATH @@ -295,6 +311,8 @@ COPY inference_cli inference_cli COPY inference_sdk inference_sdk COPY docker/config/gpu_http.py gpu_http.py +RUN python -m pip uninstall -y boto3 botocore && python -m pip install "boto3>=1.40.0,<=1.41.5" "botocore>=1.40.0,<=1.41.5" + # Environment variables ENV VERSION_CHECK_MODE=once \ CORE_MODEL_SAM2_ENABLED=True \ @@ -306,14 +324,14 @@ ENV VERSION_CHECK_MODE=once \ ORT_TENSORRT_ENGINE_CACHE_PATH=/tmp/ort_cache \ ORT_TENSORRT_MAX_WORKSPACE_SIZE=4294967296 \ ORT_TENSORRT_BUILDER_OPTIMIZATION_LEVEL=5 \ - ONNXRUNTIME_EXECUTION_PROVIDERS=[TensorrtExecutionProvider] \ - REQUIRED_ONNX_PROVIDERS=TensorrtExecutionProvider \ OPENBLAS_CORETYPE=ARMV8 \ LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 \ WORKFLOWS_STEP_EXECUTION_MODE=local \ WORKFLOWS_MAX_CONCURRENT_STEPS=4 \ API_LOGGING_ENABLED=True \ - DISABLE_WORKFLOW_ENDPOINTS=false + DISABLE_WORKFLOW_ENDPOINTS=false \ + ALLOW_INFERENCE_EXP_UNTRUSTED_MODELS=True \ + USE_INFERENCE_EXP_MODELS=False LABEL org.opencontainers.image.description="Inference Server - Jetson 6.2.0 (PyTorch from source, numpy 2.x)" diff --git a/docker/dockerfiles/jp51.cu114.inference-experimental.dockerfile b/docker/dockerfiles/jp51.cu114.inference-experimental.dockerfile new file mode 100644 index 0000000000..c44ec31aee --- /dev/null +++ b/docker/dockerfiles/jp51.cu114.inference-experimental.dockerfile @@ -0,0 +1,63 @@ +FROM roboflow/l4t-ml:r35.2.1-py3.12-cu118-trt-10-v0.0.1 + +COPY requirements/requirements.clip.txt \ + requirements/requirements.http.txt \ + requirements/requirements.doctr.txt \ + requirements/requirements.groundingdino.txt \ + requirements/requirements.sdk.http.txt \ + requirements/requirements.yolo_world.txt \ + 
requirements/_requirements.txt \ + requirements/requirements.easyocr.txt \ + requirements/requirements.gpu.txt \ + ./ + +RUN python -m pip install \ + -r _requirements.txt \ + -r requirements.clip.txt \ + -r requirements.http.txt \ + -r requirements.doctr.txt \ + -r requirements.groundingdino.txt \ + -r requirements.sdk.http.txt \ + -r requirements.yolo_world.txt \ + -r requirements.easyocr.txt \ + -r requirements.gpu.txt \ + "pycuda>=2025.0.0,<2026.0.0" + + +WORKDIR /app/ +COPY inference inference +COPY inference_cli inference_cli +COPY inference_sdk inference_sdk +COPY docker/config/gpu_http.py gpu_http.py +COPY .release .release +COPY requirements requirements +COPY Makefile Makefile + +RUN make create_inference_cli_whl PYTHON=python3.12 +RUN python -m pip install dist/inference_cli*.whl + +ENV VERSION_CHECK_MODE=continuous \ + PROJECT=roboflow-platform \ + ORT_TENSORRT_FP16_ENABLE=1 \ + ORT_TENSORRT_ENGINE_CACHE_ENABLE=1 \ + CORE_MODEL_SAM_ENABLED=False \ + PROJECT=roboflow-platform \ + NUM_WORKERS=1 \ + HOST=0.0.0.0 \ + PORT=9001 \ + OPENBLAS_CORETYPE=ARMV8 \ + LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 \ + WORKFLOWS_STEP_EXECUTION_MODE=local \ + WORKFLOWS_MAX_CONCURRENT_STEPS=2 \ + API_LOGGING_ENABLED=True \ + CORE_MODEL_TROCR_ENABLED=false \ + RUNS_ON_JETSON=True \ + ENABLE_PROMETHEUS=True \ + ENABLE_STREAM_API=True \ + STREAM_API_PRELOADED_PROCESSES=2 \ + PYTHONPATH=/app:$PYTHONPATH +ENV CORE_MODEL_SAM3_ENABLED=False \ + ALLOW_INFERENCE_EXP_UNTRUSTED_MODELS=True \ + USE_INFERENCE_EXP_MODELS=False + +ENTRYPOINT uvicorn gpu_http:app --workers $NUM_WORKERS --host $HOST --port $PORT \ No newline at end of file diff --git a/inference/core/env.py b/inference/core/env.py index d655c37899..c2dba109a4 100644 --- a/inference/core/env.py +++ b/inference/core/env.py @@ -208,6 +208,9 @@ # Enable experimental RFDETR backend (inference_exp) rollout, default is True USE_INFERENCE_EXP_MODELS = str2bool(os.getenv("USE_INFERENCE_EXP_MODELS", "False")) +ALLOW_INFERENCE_EXP_UNTRUSTED_MODELS = str2bool( + os.getenv("ALLOW_INFERENCE_EXP_UNTRUSTED_MODELS", "False") +) # ID of host device, default is None DEVICE_ID = os.getenv("DEVICE_ID", None) diff --git a/inference/core/models/exp_adapter.py b/inference/core/models/exp_adapter.py index 111e8120e6..71f707cbc9 100644 --- a/inference/core/models/exp_adapter.py +++ b/inference/core/models/exp_adapter.py @@ -15,7 +15,7 @@ ObjectDetectionInferenceResponse, ObjectDetectionPrediction, ) -from inference.core.env import API_KEY +from inference.core.env import ALLOW_INFERENCE_EXP_UNTRUSTED_MODELS, API_KEY from inference.core.logger import logger from inference.core.models.base import Model from inference.core.utils.image_utils import load_image_rgb @@ -37,7 +37,10 @@ def __init__(self, model_id: str, api_key: str = None, **kwargs): from inference_exp import AutoModel # type: ignore self._exp_model: ObjectDetectionModel = AutoModel.from_pretrained( - model_id_or_path=model_id, api_key=self.api_key + model_id_or_path=model_id, + api_key=self.api_key, + allow_untrusted_packages=ALLOW_INFERENCE_EXP_UNTRUSTED_MODELS, + allow_direct_local_storage_loading=False, ) # if hasattr(self._exp_model, "optimize_for_inference"): # self._exp_model.optimize_for_inference() diff --git a/inference_cli/benchmark.py b/inference_cli/benchmark.py index 6ad4ff8426..f36799bd97 100644 --- a/inference_cli/benchmark.py +++ b/inference_cli/benchmark.py @@ -7,6 +7,7 @@ from inference_cli.lib.benchmark.dataset import PREDEFINED_DATASETS from inference_cli.lib.benchmark_adapter import ( 
run_infer_api_speed_benchmark, + run_inference_experimental_benchmark, + run_python_package_speed_benchmark, + run_workflow_api_speed_benchmark, ) @@ -269,5 +270,109 @@ def python_package_speed( raise typer.Exit(code=1) + +@benchmark_app.command( + help="This command provides a benchmark of the inference-exp package. Currently, support for this feature " + "is experimental." +) +def inference_experimental_speed( + model_id: Annotated[ + str, + typer.Option( + "--model_id", + "-m", + help="Model ID in format project/version.", + ), + ], + dataset_reference: Annotated[ + str, + typer.Option( + "--dataset_reference", + "-d", + help=f"Name of predefined dataset (one of {list(PREDEFINED_DATASETS.keys())}) or path to directory with images", + ), + ] = "coco", + warm_up_inferences: Annotated[ + int, + typer.Option("--warm_up_inferences", "-wi", help="Number of warm-up requests"), + ] = 10, + benchmark_inferences: Annotated[ + int, + typer.Option( + "--benchmark_requests", "-bi", help="Number of benchmark requests" + ), + ] = 1000, + batch_size: Annotated[ + int, + typer.Option("--batch_size", "-bs", help="Batch size of single request"), + ] = 1, + api_key: Annotated[ + Optional[str], + typer.Option( + "--api-key", + "-a", + help="Roboflow API key for your workspace. If not given - env variable `ROBOFLOW_API_KEY` will be used", + ), + ] = None, + model_configuration: Annotated[ + Optional[str], + typer.Option( + "--model_config", "-mc", help="Location of yaml file with model config" + ), + ] = None, + output_location: Annotated[ + Optional[str], + typer.Option( + "--output_location", + "-o", + help="Location where to save the result (path to file or directory)", + ), + ] = None, + model_package_id: Annotated[ + Optional[str], + typer.Option( + "--model_package_id", + "-mp", + help="Selected model package ID (leave blank to run auto-negotiation)", + ), + ] = None, + turn_images_to_tensors: Annotated[ + bool, + typer.Option( + "--images-as-tensors/--no-images-as-tensors", + help="Boolean flag to decide if input images are to be loaded as tensors on the device that the model " + "is running on, or left as np.arrays.", + ), + ] = True, + allow_untrusted_packages: Annotated[ + bool, + typer.Option( + "--allow-untrusted-packages/--no-allow-untrusted-packages", + help="Boolean flag to decide if untrusted packages (for example the ones registered by clients) are " + "allowed to be loaded.", + ), + ] = True, +): + try: + run_inference_experimental_benchmark( + model_id=model_id, + dataset_reference=dataset_reference, + warm_up_inferences=warm_up_inferences, + benchmark_inferences=benchmark_inferences, + batch_size=batch_size, + api_key=api_key, + model_configuration=model_configuration, + output_location=output_location, + model_package_id=model_package_id, + turn_images_to_tensors=turn_images_to_tensors, + allow_untrusted_packages=allow_untrusted_packages, + ) + except KeyboardInterrupt: + print("Benchmark interrupted.") + return + except Exception as error: + typer.echo(f"Command failed.
Cause: {error}") + raise typer.Exit(code=1) + + if __name__ == "__main__": benchmark_app() diff --git a/inference_cli/lib/benchmark/inference_experimental_speed.py b/inference_cli/lib/benchmark/inference_experimental_speed.py new file mode 100644 index 0000000000..2131af22b7 --- /dev/null +++ b/inference_cli/lib/benchmark/inference_experimental_speed.py @@ -0,0 +1,104 @@ +import random +import time +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch +from inference_exp import AutoModel +from inference_exp.configuration import DEFAULT_DEVICE +from inference_exp.models.auto_loaders.core import AnyModel +from supervision.utils.file import read_yaml_file +from tqdm import tqdm + +from inference_cli.lib.benchmark.results_gathering import ResultsCollector + + +def run_inference_experimental_benchmark( + model_id: str, + images: List[np.ndarray], + results_collector: ResultsCollector, + warm_up_inferences: int = 10, + benchmark_inferences: int = 1000, + batch_size: int = 1, + api_key: Optional[str] = None, + model_configuration: Optional[str] = None, + model_package_id: Optional[str] = None, + turn_images_to_tensors: bool = True, + allow_untrusted_packages: bool = True, +) -> None: + inference_configuration = {} + if model_configuration is not None: + inference_configuration = read_yaml_file(file_path=model_configuration) + print( + f"Inference will be executed with the following parameters: {inference_configuration}" + ) + AutoModel.describe_model(model_id=model_id, api_key=api_key) + if model_package_id: + AutoModel.describe_model_package( + model_id=model_id, package_id=model_package_id, api_key=api_key + ) + if turn_images_to_tensors: + images = [ + torch.from_numpy(np.ascontiguousarray(image[:, :, ::-1])) + .permute(2, 0, 1) + .to(DEFAULT_DEVICE) + for image in images + ] + model = AutoModel.from_pretrained( + model_id, + api_key=api_key, + model_package_id=model_package_id, + allow_untrusted_packages=allow_untrusted_packages, + device=DEFAULT_DEVICE, + ) + run_model_warm_up( + model=model, + inference_configuration=inference_configuration, + image=images[0], + warm_up_inferences=warm_up_inferences, + ) + run_benchmark( + model=model, + inference_configuration=inference_configuration, + images=images, + results_collector=results_collector, + benchmark_inferences=benchmark_inferences, + batch_size=batch_size, + ) + + +def run_model_warm_up( + model: AnyModel, + inference_configuration: Dict[str, Any], + image: Union[np.ndarray, torch.Tensor], + warm_up_inferences: int, +) -> None: + for _ in tqdm( + range(warm_up_inferences), desc="Warming up model...", total=warm_up_inferences + ): + _ = model(image, **inference_configuration) + + +def run_benchmark( + model: AnyModel, + inference_configuration: Dict[str, Any], + images: List[Union[np.ndarray, torch.Tensor]], + results_collector: ResultsCollector, + benchmark_inferences: int, + batch_size: int, +) -> None: + while len(images) < batch_size: + images = images + images + results_collector.start_benchmark() + try: + for _ in range(benchmark_inferences): + random.shuffle(images) + payload = images[:batch_size] + start = time.time() + _ = model(payload, **inference_configuration) + duration = time.time() - start + results_collector.register_inference_duration( + batch_size=batch_size, duration=duration + ) + finally: + results_collector.stop_benchmark() diff --git a/inference_cli/lib/benchmark_adapter.py b/inference_cli/lib/benchmark_adapter.py index 2be2bbc77d..aaf4138f15 100644 --- 
a/inference_cli/lib/benchmark_adapter.py +++ b/inference_cli/lib/benchmark_adapter.py @@ -17,6 +17,7 @@ ) from inference_cli.lib.utils import ( dump_json, + ensure_inference_experimental_is_installed, ensure_inference_is_installed, initialise_client, ) @@ -229,6 +230,72 @@ def run_python_package_speed_benchmark( ) +def run_inference_experimental_benchmark( + model_id: str, + dataset_reference: str, + warm_up_inferences: int = 10, + benchmark_inferences: int = 1000, + batch_size: int = 1, + api_key: Optional[str] = None, + model_configuration: Optional[str] = None, + output_location: Optional[str] = None, + model_package_id: Optional[str] = None, + turn_images_to_tensors: bool = True, + allow_untrusted_packages: bool = True, +) -> None: + ensure_inference_experimental_is_installed() + + # importing here not to affect other entrypoints by missing `inference` core library + from inference_cli.lib.benchmark.inference_experimental_speed import ( + run_inference_experimental_benchmark, + ) + + dataset_images = load_dataset_images( + dataset_reference=dataset_reference, + ) + image_sizes = {i.shape[:2] for i in dataset_images} + print(f"Detected images dimensions: {image_sizes}") + results_collector = ResultsCollector() + statistics_display_thread = Thread( + target=display_benchmark_statistics, args=(results_collector,) + ) + statistics_display_thread.start() + run_inference_experimental_benchmark( + model_id=model_id, + images=dataset_images, + results_collector=results_collector, + warm_up_inferences=warm_up_inferences, + benchmark_inferences=benchmark_inferences, + batch_size=batch_size, + api_key=api_key, + model_configuration=model_configuration, + model_package_id=model_package_id, + turn_images_to_tensors=turn_images_to_tensors, + allow_untrusted_packages=allow_untrusted_packages, + ) + benchmark_results = results_collector.get_statistics() + statistics_display_thread.join() + if benchmark_results.avg_remote_execution_time is not None: + print( + f"Average execution time: {benchmark_results.avg_remote_execution_time:.3f}s (across {benchmark_results.inferences_made} inferences)" + ) + if output_location is None: + return None + benchmark_parameters = { + "datetime": datetime.now().isoformat(), + "model_id": model_id, + "dataset_reference": dataset_reference, + "benchmark_inferences": benchmark_inferences, + "batch_size": batch_size, + "model_configuration": model_configuration, + } + dump_benchmark_results( + output_location=output_location, + benchmark_parameters=benchmark_parameters, + benchmark_results=benchmark_results, + ) + + def dump_benchmark_results( output_location: str, benchmark_parameters: dict, diff --git a/inference_cli/lib/roboflow_cloud/data_staging/api_operations.py b/inference_cli/lib/roboflow_cloud/data_staging/api_operations.py index 9931c22888..8405dcb1d0 100644 --- a/inference_cli/lib/roboflow_cloud/data_staging/api_operations.py +++ b/inference_cli/lib/roboflow_cloud/data_staging/api_operations.py @@ -1620,17 +1620,17 @@ def _parse_bucket_path(bucket_path: str) -> Tuple[str, Optional[str]]: "s3://bucket/path/" -> ("s3://bucket/path/", None) "gs://bucket/" -> ("gs://bucket/", None) """ - has_glob = any(char in bucket_path for char in ['*', '?', '[', ']']) + has_glob = any(char in bucket_path for char in ["*", "?", "[", "]"]) if not has_glob: return bucket_path, None - parts = bucket_path.split('/') + parts = bucket_path.split("/") for i in range(len(parts) - 1, -1, -1): - if any(char in parts[i] for char in ['*', '?', '[', ']']): + if any(char in parts[i] for char in 
["*", "?", "[", "]"]): continue - base_path = '/'.join(parts[:i+1]) + '/' - glob_pattern = '/'.join(parts[i+1:]) + base_path = "/".join(parts[: i + 1]) + "/" + glob_pattern = "/".join(parts[i + 1 :]) return base_path, glob_pattern return bucket_path, None @@ -1682,10 +1682,14 @@ def _get_fs_kwargs(protocol: Optional[str] = None) -> dict: # Support both adlfs convention and Azure CLI standard naming if protocol in (None, "az", "abfs", "azure"): # Account name: try adlfs convention first, fall back to Azure CLI standard - azure_account = os.getenv("AZURE_STORAGE_ACCOUNT_NAME") or os.getenv("AZURE_STORAGE_ACCOUNT") + azure_account = os.getenv("AZURE_STORAGE_ACCOUNT_NAME") or os.getenv( + "AZURE_STORAGE_ACCOUNT" + ) # Account key: try adlfs convention first, fall back to Azure CLI standard - azure_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY") or os.getenv("AZURE_STORAGE_KEY") + azure_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY") or os.getenv( + "AZURE_STORAGE_KEY" + ) # SAS token: same name in both conventions azure_sas_token = os.getenv("AZURE_STORAGE_SAS_TOKEN") @@ -1721,26 +1725,26 @@ def _match_glob_pattern(path: str, pattern: str) -> bool: import re # Convert glob pattern to regex - pattern_parts = pattern.split('/') + pattern_parts = pattern.split("/") regex_parts = [] i = 0 while i < len(pattern_parts): part = pattern_parts[i] - if part == '**': + if part == "**": # ** matches zero or more path segments # If it's at the start and followed by more parts, it's optional if i == 0 and i + 1 < len(pattern_parts): # Make preceding path optional: either nothing or anything/ - regex_parts.append('(?:.*/)?') + regex_parts.append("(?:.*/)?") else: # Match any path segments - regex_parts.append('.*') + regex_parts.append(".*") i += 1 - elif '*' in part: + elif "*" in part: # * matches any characters except / - part_regex = re.escape(part).replace(r'\*', '[^/]*') + part_regex = re.escape(part).replace(r"\*", "[^/]*") regex_parts.append(part_regex) i += 1 else: @@ -1749,13 +1753,13 @@ def _match_glob_pattern(path: str, pattern: str) -> bool: i += 1 # Join with / but handle ** specially (already includes separator in regex) - regex_pattern = '' + regex_pattern = "" for j, part in enumerate(regex_parts): - if j > 0 and not regex_parts[j-1].endswith(')?'): - regex_pattern += '/' + if j > 0 and not regex_parts[j - 1].endswith(")?"): + regex_pattern += "/" regex_pattern += part - regex_pattern += '$' + regex_pattern += "$" return re.match(regex_pattern, path) is not None @@ -1786,7 +1790,7 @@ def _list_and_filter_files_streaming( Exception: Other fsspec errors """ protocol = base_path.split("://")[0] - base_without_protocol = base_path.split("://", 1)[1].rstrip('/') + base_without_protocol = base_path.split("://", 1)[1].rstrip("/") # Validate bucket/path exists before walking # This catches silent failures where fs.walk() would return empty @@ -1831,14 +1835,20 @@ def _list_and_filter_files_streaming( if root_path == base_without_protocol: relative_path = fname else: - relative_path = f"{root_path.removeprefix(base_without_protocol + '/')}/{fname}" + relative_path = ( + f"{root_path.removeprefix(base_without_protocol + '/')}/{fname}" + ) # Check glob pattern if specified if glob_pattern and not _match_glob_pattern(relative_path, glob_pattern): continue # Yield full path with protocol - full_path = f"{root}/{fname}" if root.startswith(f"{protocol}://") else f"{protocol}://{root}/{fname}" + full_path = ( + f"{root}/{fname}" + if root.startswith(f"{protocol}://") + else f"{protocol}://{root}/{fname}" + ) 
yield full_path @@ -1866,11 +1876,12 @@ def _generate_presigned_urls_parallel( Returns: List of dicts with 'name' and 'url' keys """ - from queue import Queue - from threading import Thread - from rich.progress import Progress, BarColumn, SpinnerColumn, TextColumn import multiprocessing import traceback + from queue import Queue + from threading import Thread + + from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn protocol = base_path.split("://")[0] file_queue = Queue(maxsize=0) # Unlimited queue for continuous listing @@ -1890,7 +1901,11 @@ def generate_url(file_path: str) -> dict: # Special handling for Azure with SAS token # When authenticated with SAS token, fs.sign() fails because it needs account_key # Instead, use the existing SAS token (ignoring expiration_seconds parameter) - if protocol in ("az", "abfs", "azure") and hasattr(fs, 'sas_token') and fs.sas_token: + if ( + protocol in ("az", "abfs", "azure") + and hasattr(fs, "sas_token") + and fs.sas_token + ): # Use adlfs built-in utilities to construct URL with existing SAS token path_without_protocol = file_path.split("://", 1)[1] container, blob, _ = fs.split_path(path_without_protocol) @@ -1922,10 +1937,10 @@ def producer(generator, queue, progress, task_id, exception_queue): except Exception as e: # Capture any errors from fs.walk() or file discovery error_info = { - 'error': e, - 'traceback': traceback.format_exc(), - 'context': 'File discovery (fs.walk)', - 'base_path': base_path + "error": e, + "traceback": traceback.format_exc(), + "context": "File discovery (fs.walk)", + "base_path": base_path, } exception_queue.put(error_info) finally: @@ -1952,10 +1967,10 @@ def consumer(queue, progress, task_id, exception_queue): except Exception as e: # Capture errors from fs.sign() or URL generation error_info = { - 'error': e, - 'traceback': traceback.format_exc(), - 'context': 'Presigned URL generation (fs.sign)', - 'file_path': file_path + "error": e, + "traceback": traceback.format_exc(), + "context": "Presigned URL generation (fs.sign)", + "file_path": file_path, } exception_queue.put(error_info) finally: @@ -1972,7 +1987,7 @@ def consumer(queue, progress, task_id, exception_queue): # Start producer thread producer_thread = Thread( target=producer, - args=(file_paths_generator, file_queue, progress, task, exception_queue) + args=(file_paths_generator, file_queue, progress, task, exception_queue), ) producer_thread.start() @@ -1981,7 +1996,9 @@ def consumer(queue, progress, task_id, exception_queue): consumer_threads = [] for _ in range(num_workers): - t = Thread(target=consumer, args=(file_queue, progress, task, exception_queue)) + t = Thread( + target=consumer, args=(file_queue, progress, task, exception_queue) + ) t.start() consumer_threads.append(t) @@ -2009,9 +2026,9 @@ def consumer(queue, progress, task_id, exception_queue): ) # Add context-specific details - if 'base_path' in first_error: + if "base_path" in first_error: error_msg += f"Base path: {first_error['base_path']}\n" - if 'file_path' in first_error: + if "file_path" in first_error: error_msg += f"File: {first_error['file_path']}\n" # If multiple errors, mention it @@ -2019,7 +2036,7 @@ def consumer(queue, progress, task_id, exception_queue): error_msg += f"\n(Plus {len(errors) - 1} additional error(s))" # Re-raise the original exception with enhanced context - raise type(first_error['error'])(error_msg) from first_error['error'] + raise type(first_error["error"])(error_msg) from first_error["error"] return results @@ -2076,8 +2093,12 @@ def 
create_images_batch_from_cloud_storage( ) if len(references) > MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST: - num_chunks = (len(references) + MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST - 1) // MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST - print(f"Files will be split into {num_chunks} chunks of up to {MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST} files each") + num_chunks = ( + len(references) + MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST - 1 + ) // MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST + print( + f"Files will be split into {num_chunks} chunks of up to {MAX_IMAGE_REFERENCES_IN_INGEST_REQUEST} files each" + ) workspace = get_workspace(api_key=api_key) @@ -2174,7 +2195,9 @@ def create_videos_batch_from_cloud_storage( print(f"Found {len(references)} video files") if len(references) > SUGGESTED_MAX_VIDEOS_IN_BATCH: - print(f"Warning: Found {len(references)} videos. Suggested max is {SUGGESTED_MAX_VIDEOS_IN_BATCH} videos per batch.") + print( + f"Warning: Found {len(references)} videos. Suggested max is {SUGGESTED_MAX_VIDEOS_IN_BATCH} videos per batch." + ) workspace = get_workspace(api_key=api_key) diff --git a/inference_cli/lib/roboflow_cloud/data_staging/core.py b/inference_cli/lib/roboflow_cloud/data_staging/core.py index c53ec6c2be..7ffe1fcdca 100644 --- a/inference_cli/lib/roboflow_cloud/data_staging/core.py +++ b/inference_cli/lib/roboflow_cloud/data_staging/core.py @@ -175,7 +175,7 @@ def create_batch_of_images( "--bucket-path", "-bp", help="Cloud storage path with optional glob pattern (e.g., 's3://bucket/path/**/*.jpg', 'gs://bucket/images/'). " - "Required for cloud-storage source. Supports S3, GCS, and Azure.", + "Required for cloud-storage source. Supports S3, GCS, and Azure.", ), ] = None, ingest_id: Annotated[ @@ -339,7 +339,7 @@ def create_batch_of_videos( "--bucket-path", "-bp", help="Cloud storage path with optional glob pattern (e.g., 's3://bucket/path/**/*.mp4', 'gs://bucket/videos/'). " - "Required for cloud-storage source. Supports S3, GCS, and Azure.", + "Required for cloud-storage source. Supports S3, GCS, and Azure.", ), ] = None, ingest_id: Annotated[ diff --git a/inference_cli/lib/utils.py b/inference_cli/lib/utils.py index 63c573cd44..394775bfbe 100644 --- a/inference_cli/lib/utils.py +++ b/inference_cli/lib/utils.py @@ -97,6 +97,15 @@ def ensure_inference_is_installed() -> None: ) from inner_error +def ensure_inference_experimental_is_installed() -> None: + try: + import inference_exp + except Exception as error: + raise InferencePackageMissingError( + "You need to install `inference-exp` package to use this feature. 
Run `pip install inference-exp`" + ) from error + + def read_json(path: str) -> dict: with open(path) as f: return json.load(f) diff --git a/inference_experimental/dockerfiles/jp51.cu114.core.dockerfile b/inference_experimental/dockerfiles/jp51.cu114.core.dockerfile new file mode 100644 index 0000000000..2389eaa854 --- /dev/null +++ b/inference_experimental/dockerfiles/jp51.cu114.core.dockerfile @@ -0,0 +1,207 @@ +FROM nvcr.io/nvidia/l4t-ml:r35.2.1-py3 AS builder + +# install Python 3.12 +RUN apt-get update -y && apt-get install -y \ + libssl-dev \ + git \ + unzip \ + libbz2-dev \ + libssl-dev \ + libsqlite3-dev \ + zlib1g-dev \ + liblzma-dev + +RUN mkdir -p /build/python-3.12 +WORKDIR /build/python-3.12 +RUN wget https://www.python.org/ftp/python/3.12.12/Python-3.12.12.tgz && tar -xzf Python-3.12.12.tgz +WORKDIR /build/python-3.12/Python-3.12.12 +RUN ./configure --enable-optimizations +RUN make -j$(nproc) && make altinstall + +RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.12 1 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.12 1 + +# Get rid of tensorrt-8.X +RUN apt remove -y 'libnvinfer*' 'libnvonnxparsers*' 'libnvparsers*' 'libnvinfer-plugin*' 'python3-libnvinfer*' 'tensorrt*' 'uff-converter*' 'graphsurgeon*' + +# Create out dir where all wheels will be stored +RUN mkdir -p /build/out/wheels + +# Install tensorrt-10.x +RUN mkdir -p /build/tensorrt-10.x +WORKDIR /build/tensorrt-10.x +RUN wget https://storage.googleapis.com/roboflow-tests-assets/TensorRT/TensorRT-10.8.0.43.l4t.aarch64-gnu.cuda-11.4.tar.gz +RUN tar xzf TensorRT-10.8.0.43.l4t.aarch64-gnu.cuda-11.4.tar.gz +WORKDIR /build/tensorrt-10.x/TensorRT-10.8.0.43/targets/aarch64-linux-gnu +RUN mkdir -p /usr/src/tensorrt/bin +RUN cp bin/trtexec /usr/src/tensorrt/bin/trtexec +RUN cp include/* /usr/include/aarch64-linux-gnu/ +RUN mkdir -p /usr/lib/aarch64-linux-gnu/stubs +RUN cp -r lib/stubs/* /usr/lib/aarch64-linux-gnu/stubs/ +RUN cp lib/libnvinfer.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer.so.10.8.0 && \ + cp lib/libnvinfer_builder_resource.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_builder_resource.so.10.8.0 && \ + cp lib/libnvinfer_static.a /usr/lib/aarch64-linux-gnu/libnvinfer_static.a && \ + cp lib/libnvinfer_dispatch.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_dispatch.so.10.8.0 && \ + cp lib/libnvinfer_dispatch_static.a /usr/lib/aarch64-linux-gnu/libnvinfer_dispatch_static.a && \ + cp lib/libnvinfer_lean.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_lean.so.10.8.0 && \ + cp lib/libnvinfer_lean_static.a /usr/lib/aarch64-linux-gnu/libnvinfer_lean_static.a && \ + cp lib/libnvinfer_plugin.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so.10.8.0 && \ + cp lib/libnvinfer_plugin_static.a /usr/lib/aarch64-linux-gnu/libnvinfer_plugin_static.a && \ + cp lib/libnvinfer_vc_plugin.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_vc_plugin.so.10.8.0 && \ + cp lib/libnvinfer_vc_plugin_static.a /usr/lib/aarch64-linux-gnu/libnvinfer_vc_plugin_static.a && \ + cp lib/libnvonnxparser.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvonnxparser.so.10.8.0 && \ + cp lib/libnvonnxparser_static.a /usr/lib/aarch64-linux-gnu/libnvonnxparser_static.a && \ + cp lib/libonnx_proto.a /usr/lib/aarch64-linux-gnu/libonnx_proto.a + +RUN ln -s /usr/lib/aarch64-linux-gnu/libnvinfer.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer.so.10 && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer.so && \ + ln -s 
/usr/lib/aarch64-linux-gnu/libnvinfer_dispatch.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_dispatch.so.10 && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer_dispatch.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_dispatch.so && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer_lean.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_lean.so.10 && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer_lean.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_lean.so && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so.10 && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer_vc_plugin.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_vc_plugin.so.10 && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvinfer_vc_plugin.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvinfer_vc_plugin.so && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvonnxparser.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvonnxparser.so.10 && \ + ln -s /usr/lib/aarch64-linux-gnu/libnvonnxparser.so.10.8.0 /usr/lib/aarch64-linux-gnu/libnvonnxparser.so + +WORKDIR /build/tensorrt-10.x/TensorRT-10.8.0.43/python +RUN cp -r * /build/out/wheels +RUN python3.12 -m pip install /build/out/wheels/tensorrt-10.8.0.43-cp312-none-linux_aarch64.whl + +# Install newer Cmake for builds +RUN mkdir -p /build/cmake +WORKDIR /build/cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v4.1.2/cmake-4.1.2-linux-aarch64.sh +RUN mkdir build && chmod ugo+x cmake-4.1.2-linux-aarch64.sh && bash cmake-4.1.2-linux-aarch64.sh --skip-license --prefix=./build + +# Install gcc-11 +WORKDIR /build/gcc/ +RUN wget https://ftp.gnu.org/gnu/gcc/gcc-11.1.0/gcc-11.1.0.tar.gz +RUN tar xzf gcc-11.1.0.tar.gz +WORKDIR /build/gcc/gcc-11.1.0 +RUN ./contrib/download_prerequisites +WORKDIR /build/gcc/ +RUN mkdir objdir +WORKDIR /build/gcc/objdir +RUN $PWD/../gcc-11.1.0/configure --prefix=$HOME/GCC-11 --enable-languages=c,c++ +RUN make -j$(nproc) +RUN make install +RUN export PATH=/root/GCC-11/bin:$PATH +RUN export LD_LIBRARY_PATH=/root/GCC-11/lib64/:$LD_LIBRARY_PATH +RUN ldconfig + +# upgrade to CUDA 11.8 +WORKDIR /build/cuda-118 +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/arm64/cuda-ubuntu2004.pin -O /etc/apt/preferences.d/cuda-repository-pin-600 && \ + wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-tegra-repo-ubuntu2004-11-8-local_11.8.0-1_arm64.deb && \ + dpkg -i cuda-tegra-repo-*.deb && \ + rm cuda-tegra-repo-*.deb + +RUN cp /var/cuda-tegra-repo-*/cuda-tegra-*-keyring.gpg /usr/share/keyrings/ + +RUN mkdir /var/cuda-compat && \ + cd /var/cuda-compat && \ + ar x ../cuda-tegra-repo-*/cuda-compat-*.deb && \ + tar xvf data.tar.xz -C / && \ + rm -rf /var/cuda-compat + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + cuda-toolkit-* \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install ONNX-runtime GPU +RUN mkdir -p /build/onnxruntime +WORKDIR /build/onnxruntime +RUN git clone https://github.com/microsoft/onnxruntime.git +WORKDIR /build/onnxruntime/onnxruntime +RUN git checkout v1.21.1 +# Hash aligned with the source code that had this problem fixed on main branch - we need to stick to this version and patch, as our env is cuda 11 and the patched version do only support cuda 12 +RUN sed -i 
's|eigen;https://gitlab.com/libeigen/eigen/-/archive/1d8b82b0740839c0de7f1242a3585e3390ff5f33/eigen-1d8b82b0740839c0de7f1242a3585e3390ff5f33.zip;5ea4d05e62d7f954a46b3213f9b2535bdd866803|eigen;https://github.com/eigen-mirror/eigen/archive/1d8b82b0740839c0de7f1242a3585e3390ff5f33/eigen-1d8b82b0740839c0de7f1242a3585e3390ff5f33.zip;05b19b49e6fbb91246be711d801160528c135e34|' cmake/deps.txt +RUN python3.12 -m pip install packaging setuptools "numpy==2.3.5" +RUN LD_LIBRARY_PATH=/root/GCC-11/lib64/:$LD_LIBRARY_PATH CC=/root/GCC-11/bin/gcc CXX=/root/GCC-11/bin/g++ PATH=/build/cmake/build/bin:$PATH CMAKE_POLICY_VERSION_MINIMUM=3.5 ./build.sh --update --config Release --build --build_wheel --use_cuda --cuda_version=11.8 --cuda_home /usr/local/cuda-11.8 --cudnn_home /usr/lib/aarch64-linux-gnu --use_tensorrt --tensorrt_home /usr/lib/aarch64-linux-gnu --allow_running_as_root --parallel 4 --disable_types float8 --skip_tests --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF +RUN python3.12 -m pip install ./build/Linux/Release/dist/onnxruntime_gpu-1.21.1-cp312-cp312-linux_aarch64.whl +RUN cp ./build/Linux/Release/dist/onnxruntime_gpu-1.21.1-cp312-cp312-linux_aarch64.whl /build/out/wheels/onnxruntime_gpu-1.21.1-cp312-cp312-linux_aarch64.whl + +# Install PyTorch +RUN mkdir -p /build/torch +WORKDIR /build/torch +RUN git clone https://github.com/pytorch/pytorch.git +WORKDIR /build/torch/pytorch +RUN git checkout v2.4.1 +RUN git submodule sync && git submodule update --init --recursive +RUN PATH=/build/cmake/build/bin:$PATH python3.12 -m pip install setuptools wheel astunparse numpy ninja pyyaml cmake "typing-extensions>=4.10.0" requests +ARG MAX_TORCH_COMPILATION_JOBS=4 +RUN PATH=/build/cmake/build/bin:$PATH PYTORCH_BUILD_VERSION=2.4.1 PYTORCH_BUILD_NUMBER=1 MAX_JOBS=${MAX_TORCH_COMPILATION_JOBS} FORCE_CUDA=1 CUDA_HOME=/usr/local/cuda-11.8 CUDACXX=/usr/local/cuda-11.8/bin/nvcc TORCH_CUDA_ARCH_LIST="8.7" USE_NCCL=0 USE_DISTRIBUTED=0 USE_MKLDNN=0 BUILD_TEST=0 CMAKE_POLICY_VERSION_MINIMUM=3.5 python3.12 setup.py bdist_wheel +RUN python3.12 -m pip install dist/torch-*.whl +RUN cp dist/torch-*.whl /build/out/wheels/ + +# Install Torchvision +RUN mkdir -p /build/torchvision +WORKDIR /build/torchvision +RUN git clone https://github.com/pytorch/vision.git +WORKDIR /build/torchvision/vision +RUN git checkout v0.19.1 +RUN git submodule sync && git submodule update --init --recursive +RUN CC=/root/GCC-11/bin/gcc CXX=/root/GCC-11/bin/g++ FORCE_CUDA=1 PATH=/build/cmake/build/bin:$PATH BUILD_VERSION=0.19.1 TORCH_CUDA_ARCH_LIST="8.7" CUDA_HOME=/usr/local/cuda-11.8 CMAKE_POLICY_VERSION_MINIMUM=3.5 python3.12 setup.py bdist_wheel +RUN python3.12 -m pip install dist/torchvision-*.whl +RUN cp dist/torchvision-*.whl /build/out/wheels/ + +FROM nvcr.io/nvidia/l4t-ml:r35.2.1-py3 AS target + +RUN apt-get update -y && apt-get install -y \ + libssl-dev \ + git \ + unzip \ + libbz2-dev \ + libssl-dev \ + libsqlite3-dev \ + zlib1g-dev \ + liblzma-dev + +RUN apt remove -y 'libnvinfer*' 'libnvonnxparsers*' 'libnvparsers*' 'libnvinfer-plugin*' 'python3-libnvinfer*' 'tensorrt*' 'uff-converter*' 'graphsurgeon*' + + +COPY --from=builder /root/GCC-11 /opt/gcc-11 +COPY --from=builder /build/out/wheels /compiled_python_packages +COPY --from=builder /usr/include /usr/include +COPY --from=builder /usr/lib /usr/lib +COPY --from=builder /usr/share /usr/share +COPY --from=builder /usr/src /usr/src +COPY --from=builder /usr/local/bin /usr/local/bin +COPY --from=builder /usr/local/include /usr/local/include +COPY --from=builder /usr/local/lib 
/usr/local/lib +COPY --from=builder /usr/local/share /usr/local/share +COPY --from=builder /usr/local/cuda-11.8 /usr/local/cuda-11.8 +RUN rm /etc/alternatives/cuda /etc/alternatives/cuda-11 +RUN ln -s /usr/local/cuda-11.8 /etc/alternatives/cuda +RUN ln -s /usr/local/cuda-11.8 /etc/alternatives/cuda-11 +RUN rm -rf /usr/local/cuda-11.4 +ENV LD_LIBRARY_PATH="/opt/gcc-11/lib64:$LD_LIBRARY_PATH" + + +RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.12 1 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.12 1 + +# Install OpenCV +RUN mkdir -p /build/opencv +WORKDIR /build/opencv +RUN curl -L https://github.com/opencv/opencv/archive/4.12.0.zip -o opencv-4.12.0.zip +RUN curl -L https://github.com/opencv/opencv_contrib/archive/4.12.0.zip -o opencv_contrib-4.12.0.zip +RUN unzip opencv-4.12.0.zip +RUN unzip opencv_contrib-4.12.0.zip +WORKDIR /build/opencv/opencv-4.12.0 +RUN mkdir release +WORKDIR /build/opencv/opencv-4.12.0/release +RUN cmake -D WITH_CUDA=ON -D WITH_CUDNN=ON -D CUDA_ARCH_BIN="8.7" -D CUDA_ARCH_PTX="" -D OPENCV_GENERATE_PKGCONFIG=ON -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-4.12.0/modules -D WITH_GSTREAMER=ON -D WITH_LIBV4L=ON -D BUILD_opencv_python3=ON -D BUILD_TESTS=OFF -D BUILD_PERF_TESTS=OFF -D BUILD_EXAMPLES=OFF -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local -D PYTHON3_INCLUDE_DIR=/usr/local/include/python3.12 -D OPENCV_PYTHON3_INSTALL_PATH=/usr/local/lib/python3.12/site-packages -D PYTHON3_EXECUTABLE=/usr/local/bin/python3.12 -D PYTHON_VERSION=312 -DBUILD_SHARED_LIBS=OFF -DWITH_OPENCLAMDFFT=OFF -DWITH_OPENCLAMDBLAS=OFF -DWITH_VA_INTEL=OFF .. +RUN make -j$(nproc) +RUN make install +RUN python3.12 -m pip wheel ./python_loader --wheel-dir /build/out/wheels --verbose +RUN python3.12 -m pip install /build/out/wheels/opencv-4.12.0-py3-none-any.whl + +WORKDIR / + +ENTRYPOINT ["bash"] diff --git a/inference_experimental/dockerfiles/jp61.cu126.base.dockerfile b/inference_experimental/dockerfiles/jp61.cu126.base.dockerfile index 6126d85175..ff92b0cf48 100644 --- a/inference_experimental/dockerfiles/jp61.cu126.base.dockerfile +++ b/inference_experimental/dockerfiles/jp61.cu126.base.dockerfile @@ -3,6 +3,7 @@ FROM nvcr.io/nvidia/l4t-jetpack:r36.4.0 ARG DEBIAN_FRONTEND=noninteractive ENV LANG=en_US.UTF-8 +RUN chmod 1777 /tmp RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ lshw \ diff --git a/inference_experimental/inference_exp/models/common/onnx.py b/inference_experimental/inference_exp/models/common/onnx.py index c17203971c..1d40f0681a 100644 --- a/inference_experimental/inference_exp/models/common/onnx.py +++ b/inference_experimental/inference_exp/models/common/onnx.py @@ -281,8 +281,14 @@ def run_session_via_iobinding( if pre_allocated_output is not None: result.append(pre_allocated_output) continue - dlpack_tensor = bound_output._ortvalue.to_dlpack() - out_tensor = torch.utils.dlpack.from_dlpack(dlpack_tensor) + # This is added for the sake of compatibility with older builds of onnxruntime + # which do not support zero-copy OrtValue -> torch.Tensor thanks to dlpack + if not hasattr(bound_output._ortvalue, "to_dlpack"): + # slower but needed :( + out_tensor = torch.from_numpy(bound_output._ortvalue.numpy()).to(device) + else: + dlpack_tensor = bound_output._ortvalue.to_dlpack() + out_tensor = torch.utils.dlpack.from_dlpack(dlpack_tensor) result.append(out_tensor) return result diff --git a/inference_experimental/pyproject.toml b/inference_experimental/pyproject.toml index
851392cf0c..66dfc142fd 100644 --- a/inference_experimental/pyproject.toml +++ b/inference_experimental/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "inference-exp" -version = "0.16.3" +version = "0.16.4" description = "Experimental vresion of inference package which is supposed to evolve into inference 1.0" readme = "README.md" requires-python = ">=3.9,<3.13" diff --git a/inference_experimental/uv.lock b/inference_experimental/uv.lock index 2e971a919d..0b20099192 100644 --- a/inference_experimental/uv.lock +++ b/inference_experimental/uv.lock @@ -1991,7 +1991,7 @@ wheels = [ [[package]] name = "inference-exp" -version = "0.16.3" +version = "0.16.4" source = { virtual = "." } dependencies = [ { name = "accelerate" }, diff --git a/requirements/_requirements.txt b/requirements/_requirements.txt index 535e9c390d..52ed81eb51 100644 --- a/requirements/_requirements.txt +++ b/requirements/_requirements.txt @@ -26,7 +26,8 @@ pydantic-settings<2.8 openai>=1.12.0,<2.0.0 structlog>=24.1.0,<25.0.0 zxing-cpp~=2.2.0 -boto3<=1.35.60 +boto3>=1.40.0,<=1.41.5 +botocore>=1.40.0,<=1.41.5 typing_extensions>=4.8.0,<=4.12.2 pydot~=2.0.0 shapely>=2.0.4,<2.1.0