Add prefix cache test to github actions

rlakhtakia · rlakhtakia · commit 72c5beacb816 · 2025-11-17T09:13:36.000Z
diff --git a/.github/scripts/e2e/e2e-validate.sh b/.github/scripts/e2e/e2e-validate.sh
@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# -----------------------------------------------------------------------------
+# e2e-validate.sh — CI e2e Gateway smoke-test (chat + completion, 10 iterations)
+# -----------------------------------------------------------------------------
+
+show_help() {
+  cat <<EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Options:
+  -n, --namespace NAMESPACE   Kubernetes namespace (default: llm-d)
+  -m, --model MODEL_ID        Model to query. If unset, discovers the first available model.
+  -v, --verbose               Echo kubectl/curl commands before running
+  -h, --help                  Show this help and exit
+EOF
+  exit 0
+}
+
+# ── Defaults ────────────────────────────────────────────────────────────────
+NAMESPACE="igw-e2e"
+CLI_MODEL_ID=""
+VERBOSE=false
+
+# ── Flag parsing ────────────────────────────────────────────────────────────
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -n|--namespace) NAMESPACE="$2"; shift 2 ;;
+    -m|--model)     CLI_MODEL_ID="$2"; shift 2 ;;
+    -v|--verbose)   VERBOSE=true; shift ;;
+    -h|--help)      show_help ;;
+    *) echo "Unknown option: $1"; show_help ;;
+  esac
+done
+
+if [[ "${VERBOSE}" == "true" ]]; then
+  set -x
+fi
+
+# ── Create a unique pod suffix ────────────────────────────────────────────
+gen_id() { echo $(( RANDOM % 10000 + 1 )); }
+
+# ── Resolve the Gateway address ─────────────────────────────────────────────
+# GATEWAY_HOST wins when it is set and non-empty; otherwise ask the cluster for
+# the first address of the first Gateway in the namespace. kubectl errors are
+# swallowed on purpose: an empty result is reported by the check that follows.
+HOST="${GATEWAY_HOST:-}"
+if [[ -z "$HOST" ]]; then
+  HOST="$(kubectl get gateway -n "$NAMESPACE" \
+            -o jsonpath='{.items[0].status.addresses[0].value}' 2>/dev/null || true)"
+fi
+if [[ -z "$HOST" ]]; then
+  echo "Error: could not discover a Gateway address in namespace '$NAMESPACE'." >&2
+  exit 1
+fi
+PORT=80
+SVC_HOST="${HOST}:${PORT}"
+
+# ── Resolve MODEL_ID ────────────────────────────────────────────────────────
+# Precedence: the -m/--model flag first, then the MODEL_ID environment
+# variable. With neither present the script cannot continue.
+MODEL_ID="${CLI_MODEL_ID:-${MODEL_ID-}}"
+if [[ -z "$MODEL_ID" ]]; then
+    echo "Error: Failed to find model id. Please specify one using the -m flag or the MODEL_ID environment variable." >&2
+    exit 1
+fi
+
+# Summary of the effective settings, followed by a blank separator line.
+printf '%s\n' \
+  "Namespace: $NAMESPACE" \
+  "Inference Gateway:   ${SVC_HOST}" \
+  "Model ID:  $MODEL_ID" \
+  ""
+
+# ── Request helper ──────────────────────────────────────────────────────────
+# post_json ENDPOINT PAYLOAD
+#   POSTs PAYLOAD (a JSON string) to http://${SVC_HOST}${ENDPOINT} from a
+#   throw-away curl pod in ${NAMESPACE} and echoes whatever the pod printed.
+#   Returns 0 on success; logs to stderr and returns 1 when the kubectl/curl
+#   invocation exits non-zero or the output contains no '{'.
+post_json() {
+  local endpoint=$1 payload=$2
+  local pod_id output ret=0
+  pod_id=$(gen_id)
+  if $VERBOSE; then cat <<CMD
+  - Running command:
+    kubectl run --rm -i curl-${pod_id} \\
+      --namespace "${NAMESPACE}" \\
+      --image=curlimages/curl --restart=Never -- \\
+      curl -sS --fail-with-body -X POST "http://${SVC_HOST}${endpoint}" \\
+        -H 'accept: application/json' \\
+        -H 'Content-Type: application/json' \\
+        -d '${payload//\'/\'}'
+
+CMD
+  fi
+  # --fail-with-body makes curl exit non-zero on HTTP >= 400 while still
+  # printing the response body, so a JSON error reply is not mistaken for a
+  # successful completion by the '{' check below.
+  output=$(kubectl run --rm -i curl-"$pod_id" \
+            --namespace "$NAMESPACE" \
+            --image=curlimages/curl --restart=Never -- \
+            sh -c "sleep 1; curl -sS --fail-with-body -X POST 'http://${SVC_HOST}${endpoint}' \
+                 -H 'accept: application/json' \
+                 -H 'Content-Type: application/json' \
+                 -d '$payload'") || ret=$?
+  echo "$output"
+  if [[ $ret -ne 0 || "$output" != *'{'* ]]; then
+    echo "Error: POST ${endpoint} failed (exit $ret or no JSON)" >&2
+    return 1
+  fi
+}
+
+# ── Main test loop (10 iterations) ──────────────────────────────────────────
+# Each iteration sends one chat completion and one text completion. Both
+# requests are always attempted; the script stops after the first iteration in
+# which either of them failed.
+for i in {1..10}; do
+  echo "=== Iteration $i of 10 ==="
+  failed=false
+
+  # 1) POST /v1/chat/completions
+  echo "1) POST /v1/chat/completions at ${SVC_HOST}"
+  chat_payload='{
+    "model":"'"$MODEL_ID"'",
+    "messages":[{"role":"user","content":"Hello!  Who are you?"}]
+  }'
+  post_json /v1/chat/completions "$chat_payload" || failed=true
+  echo
+
+  # 2) POST /v1/completions
+  echo "2) POST /v1/completions at ${SVC_HOST}"
+  payload='{"model":"'"$MODEL_ID"'","prompt":"You are a helpful AI assistant."}'
+  post_json /v1/completions "$payload" || failed=true
+  echo
+
+  if $failed; then
+    echo "Iteration $i encountered errors; exiting." >&2
+    exit 1
+  fi
+done
+
+echo "✅ All 10 iterations succeeded."
diff --git a/.github/workflows/e2e-prefix-cache-aware-gke.yaml b/.github/workflows/e2e-prefix-cache-aware-gke.yaml
@@ -0,0 +1,233 @@
+# Deploys a model server with prefix caching enabled behind an inference
+# gateway on GKE and runs the e2e smoke-test script against it.
+name: GKE Prefix Cache Aware Test
+
+on:
+  # Runs with a PR comment /run-gke-prefix-cache-aware
+  # This trigger fires for every newly created comment; the job-level `if`
+  # filters on the comment text and the commenter's association.
+  issue_comment:
+    types: [created]
+  # Manual run against a pull-request number or a branch name.
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        description: 'Pull-request number or branch name to test'
+        required: true
+        default: 'actions'
+        type: string
+
+# Restrict the workflow token to read-only access to repository contents.
+permissions:
+  contents: read
+
+jobs:
+  deploy_and_validate:
+    # Run for manual dispatches, or for PR comments that contain the trigger
+    # phrase and were written by an OWNER, MEMBER or COLLABORATOR.
+    # NOTE(review): 'schedule' and 'pull_request' are accepted below but are not
+    # listed under `on:` in this file, so those two clauses cannot match as
+    # written — confirm whether the triggers are missing or the clauses are
+    # leftovers.
+    if: >
+      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request' ||
+      github.event_name == 'workflow_dispatch' ||
+      (
+        github.event_name == 'issue_comment' &&
+        github.event.issue.pull_request &&
+        (
+          contains(github.event.comment.body, '/run-gke-prefix-cache-aware')
+        ) &&
+        (
+          github.event.comment.author_association == 'OWNER' ||
+          github.event.comment.author_association == 'MEMBER' ||
+          github.event.comment.author_association == 'COLLABORATOR'
+        )
+      )
+    name: Test on ${{ matrix.accelerator.name }}
+    runs-on: ubuntu-latest
+
+    # One job per accelerator entry, executed one at a time (max-parallel: 1);
+    # a failure of one entry does not cancel the other (fail-fast: false).
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        accelerator:
+          # pod_readiness_sleep_seconds is the extra sleep applied after the
+          # pods report Ready.
+          # NOTE(review): helmfile_env is not referenced by any step in this
+          # file — confirm it is still needed.
+          - name: GPU
+            helmfile_env: gke
+            pod_readiness_sleep_seconds: 180
+          - name: TPU
+            helmfile_env: gke_tpu
+            pod_readiness_sleep_seconds: 1080
+
+    # Job-wide environment shared by all steps.
+    env:
+      GCP_PROJECT_ID: llm-d-scale
+      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
+      GKE_CLUSTER_ZONE: us-east5
+      NAMESPACE: igw-prefix-cache-aware
+      GATEWAY: gke-l7-regional-external-managed
+      GATEWAY_TYPE: gke
+      # First non-empty of: dispatch input, commented PR number, event number,
+      # literal 'actions'. The "Determine if pr_or_branch is a PR number" step
+      # rewrites this value through GITHUB_ENV for the steps after it.
+      PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number || github.event.number || 'actions' }}
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+
+      # Decide whether the ref under test is a PR number or a branch name, then
+      # publish it as PR_OR_BRANCH (via GITHUB_ENV) and is_pr (step output).
+      # Comment-triggered runs carry no workflow_dispatch inputs, so fall back
+      # to the number of the PR the comment was posted on; without that the
+      # value would be empty here and the run would test the 'actions' branch
+      # instead of the commented PR.
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        env:
+          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number }}
+        shell: bash
+        run: |
+          echo "PR_OR_BRANCH=${PR_OR_BRANCH:-actions}" >> "$GITHUB_ENV"
+          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          elif [[ "${{ github.event_name }}" = "pull_request" ]]; then
+            echo "PR_OR_BRANCH=${{ github.event.pull_request.number }}" >> "$GITHUB_ENV"
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      # PR case: fetch the PR head into a local pr-<number> branch and switch to it.
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          git fetch origin pull/"$PR_OR_BRANCH"/head:pr-"$PR_OR_BRANCH"
+          git checkout pr-"$PR_OR_BRANCH"
+
+      # Branch case: switch to the named branch.
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        run: git checkout "$PR_OR_BRANCH"
+
+      # Authenticate with the service-account key stored in the GKE_SA_KEY
+      # secret. The action is pinned to a commit SHA.
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@b7593ed2efd1c1617e1b0254da33b86225adb2a5
+        with:
+          credentials_json: ${{ secrets.GKE_SA_KEY }}
+
+      # Install gcloud together with the kubectl and gke-gcloud-auth-plugin
+      # components for the configured project.
+      - name: Set up gcloud CLI and kubectl
+        uses: google-github-actions/setup-gcloud@cb1e50a9932213ecece00a606661ae9ca44f3397
+        with:
+          project_id: ${{ env.GCP_PROJECT_ID }}
+          install_components: 'kubectl,gke-gcloud-auth-plugin'
+
+      # Fetch cluster credentials so the kubectl/helm calls in later steps
+      # target the e2e cluster.
+      - name: Get GKE credentials
+        run: |
+          gcloud container clusters get-credentials "${{ env.GKE_CLUSTER_NAME }}" --zone "${{ env.GKE_CLUSTER_ZONE }}"
+
+      # Idempotent create: render the Namespace client-side and apply it. An
+      # existing namespace is then a no-op, while real failures (auth,
+      # connectivity) fail the step instead of being reported as
+      # "already exists".
+      - name: Create namespace
+        run: |
+          kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
+
+      # The token is handed to the script through the environment rather than
+      # being substituted into the script text, so special characters in the
+      # secret cannot alter the command line.
+      # NOTE(review): confirm that the secret name and key created here match
+      # what the model server manifest references.
+      - name: Create llm-d-hf-token secret
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          kubectl create secret generic llm-d-hf-token \
+            --from-literal="HF_TOKEN=${HF_TOKEN}" \
+            --namespace "${NAMESPACE}" \
+            --dry-run=client -o yaml | kubectl apply -f -
+
+      # The three deploy steps share one log file: the first step starts it and
+      # the later ones append (`tee -a`). `shell: bash` is set explicitly so the
+      # runner executes the scripts with `-eo pipefail`; otherwise a failing
+      # kubectl/helm would be hidden by the exit status of `tee`.
+      #
+      # NOTE(review): the sed expression only matches lines containing
+      # `- --model` and inserts the new argument directly after that line —
+      # confirm this fits how gpu-deployment.yaml spells its arguments.
+      - name: Deploy Model Server and CRDs
+        shell: bash
+        run: |
+          cd config/manifests/vllm
+          sed -i '/- --model/a\          - --enable-prefix-caching' gpu-deployment.yaml
+          echo "Deploying Model Server..."
+          kubectl apply -f gpu-deployment.yaml -n "${NAMESPACE}" | tee ~/igw-prefix-cache-deployment.log
+          echo "Installing CRDs"
+          kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml
+          echo "---------------------------------------" >> ~/igw-prefix-cache-deployment.log
+
+      # IGW_CHART_VERSION is not defined anywhere else in this workflow; an
+      # empty value would make `--version` consume the chart reference. Default
+      # to v1.1.0, the release the CRD and Gateway manifests are pinned to.
+      - name: Deploy InferencePool and Endpoint Picker Extension
+        shell: bash
+        run: |
+          helm install vllm-llama3-8b-instruct \
+            --namespace "${NAMESPACE}" \
+            --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+            --set provider.name="${GATEWAY_TYPE}" \
+            --version "${IGW_CHART_VERSION:-v1.1.0}" \
+            oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool | tee -a ~/igw-prefix-cache-deployment.log
+          echo "---------------------------------------" >> ~/igw-prefix-cache-deployment.log
+
+      # The Gateway is applied into ${NAMESPACE} because the wait, discovery
+      # and cleanup logic all look for it there.
+      # NOTE(review): httproute.gke.yaml is resolved relative to the repository
+      # root — confirm the file exists at that location.
+      - name: Deploy Gateway
+        shell: bash
+        run: |
+          echo "Deploying Gateway..."
+          kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/gateway.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefix-cache-deployment.log
+          echo "Deploying HTTPRoute..."
+          kubectl apply -f httproute.gke.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefix-cache-deployment.log
+          echo "---------------------------------------" >> ~/igw-prefix-cache-deployment.log
+
+      # Block until every pod in the namespace reports Ready (15 minute limit),
+      # then sleep for the accelerator-specific duration from the matrix before
+      # listing the pods.
+      - name: Wait for all pods to be ready
+        run: |
+          kubectl wait pod \
+            --for=condition=Ready \
+            --all \
+            -n "${NAMESPACE}" \
+            --timeout=15m
+          sleep ${{ matrix.accelerator.pod_readiness_sleep_seconds }} # TODO: remove this once examples have readiness probes
+          echo "✅ All pods are ready."
+          kubectl get pods -n "${NAMESPACE}"
+
+      # Block until the Gateway named inference-gateway in the namespace has
+      # condition Programmed=True (300 second limit).
+      - name: Wait for gateway to be ready
+        run: |
+          GATEWAY_NAME=inference-gateway
+          kubectl wait gateway/${GATEWAY_NAME} \
+            --for=condition=Programmed=True \
+            -n "${NAMESPACE}" \
+            --timeout=300s
+          echo "✅ Gateway is ready."
+          kubectl get gateway -n "${NAMESPACE}"
+
+      # Diagnostic dump of the namespace state before the tests run. The first
+      # three listings fail the step on error; the remaining ones are
+      # best-effort (`|| true`).
+      - name: Show deployment status
+        run: |
+          echo "=== Deployments ==="
+          kubectl get deployments -n "${NAMESPACE}"
+          echo ""
+          echo "=== Pods ==="
+          kubectl get pods -n "${NAMESPACE}"
+          echo ""
+          echo "=== Services ==="
+          kubectl get svc -n "${NAMESPACE}"
+          echo ""
+          echo "=== Helm releases ==="
+          helm list -n "${NAMESPACE}" || true
+          echo ""
+          echo "=== Inference Pools ==="
+          kubectl get inferencepools -n "${NAMESPACE}" || true
+          echo ""
+          echo "=== HTTPRoutes ==="
+          kubectl get httproutes -n "${NAMESPACE}" || true
+          echo ""
+          echo "=== Gateway ==="
+          kubectl get Gateway -n "${NAMESPACE}" || true
+          echo ""
+
+      # e2e-validate.sh exits with an error unless a model is supplied through
+      # -m or the MODEL_ID environment variable, so one is passed explicitly.
+      # NOTE(review): the value must equal the model served by
+      # config/manifests/vllm/gpu-deployment.yaml — confirm before merging.
+      # The script is run through `bash` so the step does not depend on the
+      # file's executable bit.
+      - name: Verify installation and run inference tests
+        env:
+          MODEL_ID: meta-llama/Llama-3.1-8B-Instruct
+        run: |
+          cd .github/scripts/e2e
+          bash ./e2e-validate.sh -n "${NAMESPACE}" -m "${MODEL_ID}" -v
+
+      # Runs regardless of earlier failures. Writes one <pod>.log and one
+      # <pod>-describe.log per pod in the namespace into
+      # pod-logs-inference-prefix-cache, then moves the deployment log there.
+      # ${NAMESPACE} inside the single-quoted sh -c strings is expanded by the
+      # child shell from the job environment.
+      # NOTE(review): install-deps.log is not written by any step in this
+      # file; `|| true` keeps the step from failing when it is absent.
+      - name: Collect and upload Kubernetes pod logs
+        if: always()
+        run: |
+            mkdir -p pod-logs-inference-prefix-cache
+            cd pod-logs-inference-prefix-cache
+            echo "Fetching ${NAMESPACE} pods log..."
+            kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
+            | xargs -I{} sh -c 'kubectl logs --all-containers=true -n "${NAMESPACE}" {} > "{}.log" 2>&1'
+            echo "Fetching ${NAMESPACE} pods descriptions..."
+            kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
+            | xargs -I{} sh -c 'kubectl describe pod -n "${NAMESPACE}" {} > "{}-describe.log" 2>&1'
+            mv ~/igw-prefix-cache-deployment.log . || true
+            mv ~/install-deps.log . || true
+
+      # Publish the collected directory as a per-accelerator artifact, also on
+      # failure.
+      - name: Upload pod logs as artifact
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: igw-pod-logs-inference-prefix-cache-${{ matrix.accelerator.name }}
+          path: pod-logs-inference-prefix-cache
+
+      # Only when an earlier step failed: post the job status to the Google
+      # Chat webhook stored in the GOOGLE_CHAT_WEBHOOK secret.
+      - name: Send Google Chat notification on failure
+        if: failure()
+        uses: SimonScholz/google-chat-action@3b3519e5102dba8aa5046fd711c4b553586409bb
+        with:
+          webhookUrl: ${{ secrets.GOOGLE_CHAT_WEBHOOK }}
+          jobStatus: ${{ job.status }}
+          title: '${{ github.workflow }} - ${{ matrix.accelerator.name }}'
+
+      # Best-effort teardown. A run script stops at its first failing command,
+      # so every deletion tolerates an already-missing resource; otherwise one
+      # failure (for example a release that was never installed) would skip
+      # the remaining deletions.
+      - name: Cleanup deployment
+        if: always()
+        run: |
+          GATEWAY_NAME=inference-gateway
+          helm uninstall vllm-llama3-8b-instruct -n "${NAMESPACE}" || true
+          kubectl delete -f httproute.gke.yaml -n "${NAMESPACE}" --ignore-not-found || true
+          kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found || true
+          # Also remove the model server applied by the deploy step so it does
+          # not keep running after the job has finished.
+          kubectl delete -f config/manifests/vllm/gpu-deployment.yaml -n "${NAMESPACE}" --ignore-not-found || true