Commit d6d5c57

add guide for quantizing llm on gke (#1813)

* feat: add llm quantization example
* add readme file for quantize guide

1 parent 0a556a6
File tree

5 files changed: +230 −0 lines changed

- Dockerfile
- README.md
- job.yaml
- main.py
- requirements.txt
Dockerfile: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime

COPY requirements.txt ./

RUN pip install -r requirements.txt

COPY main.py ./

CMD ["python", "main.py"]
README.md: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
# LLM Quantization on GKE

This document describes how to run LLM quantization on a GKE cluster.

## Prerequisites

- A GKE cluster with NVIDIA H100 GPUs (one way to create one is sketched after this list).
- `gcloud` CLI installed and configured.
- `kubectl` CLI installed and configured.
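
If you do not already have a cluster, the following is a minimal sketch of creating a GKE Autopilot cluster; the cluster name and location are placeholders. On Autopilot, the `nodeSelector` in `job.yaml` requests an H100 node on demand, so no node pool needs to be created up front.

```bash
# Minimal sketch; cluster name and location are placeholders.
gcloud container clusters create-auto llm-quantize-cluster \
    --location=us-central1
gcloud container clusters get-credentials llm-quantize-cluster \
    --location=us-central1
```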

## Steps

1. **Create an Artifact Registry repository:**

    ```bash
    export REPO_NAME=llm-quantize
    export REGION=us-central1
    gcloud artifacts repositories create $REPO_NAME --repository-format=docker --location=$REGION
    ```

2. **Build and push the Docker image:**

    ```bash
    export IMAGE_URL=${REGION}-docker.pkg.dev/$(gcloud config get-value project)/${REPO_NAME}/llm-processor-gptq
    gcloud auth configure-docker ${REGION}-docker.pkg.dev
    gcloud builds submit --tag $IMAGE_URL .
    ```
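
    If you prefer building locally to using Cloud Build, an equivalent sketch (assuming Docker is installed; the `gcloud auth configure-docker` step above sets up registry credentials):

    ```bash
    docker build -t $IMAGE_URL .
    docker push $IMAGE_URL
    ```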

3. **Set environment variables.** The default `MODEL_ID` is a gated model, so the token must belong to a Hugging Face account with access to it, and it needs write scope because `main.py` pushes the quantized model back to the Hub:

    ```bash
    export MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
    export HF_TOKEN="your-hugging-face-token"
    ```

4. **Create a Kubernetes secret for the Hugging Face token:**

    ```bash
    kubectl create secret generic hf-secret --from-literal=hf_api_token=$HF_TOKEN
    ```
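
    To confirm the secret exists without printing the token, you can describe it:

    ```bash
    kubectl describe secret hf-secret
    ```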

5. **Deploy the quantization Job to GKE.** `envsubst` fills the `$IMAGE_URL` and `$MODEL_ID` placeholders in `job.yaml` from your shell environment before the manifest is applied:

    ```bash
    envsubst < job.yaml | kubectl apply -f -
    ```
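
    To preview the rendered manifest before applying it, a client-side dry run works:

    ```bash
    envsubst < job.yaml | kubectl apply --dry-run=client -f -
    ```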

6. **Monitor the Job:**

    ```bash
    kubectl get pods -w
    ```
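
    If the pod stays `Pending` while GKE provisions an H100 node, its scheduling events explain why:

    ```bash
    kubectl describe pod -l job-name=quantize
    ```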

7. **View the logs:**

    ```bash
    kubectl logs -f -l job-name=quantize
    ```
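
Once the Job completes, `main.py` pushes the compressed model to your Hugging Face account as `<model-name>-W4A16-G128`. A minimal sketch of serving it with vLLM (not part of this guide; assumes vLLM is installed and `your-username` is a placeholder for your Hugging Face namespace):

```python
# Minimal sketch: serve the quantized model with vLLM.
# "your-username" is a placeholder for your Hugging Face namespace.
from vllm import LLM, SamplingParams

llm = LLM(model="your-username/Meta-Llama-3-8B-Instruct-W4A16-G128")
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```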
job.yaml: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: batch/v1
kind: Job
metadata:
  name: quantize
spec:
  ttlSecondsAfterFinished: 100
  template:
    spec:
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-h100-80gb
      containers:
      - name: llm-compressor
        image: $IMAGE_URL
        command: ["python", "main.py"]
        resources:
          limits:
            nvidia.com/gpu: "1"
            cpu: "12"
            memory: "80Gi"
            ephemeral-storage: "80Gi"
        env:
        - name: LD_LIBRARY_PATH
          value: ${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64
        - name: MODEL_ID
          value: $MODEL_ID
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      restartPolicy: Never
main.py: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
model_id = os.environ["MODEL_ID"]
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm to run.
# * quantize the weights to 4-bit with GPTQ, using a group size of 128
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

model.push_to_hub(SAVE_DIR)
tokenizer.push_to_hub(SAVE_DIR)
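
A minimal sketch, not part of the commit, of reloading the compressed checkpoint that this script saves (assumes the `compressed-tensors` package is installed, which `llmcompressor` pulls in, and that `MODEL_ID` was left at its default):

```python
# Minimal sketch: reload the W4A16 checkpoint saved by main.py.
# The directory name matches the default MODEL_ID.
from transformers import AutoModelForCausalLM, AutoTokenizer

save_dir = "Meta-Llama-3-8B-Instruct-W4A16-G128"
model = AutoModelForCausalLM.from_pretrained(save_dir, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(save_dir)
```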
requirements.txt: 1 addition & 0 deletions

@@ -0,0 +1 @@
llmcompressor==0.8.1
