Skip to content
Open
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
gcr.io/khan-academy-staging/admin@sha256:05e1e2e8a7853510f42b61054ea987c25aea11a0391444d33302b609a8462a57
122 changes: 106 additions & 16 deletions terraform/modules/cloud-build-docker/build_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import os
import subprocess
import sys
import time


def run_command(cmd, **kwargs):
Expand All @@ -30,25 +31,31 @@ def run_command(cmd, **kwargs):
def get_image_digest(image_uri, tag, project_id):
"""Query the digest of an existing image."""
try:
# Use list-tags to get all tags, then filter in Python for exact match
# This avoids issues where "tags:latest" matches both "latest" and "latest-builder"
Comment on lines +34 to +35
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the issue is you want an exact match, use tags={tag} instead of tags:{tag} on (the old) line 41. At least, that's my reading of gcloud topic filters.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In any case, it seems like this is fixing an issue that is unrelated to what this PR is trying to accomplish? Seems like the updates to build_image.py should be their own PR, no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was all work that was done during the hackathon. We will have a number of similar changesets (mostly in webapp) that may not seem totally related. I'd prefer to avoid putting together dozens of PRs and instead group changes together in a reasonable way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like having a PR be one change, instead of a few random changes that just happen to be in the same file. It makes it easier to review, easier to debug, and easier to roll back in case of problems. I don't think the overhead of creating a PR is that high?

But if you want to do things this way, then, as a reviewer, I will ask you to document every change you make with the logical change it goes with, so I can understand the PR better.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(You can do that in the commit message if you want -- it's probably the best way to go about it -- but you should be extra-clear what code changes go with what logical change.)

result = run_command(
[
"gcloud",
"container",
"images",
"list-tags",
image_uri,
"--filter",
f"tags:{tag}",
"--limit",
"1",
"--format=get(digest)",
"--format=json",
"--project",
project_id,
]
)
digest = result.stdout.strip()
if digest:
return f"{image_uri}@{digest}"

import json
images = json.loads(result.stdout)

# Find the image with the exact tag match
for image in images:
if tag in image.get("tags", []):
digest = image.get("digest")
if digest:
return f"{image_uri}@{digest}"

return None
except subprocess.CalledProcessError:
return None
Expand All @@ -57,23 +64,29 @@ def get_image_digest(image_uri, tag, project_id):
def check_cache_tag_exists(image_uri, cache_tag, project_id):
"""Check if a cache tag exists for the given image."""
try:
# Get all tags and filter for exact match to avoid partial matches
result = run_command(
[
"gcloud",
"container",
"images",
"list-tags",
image_uri,
"--filter",
f"tags:{cache_tag}",
"--limit",
"1",
"--format=get(digest)",
"--format=json",
"--project",
project_id,
]
)
return bool(result.stdout.strip())

import json
images = json.loads(result.stdout)

# Check if any image has this exact tag
for image in images:
if cache_tag in image.get("tags", []):
return True

return False
except subprocess.CalledProcessError:
return False

Expand Down Expand Up @@ -103,6 +116,7 @@ def build_image(
project_id,
image_tag_suffix,
base_digest="latest",
region="us-central1",
):
"""Build a Docker image via Cloud Build and return its digest."""

Expand Down Expand Up @@ -172,20 +186,94 @@ def build_image(
cloudbuild_config = os.path.join(script_dir, "cloudbuild.yml")

# TODO(jwbron): Consider adding automatic GCS bucket creation with import support for existing buckets in terraform
run_command(
# Submit build asynchronously to avoid rate limit issues when running many parallel builds
print(f"Submitting build for {image_name} in region {region}...", file=sys.stderr)
result = subprocess.run(
[
"gcloud",
"builds",
"submit",
context_path,
f"--config={cloudbuild_config}",
f"--project={project_id}",
f"--region={region}",
f"--gcs-source-staging-dir=gs://{project_id}-cloudbuild-ci/staging",
f"--gcs-log-dir=gs://{project_id}-cloudbuild-ci/logs",
f"--substitutions={subs_str}",
]
"--async", # Don't wait/poll - avoids rate limit issues
],
check=True,
capture_output=True,
text=True,
)

# Extract build ID from async output (table format)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can do --format=json to get json output instead of table output. That will be a lot easier, and more robust, to parse.

Though maybe --format=get(id) would be even better!

# Expected format:
# ID CREATE_TIME DURATION SOURCE IMAGES STATUS
# 4b1a6381-bdd5-477c-bd47-4eb376987d88 2025-11-06T07:11:53+00:00 - ... - QUEUED
build_id = None
lines = result.stdout.splitlines()
for i, line in enumerate(lines):
# Find the header line (contains "ID" and "CREATE_TIME")
if "ID" in line and "CREATE_TIME" in line:
# Build ID is in the first column of the next line
if i + 1 < len(lines):
data_line = lines[i + 1].strip()
if data_line:
# Extract first field (build ID)
build_id = data_line.split()[0]
break

if not build_id:
raise RuntimeError(f"Failed to extract build ID from output: {result.stdout}")

print(f"Build submitted: {build_id}", file=sys.stderr)
print(f"Waiting for build to complete...", file=sys.stderr)

# Poll build status using 'gcloud builds describe'
# Use exponential backoff to reduce API calls and avoid rate limits
poll_interval = 10 # Start with 10 seconds
max_interval = 60 # Cap at 60 seconds
elapsed = 0

while True:
time.sleep(poll_interval)
elapsed += poll_interval

try:
result = run_command(
[
"gcloud",
"builds",
"describe",
build_id,
f"--project={project_id}",
f"--region={region}",
"--format=get(status)",
]
)
status = result.stdout.strip()

if status == "SUCCESS":
print(f"Build completed successfully after {elapsed}s", file=sys.stderr)
break
elif status in ["FAILURE", "TIMEOUT", "CANCELLED", "INTERNAL_ERROR"]:
print(f"\nBuild {status} after {elapsed}s", file=sys.stderr)
print(f"\nTo view build logs:", file=sys.stderr)
print(f" gcloud builds log {build_id} --project={project_id}", file=sys.stderr)
print(f"Or visit: https://console.cloud.google.com/cloud-build/builds/{build_id}?project={project_id}", file=sys.stderr)
raise RuntimeError(f"Build {build_id} {status}")
elif status in ["QUEUED", "WORKING"]:
print(f"Build status: {status} (elapsed: {elapsed}s)", file=sys.stderr)
# Increase poll interval (exponential backoff, capped)
poll_interval = min(poll_interval * 1.5, max_interval)
else:
print(f"Unknown build status: {status}", file=sys.stderr)

except subprocess.CalledProcessError as e:
print(f"\nFailed to check build status: {e}", file=sys.stderr)
raise RuntimeError(f"Failed to check status for build {build_id}")

# Query the digest of the newly built image
digest = get_image_digest(image_uri, image_tag_suffix, project_id)
if not digest:
Expand Down Expand Up @@ -216,6 +304,7 @@ def main():
project_id = input_data["project_id"]
image_tag_suffix = input_data["image_tag_suffix"]
base_digest = input_data.get("base_digest", "latest")
region = input_data.get("region", "us-central1")

# Validate that image_tag_suffix is not empty
if not image_tag_suffix or image_tag_suffix.strip() == "":
Expand All @@ -229,6 +318,7 @@ def main():
project_id=project_id,
image_tag_suffix=image_tag_suffix,
base_digest=base_digest,
region=region,
)

# Return JSON output for Terraform
Expand Down
46 changes: 29 additions & 17 deletions terraform/modules/cloud-build-docker/cloudbuild.yml
Original file line number Diff line number Diff line change
@@ -1,36 +1,48 @@
# Cloud Build configuration with branch-based caching.
# Enables caching by pulling the previous image with a given "cache tag", if it exists, and reusing its layers.
# Cloud Build configuration with BuildKit inline cache for multi-stage builds.
# BuildKit is required to properly cache intermediate stages in multi-stage builds.
steps:
- name: 'gcr.io/cloud-builders/docker'
id: Pull previous image
id: Pull cache images
entrypoint: bash
args:
- -c
- |
echo "Attempting to pull cache image: $_IMAGE_NAME:$_CACHE_TAG"
docker pull "$_IMAGE_NAME:$_CACHE_TAG" || echo "Cache image not found, will build without cache"
echo "Attempting to pull cache image..."
# With BuildKit inline cache, we only need to pull the final image
# which contains metadata about all intermediate stages
docker pull "$_IMAGE_NAME:$_CACHE_TAG" || echo "Cache image not found, building from scratch"
- name: 'gcr.io/cloud-builders/docker'
id: Build image
id: Build image with BuildKit
entrypoint: bash
env:
- 'DOCKER_BUILDKIT=1'
args:
- -c
- |
if docker image inspect "$_IMAGE_NAME:$_CACHE_TAG" > /dev/null 2>&1; then
echo "Using cache from $_IMAGE_NAME:$_CACHE_TAG"
docker build --tag="$_IMAGE_TAG" --cache-from="$_IMAGE_NAME:$_CACHE_TAG" --build-arg BASE_IMAGE="$_BASE_DIGEST" .
else
echo "No cache found for $_IMAGE_NAME:$_CACHE_TAG, building without --cache-from"
docker build --tag="$_IMAGE_TAG" --build-arg BASE_IMAGE="$_BASE_DIGEST" .
fi
echo "Building with BuildKit and inline cache..."
# Build with BuildKit inline cache
# --cache-from pulls cache metadata from previous build
# --build-arg BUILDKIT_INLINE_CACHE=1 embeds cache metadata in the image
docker build \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work if the --cache-from argument isn't found? Or do you still need the if like you had in the old code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current code should work fine without the conditional. Current approach (works correctly):

  • Line 13: Try to pull cache image with || echo - if pull fails, just print message and continue (doesn't exit)
  • Line 30: Always use --cache-from in the build command

Docker/BuildKit behavior:

  • --cache-from is graceful - if the referenced image doesn't exist locally, Docker just ignores it and builds without cache
  • No error is thrown, build continues normally

--tag="$_IMAGE_TAG" \
--cache-from="$_IMAGE_NAME:$_CACHE_TAG" \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--build-arg BASE_IMAGE="$_BASE_DIGEST" \
.
- name: 'gcr.io/cloud-builders/docker'
id: Tag cache
id: Tag cache image
entrypoint: bash
args: ['-c', 'docker tag "$_IMAGE_TAG" "$_IMAGE_NAME:$_CACHE_TAG"']
waitFor: ['Build image']
waitFor: ['Build image with BuildKit']

- name: 'gcr.io/cloud-builders/docker'
id: Push cache
id: Push cache image
entrypoint: bash
args: ['-c', 'docker push "$_IMAGE_NAME:$_CACHE_TAG"']
waitFor: ['Tag cache']
waitFor: ['Tag cache image']

images:
- '$_IMAGE_TAG'
1 change: 1 addition & 0 deletions terraform/modules/cloud-build-docker/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data "external" "image_build" {
project_id = var.project_id
image_tag_suffix = var.image_tag_suffix
base_digest = var.base_digest
region = var.region
}

# Trigger rebuild when any of these change
Expand Down
6 changes: 6 additions & 0 deletions terraform/modules/cloud-build-docker/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,9 @@ variable "base_digest" {
type = string
default = "latest"
}

variable "region" {
description = "The GCP region where Cloud Build jobs will run"
type = string
default = "us-central1"
}