Skip to content
Open
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
gcr.io/khan-academy-staging/admin@sha256:05e1e2e8a7853510f42b61054ea987c25aea11a0391444d33302b609a8462a57
122 changes: 106 additions & 16 deletions terraform/modules/cloud-build-docker/build_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import os
import subprocess
import sys
import time


def run_command(cmd, **kwargs):
Expand All @@ -30,25 +31,31 @@ def run_command(cmd, **kwargs):
def get_image_digest(image_uri, tag, project_id):
"""Query the digest of an existing image."""
try:
# Use list-tags to get all tags, then filter in Python for exact match
# This avoids issues where "tags:latest" matches both "latest" and "latest-builder"
Comment on lines +34 to +35
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the issue is you want an exact match, use tags={tag} instead of tags:{tag} on (the old) line 41. At least, that's my reading of gcloud topic filters.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In any case, it seems like this is fixing an issue that is unrelated to what this PR is trying to accomplish? Seems like the updates to build_image.py should be their own PR, no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was all work that was done during the hackathon. We will have a number of similar changesets (mostly in webapp) that may not seem totally related. I'd prefer to avoid putting together dozens of PRs and instead group changes together in a reasonable way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like having a PR be one change, instead of a few random changes that just happen to be in the same file. It makes it easier to review, easier to debug, and easier to roll back in case of problems. I don't think the overhead of creating a PR is that high?

But if you want to do things this way, then, as a reviewer, I will ask you to document every change you make with the logical change it goes with, so I can understand the PR better.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(You can do that in the commit message if you want -- it's probably the best way to go about it -- but you should be extra-clear what code changes go with what logical change.)

result = run_command(
[
"gcloud",
"container",
"images",
"list-tags",
image_uri,
"--filter",
f"tags:{tag}",
"--limit",
"1",
"--format=get(digest)",
"--format=json",
"--project",
project_id,
]
)
digest = result.stdout.strip()
if digest:
return f"{image_uri}@{digest}"

import json
images = json.loads(result.stdout)

# Find the image with the exact tag match
for image in images:
if tag in image.get("tags", []):
digest = image.get("digest")
if digest:
return f"{image_uri}@{digest}"

return None
except subprocess.CalledProcessError:
return None
Expand All @@ -57,23 +64,29 @@ def get_image_digest(image_uri, tag, project_id):
def check_cache_tag_exists(image_uri, cache_tag, project_id):
"""Check if a cache tag exists for the given image."""
try:
# Get all tags and filter for exact match to avoid partial matches
result = run_command(
[
"gcloud",
"container",
"images",
"list-tags",
image_uri,
"--filter",
f"tags:{cache_tag}",
"--limit",
"1",
"--format=get(digest)",
"--format=json",
"--project",
project_id,
]
)
return bool(result.stdout.strip())

import json
images = json.loads(result.stdout)

# Check if any image has this exact tag
for image in images:
if cache_tag in image.get("tags", []):
return True

return False
except subprocess.CalledProcessError:
return False

Expand Down Expand Up @@ -103,6 +116,7 @@ def build_image(
project_id,
image_tag_suffix,
base_digest="latest",
region="us-central1",
):
"""Build a Docker image via Cloud Build and return its digest."""

Expand Down Expand Up @@ -172,20 +186,94 @@ def build_image(
cloudbuild_config = os.path.join(script_dir, "cloudbuild.yml")

# TODO(jwbron): Consider adding automatic GCS bucket creation with import support for existing buckets in terraform
run_command(
# Submit build asynchronously to avoid rate limit issues when running many parallel builds
print(f"Submitting build for {image_name} in region {region}...", file=sys.stderr)
result = subprocess.run(
[
"gcloud",
"builds",
"submit",
context_path,
f"--config={cloudbuild_config}",
f"--project={project_id}",
f"--region={region}",
f"--gcs-source-staging-dir=gs://{project_id}-cloudbuild-ci/staging",
f"--gcs-log-dir=gs://{project_id}-cloudbuild-ci/logs",
f"--substitutions={subs_str}",
]
"--async", # Don't wait/poll - avoids rate limit issues
],
check=True,
capture_output=True,
text=True,
)

# Extract build ID from async output (table format)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can do --format=json to get json output instead of table output. That will be a lot easier, and more robust, to parse.

Though maybe --format=get(id) would be even better!

# Expected format:
# ID CREATE_TIME DURATION SOURCE IMAGES STATUS
# 4b1a6381-bdd5-477c-bd47-4eb376987d88 2025-11-06T07:11:53+00:00 - ... - QUEUED
build_id = None
lines = result.stdout.splitlines()
for i, line in enumerate(lines):
# Find the header line (contains "ID" and "CREATE_TIME")
if "ID" in line and "CREATE_TIME" in line:
# Build ID is in the first column of the next line
if i + 1 < len(lines):
data_line = lines[i + 1].strip()
if data_line:
# Extract first field (build ID)
build_id = data_line.split()[0]
break

if not build_id:
raise RuntimeError(f"Failed to extract build ID from output: {result.stdout}")

print(f"Build submitted: {build_id}", file=sys.stderr)
print(f"Waiting for build to complete...", file=sys.stderr)

# Poll build status using 'gcloud builds describe'
# Use exponential backoff to reduce API calls and avoid rate limits
poll_interval = 10 # Start with 10 seconds
max_interval = 60 # Cap at 60 seconds
elapsed = 0

while True:
time.sleep(poll_interval)
elapsed += poll_interval

try:
result = run_command(
[
"gcloud",
"builds",
"describe",
build_id,
f"--project={project_id}",
f"--region={region}",
"--format=get(status)",
]
)
status = result.stdout.strip()

if status == "SUCCESS":
print(f"Build completed successfully after {elapsed}s", file=sys.stderr)
break
elif status in ["FAILURE", "TIMEOUT", "CANCELLED", "INTERNAL_ERROR"]:
print(f"\nBuild {status} after {elapsed}s", file=sys.stderr)
print(f"\nTo view build logs:", file=sys.stderr)
print(f" gcloud builds log {build_id} --project={project_id}", file=sys.stderr)
print(f"Or visit: https://console.cloud.google.com/cloud-build/builds/{build_id}?project={project_id}", file=sys.stderr)
raise RuntimeError(f"Build {build_id} {status}")
elif status in ["QUEUED", "WORKING"]:
print(f"Build status: {status} (elapsed: {elapsed}s)", file=sys.stderr)
# Increase poll interval (exponential backoff, capped)
poll_interval = min(poll_interval * 1.5, max_interval)
else:
print(f"Unknown build status: {status}", file=sys.stderr)

except subprocess.CalledProcessError as e:
print(f"\nFailed to check build status: {e}", file=sys.stderr)
raise RuntimeError(f"Failed to check status for build {build_id}")

# Query the digest of the newly built image
digest = get_image_digest(image_uri, image_tag_suffix, project_id)
if not digest:
Expand Down Expand Up @@ -216,6 +304,7 @@ def main():
project_id = input_data["project_id"]
image_tag_suffix = input_data["image_tag_suffix"]
base_digest = input_data.get("base_digest", "latest")
region = input_data.get("region", "us-central1")

# Validate that image_tag_suffix is not empty
if not image_tag_suffix or image_tag_suffix.strip() == "":
Expand All @@ -229,6 +318,7 @@ def main():
project_id=project_id,
image_tag_suffix=image_tag_suffix,
base_digest=base_digest,
region=region,
)

# Return JSON output for Terraform
Expand Down
46 changes: 29 additions & 17 deletions terraform/modules/cloud-build-docker/cloudbuild.yml
Original file line number Diff line number Diff line change
@@ -1,36 +1,48 @@
# Cloud Build configuration with branch-based caching.
# Enables caching by pulling the previous image with a given "cache tag", if it exists, and reusing its layers.
# Cloud Build configuration with BuildKit inline cache for multi-stage builds.
# BuildKit is required to properly cache intermediate stages in multi-stage builds.
steps:
- name: 'gcr.io/cloud-builders/docker'
id: Pull previous image
id: Pull cache images
entrypoint: bash
args:
- -c
- |
echo "Attempting to pull cache image: $_IMAGE_NAME:$_CACHE_TAG"
docker pull "$_IMAGE_NAME:$_CACHE_TAG" || echo "Cache image not found, will build without cache"
echo "Attempting to pull cache image..."
# With BuildKit inline cache, we only need to pull the final image
# which contains metadata about all intermediate stages
docker pull "$_IMAGE_NAME:$_CACHE_TAG" || echo "Cache image not found, building from scratch"
- name: 'gcr.io/cloud-builders/docker'
id: Build image
id: Build image with BuildKit
entrypoint: bash
env:
- 'DOCKER_BUILDKIT=1'
args:
- -c
- |
if docker image inspect "$_IMAGE_NAME:$_CACHE_TAG" > /dev/null 2>&1; then
echo "Using cache from $_IMAGE_NAME:$_CACHE_TAG"
docker build --tag="$_IMAGE_TAG" --cache-from="$_IMAGE_NAME:$_CACHE_TAG" --build-arg BASE_IMAGE="$_BASE_DIGEST" .
else
echo "No cache found for $_IMAGE_NAME:$_CACHE_TAG, building without --cache-from"
docker build --tag="$_IMAGE_TAG" --build-arg BASE_IMAGE="$_BASE_DIGEST" .
fi
echo "Building with BuildKit and inline cache..."
# Build with BuildKit inline cache
# --cache-from pulls cache metadata from previous build
# --build-arg BUILDKIT_INLINE_CACHE=1 embeds cache metadata in the image
docker build \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work if the --cache-from argument isn't found? Or do you still need the if like you had in the old code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current code should work fine without the conditional. Current approach (works correctly):

  • Line 13: Try to pull cache image with || echo - if pull fails, just print message and continue (doesn't exit)
  • Line 30: Always use --cache-from in the build command

Docker/BuildKit behavior:

  • --cache-from is graceful - if the referenced image doesn't exist locally, Docker just ignores it and builds without cache
  • No error is thrown, build continues normally

--tag="$_IMAGE_TAG" \
--cache-from="$_IMAGE_NAME:$_CACHE_TAG" \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--build-arg BASE_IMAGE="$_BASE_DIGEST" \
.
- name: 'gcr.io/cloud-builders/docker'
id: Tag cache
id: Tag cache image
entrypoint: bash
args: ['-c', 'docker tag "$_IMAGE_TAG" "$_IMAGE_NAME:$_CACHE_TAG"']
waitFor: ['Build image']
waitFor: ['Build image with BuildKit']

- name: 'gcr.io/cloud-builders/docker'
id: Push cache
id: Push cache image
entrypoint: bash
args: ['-c', 'docker push "$_IMAGE_NAME:$_CACHE_TAG"']
waitFor: ['Tag cache']
waitFor: ['Tag cache image']

images:
- '$_IMAGE_TAG'
1 change: 1 addition & 0 deletions terraform/modules/cloud-build-docker/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data "external" "image_build" {
project_id = var.project_id
image_tag_suffix = var.image_tag_suffix
base_digest = var.base_digest
region = var.region
}

# Trigger rebuild when any of these change
Expand Down
6 changes: 6 additions & 0 deletions terraform/modules/cloud-build-docker/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,9 @@ variable "base_digest" {
type = string
default = "latest"
}

variable "region" {
description = "The GCP region where Cloud Build jobs will run"
type = string
default = "us-central1"
}