Skip to content

Fix deployment on arm64 architecture (#109) #229

Fix deployment on arm64 architecture (#109)

Fix deployment on arm64 architecture (#109) #229

# Workflow that builds and deploys the services whose sources changed.
name: Build & Deploy Changed Services

# Token scopes: write access to packages, read-only access to contents.
permissions:
  packages: write
  contents: read

# Runs on pushes to, and pull requests targeting, the listed branches.
on:
  push:
    branches:
      - main
      - dev
      - "release/*"
  pull_request:
    branches:
      - main
      - dev
      - "release/*"

# TAG carries the workflow run number and is visible to every job.
env:
  TAG: ${{ github.run_number }}
jobs:
  build:
    name: Build and Deploy
    # Runner must carry both labels.
    runs-on:
      - self-hosted
      - paicicd
    environment: auto-test
    timeout-minutes: 120
    # Steps execute inside this container; the host Docker socket is mounted
    # at the same path inside it.
    container:
      image: ubuntu:latest
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
    # NOTE(review): the original indentation was lost, so it is not possible
    # to tell whether this env block belonged to the job or to `container` —
    # confirm the intended level.
    env:
      DOCKER_BUILDKIT: "1"
    steps:
# Installs git in the job container ahead of the checkout step.
- name: Install git
  run: |
    # Keep apt from prompting during the unattended install.
    export DEBIAN_FRONTEND=noninteractive
    apt update
    apt install -y git
# Checks out the repository without submodules. Pull requests use the PR head
# commit; every other event uses the ref name that triggered the run.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.ref_name }}
    submodules: false
    # The following step diffs and merge-bases commits, so history is fetched
    # with depth 0 rather than the action's default.
    fetch-depth: 0
# Works out which second-level folders under src/ were touched between the
# base and head commits and exposes them as the space-separated step output
# `folders`.
- name: Get Changed Folders (Services)
  id: changes
  # Event data reaches the script through environment variables instead of
  # being expanded into the script text, so a crafted ref name cannot inject
  # shell commands.
  env:
    EVENT_NAME: ${{ github.event_name }}
    PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
    PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
    PUSH_BEFORE_SHA: ${{ github.event.before }}
    PUSH_HEAD_SHA: ${{ github.sha }}
  run: |
    git config --global --add safe.directory "$GITHUB_WORKSPACE"
    if [ "$EVENT_NAME" = "pull_request" ]; then
      echo "Pull request detected"
      # Fetch the merge base to get only PR changes
      git fetch origin "$PR_BASE_REF" --depth=50
      base_sha=$(git merge-base "origin/$PR_BASE_REF" "$PR_HEAD_SHA")
      head_sha="$PR_HEAD_SHA"
    else
      base_sha="$PUSH_BEFORE_SHA"
      head_sha="$PUSH_HEAD_SHA"
      # The first push of a branch reports an all-zero "before" SHA, and a
      # force push may report a commit that is not present locally. Neither
      # can be diffed, so fall back to the parent of the pushed commit.
      if ! git cat-file -e "${base_sha}^{commit}" 2>/dev/null; then
        echo "Previous commit '$base_sha' is not available; using parent of $head_sha"
        base_sha=$(git rev-parse "${head_sha}^")
      fi
    fi
    echo "Comparing $base_sha...$head_sha"
    changed_files=$(git diff --name-only "$base_sha" "$head_sha")
    echo "Changed files: $changed_files"
    # extract service folders under src/
    folders=$(echo "$changed_files" | grep '^src/' \
      | awk -F'/' '{print $2}' \
      | sort -u | tr '\n' ' ')
    echo "Changed folders: $folders"
    # export as output for next steps
    echo "folders=$folders" >> "$GITHUB_OUTPUT"
# Converts the folder list into the `has_changed` output that gates every
# later step of this job.
- name: Check if folders are empty
  id: check
  run: |
    if [ -n "${{ steps.changes.outputs.folders }}" ]; then
      echo "has_changed=true" >> $GITHUB_OUTPUT
    else
      echo "has_changed=false" >> $GITHUB_OUTPUT
    fi
# Installs the apt packages used by later steps, then runs the Azure CLI and
# Docker install scripts fetched from the URLs below.
- name: Install Package
  if: steps.check.outputs.has_changed == 'true'
  run: |
    DEBIAN_FRONTEND=noninteractive apt install -y \
      python3 python-is-python3 pip \
      git unzip ca-certificates curl \
      apt-transport-https lsb-release gnupg parallel
    curl -sL https://aka.ms/InstallAzureCLIDeb | bash
    curl -fsSL https://get.docker.com | sh
# Installs the Python packages into the system interpreter
# (--break-system-packages lets pip write there).
- name: Install python libs
  if: steps.check.outputs.has_changed == 'true'
  run: |
    python -m pip install --break-system-packages \
      pyyaml jinja2 paramiko etcd3 protobuf==3.20.3 kubernetes gitpython
# Decodes the base64 secret into config.zip and extracts it into
# $GITHUB_WORKSPACE/config, overwriting existing files.
- name: Decode and unzip config file
  if: steps.check.outputs.has_changed == 'true'
  run: |
    config_dir=$GITHUB_WORKSPACE/config
    echo "${{ secrets.CONFIG_FILE_B64 }}" | base64 -d > config.zip
    mkdir -p $config_dir
    unzip -o config.zip -d $config_dir
    ls -l $config_dir
# Moves the extracted auth-configuration folder to /tmp, replacing any copy
# already there.
- name: Arrange Config Files
  if: steps.check.outputs.has_changed == 'true'
  run: |
    auth_dir=/tmp/auth-configuration
    rm -rf $auth_dir
    mv $GITHUB_WORKSPACE/config/auth-configuration /tmp/
    ls -l $auth_dir
# Logs the Docker CLI in to ghcr.io as the triggering actor.
- name: Login to GHCR
  if: steps.check.outputs.has_changed == 'true'
  # Credentials are supplied through the environment and piped on stdin so
  # the token never appears in the process argument list.
  env:
    GHCR_USER: ${{ github.actor }}
    GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    printf '%s' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USER" --password-stdin
# Builds images for every changed service. alert-manager is handled
# separately: only the images named in its -i list are built, and it is
# removed from the list passed to the generic build.
- name: Build Images of Changed Services
  if: steps.check.outputs.has_changed == 'true'
  run: |
    changed_services="${{ steps.changes.outputs.folders }}"
    echo "Building: $changed_services"
    if [[ "$changed_services" == *"alert-manager"* ]]; then
      echo "alert-manager is in the changed services"
      changed_services=$(echo $changed_services | sed 's/alert-manager//g')
      # build specific images in alert-manager
      echo "Building specific alert-manager images"
      # Every line but the last needs a trailing backslash; without the one
      # after "-s alert-manager" the -i list ran as a separate command.
      $GITHUB_WORKSPACE/build/pai_build.py build \
        -c $GITHUB_WORKSPACE/config/cluster-configuration \
        -s alert-manager \
        -i abnormal-detector,alert-handler,alert-parser,cert-expiration-checker,cluster-utilization,job-data-recorder,job-status-change-notification,node-failure-detection,node-issue-classifier,nvidia-gpu-low-perf-fixer,redis-monitoring
    fi
    echo "Changed services after removing alert-manager: $changed_services"
    # When alert-manager was the only change the list is now blank; skip the
    # generic build rather than calling pai_build.py with an empty -s.
    if [[ -n "${changed_services//[[:space:]]/}" ]]; then
      $GITHUB_WORKSPACE/build/pai_build.py build \
        -c $GITHUB_WORKSPACE/config/cluster-configuration \
        -s $changed_services
    else
      echo "No other services to build"
    fi
# Pushes the images of all changed services with pai_build.py. No registry
# flags are passed here; presumably the target (ACR, per the step name) comes
# from the cluster configuration — confirm.
- name: Push Images of Changed Services to ACR
  if: steps.check.outputs.has_changed == 'true'
  run: |
    services="${{ steps.changes.outputs.folders }}"
    cluster_config=$GITHUB_WORKSPACE/config/cluster-configuration
    echo "Pushing: $services"
    $GITHUB_WORKSPACE/build/pai_build.py push -c $cluster_config -s $services
# Pushes the images of every changed service to ghcr.io under the repository
# owner's namespace. alert-manager is handled separately: only the images
# named in its -i list are pushed, and it is removed from the generic push.
- name: Push Images of Changed Service to GHCR
  if: steps.check.outputs.has_changed == 'true'
  run: |
    changed_services="${{ steps.changes.outputs.folders }}"
    echo "Pushing: $changed_services"
    # check whether alert-manager is in the changed services
    echo "Changed services before removing alert-manager: $changed_services"
    if [[ "$changed_services" == *"alert-manager"* ]]; then
      echo "alert-manager is in the changed services"
      changed_services=$(echo $changed_services | sed 's/alert-manager//g')
      # push specific images in alert-manager to GHCR
      echo "Pushing specific alert-manager images to GHCR"
      $GITHUB_WORKSPACE/build/pai_build.py push \
        -c $GITHUB_WORKSPACE/config/cluster-configuration \
        -s alert-manager \
        -i abnormal-detector,alert-handler,alert-parser,cert-expiration-checker,cluster-utilization,job-data-recorder,job-status-change-notification,node-failure-detection,node-issue-classifier,nvidia-gpu-low-perf-fixer,redis-monitoring \
        --docker-registry ghcr.io \
        --docker-namespace ${GITHUB_REPOSITORY_OWNER} \
        --docker-username ${{ github.actor }} \
        --docker-password ${{ secrets.GITHUB_TOKEN }}
    fi
    echo "Changed services after removing alert-manager: $changed_services"
    # When alert-manager was the only change the list is now blank; skip the
    # generic push rather than calling pai_build.py with an empty -s.
    if [[ -n "${changed_services//[[:space:]]/}" ]]; then
      $GITHUB_WORKSPACE/build/pai_build.py push \
        -c $GITHUB_WORKSPACE/config/cluster-configuration \
        -s $changed_services \
        --docker-registry ghcr.io \
        --docker-namespace ${GITHUB_REPOSITORY_OWNER} \
        --docker-username ${{ github.actor }} \
        --docker-password ${{ secrets.GITHUB_TOKEN }}
    else
      echo "No other services to push"
    fi
# Logs in to Azure with a managed identity, fetches AKS credentials, then
# stops the changed services, pushes the service config and starts them again
# through paictl.py.
- name: Azure CLI get credentials and deploy
  if: steps.check.outputs.has_changed == 'true'
  run: |
    az version
    az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }}
    az aks install-cli
    az aks get-credentials \
      --resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
      --name ${{ secrets.KUBERNETES_CLUSTER }} \
      --overwrite-existing
    kubelogin convert-kubeconfig -l azurecli
    kubectl config use-context ${{ secrets.KUBERNETES_CLUSTER }}
    # Replace "webportal" with "webportal-dind" if "webportal" is changed
    services_to_deploy="${{ steps.changes.outputs.folders }}"
    # The padding spaces make the pattern match whole service names only.
    case " $services_to_deploy " in
      *" webportal "*)
        kept=""
        for svc in $services_to_deploy; do
          case "$svc" in
            webportal | webportal-dind) ;;
            *) kept="$kept $svc" ;;
          esac
        done
        # xargs collapses the extra whitespace left by the concatenation.
        services_to_deploy=$(echo "$kept webportal-dind" | xargs)
        ;;
    esac
    echo "Final services to deploy: $services_to_deploy"
    # paictl.py is fed the cluster name on stdin for each invocation below.
    echo "${{ secrets.PAI_CLUSTER_NAME }}" > cluster_id
    echo "Stopping changed pai services $services_to_deploy on ${{ secrets.PAI_CLUSTER_NAME }} ..."
    $GITHUB_WORKSPACE/paictl.py service stop -n $services_to_deploy < cluster_id
    echo "Pushing config to cluster \"${{ secrets.PAI_CLUSTER_NAME }}\" ..."
    $GITHUB_WORKSPACE/paictl.py config push -m service -p $GITHUB_WORKSPACE/config/cluster-configuration < cluster_id
    echo "Starting to update $services_to_deploy on ${{ secrets.PAI_CLUSTER_NAME }} ..."
    $GITHUB_WORKSPACE/paictl.py service start -n $services_to_deploy < cluster_id
    kubectl get pod
    kubectl get service
# Smoke test that runs after the build job: queries the rest-server info
# endpoint and the virtual-clusters endpoint of the deployed cluster.
test:
  name: Test rest-server
  needs: build
  runs-on: [self-hosted, paicicd]
  environment: auto-test
  steps:
    - name: Test rest-server
      run: |
        echo "Testing rest-server ${{ secrets.PAI_WEB_URL }}/rest-server/api/v2/info"
        # --fail makes curl exit non-zero on HTTP error responses, so an
        # unhealthy rest-server fails the job instead of passing silently.
        curl --fail ${{ secrets.PAI_WEB_URL }}/rest-server/api/v2/info
        echo "Checking virtual cluster status..."
        # The request is tested directly in the `if`: a separate `$?` check
        # after the assignment is never reached when the shell runs with -e.
        if ! vc_info=$(curl --fail -H "Authorization: Bearer ${{ secrets.PAI_WEB_TOKEN }}" -s ${{ secrets.PAI_WEB_URL }}/rest-server/api/v2/virtual-clusters); then
          echo "Failed to access virtual cluster API"
          exit 1
        fi
        echo "Virtual cluster info: $vc_info"