Skip to content

Commit 7e1a263

Browse files
authored
Fix deployment on arm64 architecture (#109)
Fix service build and deployment on arm64 architecture.
1 parent fc2d2dc commit 7e1a263

17 files changed

+108
-72
lines changed

.github/workflows/build-deploy-changes.yaml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ jobs:
2323
image: ubuntu:latest
2424
volumes:
2525
- /var/run/docker.sock:/var/run/docker.sock
26+
env:
27+
DOCKER_BUILDKIT: "1"
2628
steps:
2729
- name: Install git
2830
run: |
@@ -76,8 +78,9 @@ jobs:
7678
- name: Install Package
7779
if: steps.check.outputs.has_changed == 'true'
7880
run: |
79-
DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip docker-cli ca-certificates curl apt-transport-https lsb-release gnupg parallel
81+
DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip ca-certificates curl apt-transport-https lsb-release gnupg parallel
8082
curl -sL https://aka.ms/InstallAzureCLIDeb | bash
83+
curl -fsSL https://get.docker.com | sh
8184
8285
- name: Install python libs
8386
if: steps.check.outputs.has_changed == 'true'
@@ -98,6 +101,11 @@ jobs:
98101
mv $GITHUB_WORKSPACE/config/auth-configuration /tmp/
99102
ls -l /tmp/auth-configuration
100103
104+
- name: Login to GHCR
105+
if: steps.check.outputs.has_changed == 'true'
106+
run: |
107+
docker login ghcr.io -u ${{ github.actor }} -p ${{ secrets.GITHUB_TOKEN }}
108+
101109
- name: Build Images of Changed Services
102110
if: steps.check.outputs.has_changed == 'true'
103111
run: |

contrib/kubespray/script/environment.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,5 @@ sudo python3 -m pip install -r ${HOME}/pai-deploy/kubespray/requirements.txt
5151

5252
# workaround python3-apt issue
5353
SOABI=$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("SOABI"))')
54-
sudo ln -s /usr/lib/python3/dist-packages/apt_inst.${SOABI}.so /usr/lib/python3/dist-packages/apt_inst.so
55-
sudo ln -s /usr/lib/python3/dist-packages/apt_pkg.${SOABI}.so /usr/lib/python3/dist-packages/apt_pkg.so
54+
sudo ln -sf /usr/lib/python3/dist-packages/apt_inst.${SOABI}.so /usr/lib/python3/dist-packages/apt_inst.so
55+
sudo ln -sf /usr/lib/python3/dist-packages/apt_pkg.${SOABI}.so /usr/lib/python3/dist-packages/apt_pkg.so
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
FROM everpeace/k8s-host-device-plugin
5+
6+
# TODO: add arm64 image
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
FROM nvcr.io/nvidia/k8s-device-plugin:v0.18.0
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
FROM ghcr.io/mellanox/k8s-rdma-shared-dev-plugin:1.4.0
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
FROM rocm/k8s-device-plugin
5+
6+
# no arm64 support

src/device-plugin/deploy/device-plugin.yaml renamed to src/device-plugin/deploy/device-plugin.yaml.template

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,15 @@ spec:
2929
name: k8s-host-device-plugin-ds
3030
template:
3131
metadata:
32-
annotations:
33-
scheduler.alpha.kubernetes.io/critical-pod: ""
3432
labels:
3533
name: k8s-host-device-plugin-ds
3634
spec:
35+
priorityClassName: pai-daemon-priority
3736
tolerations:
3837
- key: CriticalAddonsOnly
3938
operator: Exists
4039
containers:
41-
- image: luciaopenai.azurecr.io/luciaopenai/k8s-host-device-plugin:latest
40+
- image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}k8s-host-device-plugin:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }}
4241
name: k8s-host-device-plugin-ctr
4342
securityContext:
4443
privileged: true
@@ -62,3 +61,5 @@ spec:
6261
items:
6362
- key: config.json
6463
path: config.json
64+
imagePullSecrets:
65+
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}

src/device-plugin/deploy/service.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ prerequisite:
2525
template-list:
2626
- start.sh
2727
- delete.sh
28+
- device-plugin.yaml
2829

2930
start-script: start.sh
3031
stop-script: stop.sh

src/device-plugin/deploy/start.sh.template

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,25 +29,33 @@ pushd $(dirname "$0") > /dev/null
2929
# Begin: NVIDIA GPU device plugin
3030
{% if 'nvidia.com/gpu' in cluster_cfg['device-plugin']['devices'] %}
3131

32-
curl -s https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml \
33-
| sed 's|nvcr.io/nvidia/k8s-device-plugin|luciaopenai.azurecr.io/luciaopenai/nvidia/k8s-device-plugin|' \
32+
{ curl -s https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml \
33+
| sed 's|nvcr.io/nvidia/k8s-device-plugin:v0.15.0|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-nvidia-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \
3434
| sed -E '/^[[:space:]]*allowPrivilegeEscalation: false/ {
3535
h
3636
s/^([[:space:]]*)allowPrivilegeEscalation: false.*$/\1privileged: false/
3737
G
3838
s/(^[[:space:]]*allowPrivilegeEscalation: false.*)\n([[:space:]]*privileged: false)/\1\n\2/
39-
}' \
40-
| kubectl apply --overwrite=true -f - || exit $?
39+
}';
40+
cat <<'YAML'
41+
imagePullSecrets:
42+
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
43+
YAML
44+
} | kubectl apply --overwrite=true -f - || exit $?
4145

4246
{% endif %}
4347
# End: NVIDIA GPU device plugin
4448

4549
# Begin: AMD GPU device plugin
4650
{% if 'amd.com/gpu' in cluster_cfg['device-plugin']['devices'] %}
4751

48-
curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \
49-
| sed 's|rocm/k8s-device-plugin|luciaopenai.azurecr.io/luciaopenai/rocm/k8s-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \
50-
| kubectl apply --overwrite=true -f - || exit $?
52+
{ curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \
53+
| sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|';
54+
cat <<'YAML'
55+
imagePullSecrets:
56+
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
57+
YAML
58+
} | kubectl apply --overwrite=true -f - || exit $?
5159

5260
{% endif %}
5361
# End: AMD GPU device plugin
@@ -75,9 +83,13 @@ kubectl apply --overwrite=true -f device-plugin.yaml || exit $?
7583
{% if 'rdma/hca' in cluster_cfg['device-plugin']['devices'] %}
7684

7785
kubectl apply --overwrite=true -f rdma-devices.yaml || exit $?
78-
curl -s https://raw.githubusercontent.com/Mellanox/k8s-rdma-shared-dev-plugin/v1.4.0/deployment/k8s/base/daemonset.yaml \
79-
| sed 's|ghcr.io/mellanox/k8s-rdma-shared-dev-plugin|luciaopenai.azurecr.io/luciaopenai/k8s-rdma-shared-dev-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \
80-
| kubectl apply --overwrite=true -f - || exit $?
86+
{ curl -s https://raw.githubusercontent.com/Mellanox/k8s-rdma-shared-dev-plugin/v1.4.0/deployment/k8s/base/daemonset.yaml \
87+
| sed 's|ghcr.io/mellanox/k8s-rdma-shared-dev-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rdma-shared-dev-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|';
88+
cat <<'YAML'
89+
imagePullSecrets:
90+
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
91+
YAML
92+
} | kubectl apply --overwrite=true -f - || exit $?
8193

8294
{% endif %}
8395

src/frameworkcontroller/deploy/frameworkcontroller.yaml.template

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,5 @@ spec:
5151
- name: frameworkcontroller-config
5252
configMap:
5353
name: frameworkcontroller-config
54+
imagePullSecrets:
55+
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}

0 commit comments

Comments
 (0)