Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .github/workflows/build-deploy-changes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ jobs:
image: ubuntu:latest
volumes:
- /var/run/docker.sock:/var/run/docker.sock
env:
DOCKER_BUILDKIT: "1"
steps:
- name: Install git
run: |
Expand Down Expand Up @@ -76,8 +78,9 @@ jobs:
- name: Install Package
if: steps.check.outputs.has_changed == 'true'
run: |
DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip docker-cli ca-certificates curl apt-transport-https lsb-release gnupg parallel
DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip ca-certificates curl apt-transport-https lsb-release gnupg parallel
curl -sL https://aka.ms/InstallAzureCLIDeb | bash
curl -fsSL https://get.docker.com | sh

- name: Install python libs
if: steps.check.outputs.has_changed == 'true'
Expand All @@ -98,6 +101,11 @@ jobs:
mv $GITHUB_WORKSPACE/config/auth-configuration /tmp/
ls -l /tmp/auth-configuration

- name: Login to GHCR
  if: steps.check.outputs.has_changed == 'true'
  # Credentials are handed to the script through environment variables and the
  # token is fed to `docker login` on stdin. Passing it with `-p` places the
  # secret on the process command line, and interpolating `${{ ... }}` directly
  # into the script lets the shell re-parse the expanded values.
  env:
    GHCR_USER: ${{ github.actor }}
    GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    printf '%s' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USER" --password-stdin

- name: Build Images of Changed Services
if: steps.check.outputs.has_changed == 'true'
run: |
Expand Down
4 changes: 2 additions & 2 deletions contrib/kubespray/script/environment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ sudo python3 -m pip install -r ${HOME}/pai-deploy/kubespray/requirements.txt

# workaround python3-apt issue
SOABI=$(python3 -c 'import sysconfig; print(sysconfig.get_config_var("SOABI"))')
sudo ln -s /usr/lib/python3/dist-packages/apt_inst.${SOABI}.so /usr/lib/python3/dist-packages/apt_inst.so
sudo ln -s /usr/lib/python3/dist-packages/apt_pkg.${SOABI}.so /usr/lib/python3/dist-packages/apt_pkg.so
sudo ln -sf /usr/lib/python3/dist-packages/apt_inst.${SOABI}.so /usr/lib/python3/dist-packages/apt_inst.so
sudo ln -sf /usr/lib/python3/dist-packages/apt_pkg.${SOABI}.so /usr/lib/python3/dist-packages/apt_pkg.so
6 changes: 6 additions & 0 deletions src/device-plugin/build/k8s-host-device-plugin.k8s.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# This file adds no instructions on top of the base image, so the resulting
# image is exactly the upstream one.
# NOTE(review): the base image carries no tag, so the build pulls whatever
# `latest` points to at build time — consider pinning a tag or digest.
FROM everpeace/k8s-host-device-plugin

#TODO: add arm64 image
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# This file adds no instructions on top of the base image, so the resulting
# image is exactly the upstream NVIDIA device plugin at the pinned tag.
# NOTE(review): the deploy script in this change fetches the upstream v0.15.0
# manifest while this image is v0.18.0 — confirm the combination is intended.
FROM nvcr.io/nvidia/k8s-device-plugin:v0.18.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# This file adds no instructions on top of the base image, so the resulting
# image is exactly the upstream RDMA shared device plugin at the pinned tag.
FROM ghcr.io/mellanox/k8s-rdma-shared-dev-plugin:1.4.0
6 changes: 6 additions & 0 deletions src/device-plugin/build/k8s-rocm-device-plugin.k8s.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# This file adds no instructions on top of the base image, so the resulting
# image is exactly the upstream one.
# NOTE(review): the base image carries no tag, so the build pulls whatever
# `latest` points to at build time — consider pinning a tag or digest.
FROM rocm/k8s-device-plugin

# no arm64 support
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,15 @@ spec:
name: k8s-host-device-plugin-ds
template:
metadata:
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: k8s-host-device-plugin-ds
spec:
priorityClassName: pai-daemon-priority
tolerations:
- key: CriticalAddonsOnly
operator: Exists
containers:
- image: luciaopenai.azurecr.io/luciaopenai/k8s-host-device-plugin:latest
- image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}k8s-host-device-plugin:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }}
name: k8s-host-device-plugin-ctr
securityContext:
privileged: true
Expand All @@ -62,3 +61,5 @@ spec:
items:
- key: config.json
path: config.json
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
1 change: 1 addition & 0 deletions src/device-plugin/deploy/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ prerequisite:
template-list:
- start.sh
- delete.sh
- device-plugin.yaml

start-script: start.sh
stop-script: stop.sh
Expand Down
32 changes: 22 additions & 10 deletions src/device-plugin/deploy/start.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,33 @@ pushd $(dirname "$0") > /dev/null
# Begin: NVIDIA GPU device plugin
{% if 'nvidia.com/gpu' in cluster_cfg['device-plugin']['devices'] %}

curl -s https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml \
| sed 's|nvcr.io/nvidia/k8s-device-plugin|luciaopenai.azurecr.io/luciaopenai/nvidia/k8s-device-plugin|' \
{ curl -s https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml \
| sed 's|nvcr.io/nvidia/k8s-device-plugin:v0.15.0|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-nvidia-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \
| sed -E '/^[[:space:]]*allowPrivilegeEscalation: false/ {
h
s/^([[:space:]]*)allowPrivilegeEscalation: false.*$/\1privileged: false/
G
s/(^[[:space:]]*allowPrivilegeEscalation: false.*)\n([[:space:]]*privileged: false)/\1\n\2/
}' \
| kubectl apply --overwrite=true -f - || exit $?
}';
cat <<'YAML'
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
YAML
} | kubectl apply --overwrite=true -f - || exit $?

{% endif %}
# End: NVIDIA GPU device plugin

# Begin: AMD GPU device plugin
{% if 'amd.com/gpu' in cluster_cfg['device-plugin']['devices'] %}

curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \
| sed 's|rocm/k8s-device-plugin|luciaopenai.azurecr.io/luciaopenai/rocm/k8s-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \
| kubectl apply --overwrite=true -f - || exit $?
{ curl -s https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml \
| sed 's|rocm/k8s-device-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rocm-device-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|';
cat <<'YAML'
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
YAML
} | kubectl apply --overwrite=true -f - || exit $?

{% endif %}
# End: AMD GPU device plugin
Expand Down Expand Up @@ -75,9 +83,13 @@ kubectl apply --overwrite=true -f device-plugin.yaml || exit $?
{% if 'rdma/hca' in cluster_cfg['device-plugin']['devices'] %}

kubectl apply --overwrite=true -f rdma-devices.yaml || exit $?
curl -s https://raw.githubusercontent.com/Mellanox/k8s-rdma-shared-dev-plugin/v1.4.0/deployment/k8s/base/daemonset.yaml \
| sed 's|ghcr.io/mellanox/k8s-rdma-shared-dev-plugin|luciaopenai.azurecr.io/luciaopenai/k8s-rdma-shared-dev-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|' \
| kubectl apply --overwrite=true -f - || exit $?
{ curl -s https://raw.githubusercontent.com/Mellanox/k8s-rdma-shared-dev-plugin/v1.4.0/deployment/k8s/base/daemonset.yaml \
| sed 's|ghcr.io/mellanox/k8s-rdma-shared-dev-plugin|{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}k8s-rdma-shared-dev-plugin:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}|';
cat <<'YAML'
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
YAML
} | kubectl apply --overwrite=true -f - || exit $?

{% endif %}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,5 @@ spec:
- name: frameworkcontroller-config
configMap:
name: frameworkcontroller-config
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
3 changes: 2 additions & 1 deletion src/grafana/build/grafana.common.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

FROM ubuntu:22.04

ARG TARGETARCH
ENV \
GRAFANA_VERSION=10.4.18+security~01 \
GF_PLUGIN_DIR=/grafana-plugins \
Expand All @@ -29,7 +30,7 @@ ENV \
RUN \
apt-get update && \
apt-get -y --force-yes --no-install-recommends install libfontconfig wget ca-certificates adduser libfontconfig1 musl curl jq && \
wget -O /tmp/grafana.deb https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_amd64.deb && \
wget -O /tmp/grafana.deb https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_${TARGETARCH}.deb && \
dpkg -i /tmp/grafana.deb && \
rm -f /tmp/grafana.deb && \
### branding && \
Expand Down
49 changes: 0 additions & 49 deletions src/hivedscheduler/build/hivedscheduler.common.dockerfile

This file was deleted.

30 changes: 30 additions & 0 deletions src/hivedscheduler/build/hivedscheduler.k8s.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Build stage: runs the project's go-build.sh inside a Go toolchain image and
# collects the build output under INSTALL_DIR for the runtime stage to copy.
FROM golang:1.24.3-alpine3.21 AS builder

# Pass `--build-arg TEST=true` to invoke the build script with its `test` argument.
ARG TEST=false
ENV GOPATH=/go
ENV PROJECT_DIR=/src
ENV INSTALL_DIR=/opt/hivedscheduler/hivedscheduler

# `--no-cache` fetches the package index on the fly, so a separate `apk update`
# is unnecessary and would only leave index files behind in the layer.
# bash is presumably required by go-build.sh — TODO confirm.
RUN apk add --no-cache bash
RUN mkdir -p ${PROJECT_DIR} ${INSTALL_DIR}
COPY src ${PROJECT_DIR}
# TEST is quoted and compared with POSIX `=` so an empty or unusual build-arg
# value cannot break the `[` expression.
RUN if [ "${TEST}" = "true" ]; \
    then ${PROJECT_DIR}/build/hivedscheduler/go-build.sh test; \
    else ${PROJECT_DIR}/build/hivedscheduler/go-build.sh; fi && \
    mv ${PROJECT_DIR}/dist/hivedscheduler/* ${INSTALL_DIR}


# Runtime stage: minimal Alpine image holding only the artifacts produced by
# the `builder` stage.
FROM alpine:3.21

ENV INSTALL_DIR=/opt/hivedscheduler/hivedscheduler

# Install bash and apply pending package upgrades in a single layer.
# `--no-cache` keeps the package index out of the image, so no `apk update`
# is needed (it would persist index files under /var/cache/apk).
RUN apk add --no-cache bash && \
    apk upgrade --no-cache

COPY --from=builder ${INSTALL_DIR} ${INSTALL_DIR}
WORKDIR ${INSTALL_DIR}

# start.sh is resolved relative to WORKDIR (INSTALL_DIR).
# NOTE(review): no USER is set, so the container runs as root — confirm
# whether start.sh needs that.
ENTRYPOINT ["./start.sh"]
4 changes: 4 additions & 0 deletions src/hivedscheduler/build/kube-scheduler.k8s.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# This file adds no instructions on top of the base image, so the resulting
# image is exactly the upstream kube-scheduler at the pinned version.
FROM registry.k8s.io/kube-scheduler:v1.28.9
4 changes: 4 additions & 0 deletions src/hivedscheduler/deploy/hivedscheduler.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ spec:
- name: hivedscheduler-config
configMap:
name: hivedscheduler-config
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}

{%- for vc in cluster_cfg['hivedscheduler']['structured-config']['virtualClusters'] %}
---
Expand Down Expand Up @@ -96,4 +98,6 @@ spec:
- name: hivedscheduler-config
configMap:
name: hivedscheduler-config
imagePullSecrets:
- name: {{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}
{%- endfor %}
13 changes: 7 additions & 6 deletions src/job-exporter/build/job-exporter.common.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@


FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04

ARG TARGETARCH
# Register the ROCM package repository, and install rocm-dev package
ARG ROCM_VERSION=6.2.2
ARG AMDGPU_VERSION=6.2.2
Expand All @@ -25,8 +27,8 @@ RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl libnuma-dev gnupg \
&& curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
&& printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list \
&& printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list \
&& printf "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list \
&& printf "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list \
&& apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
sudo \
libelf1 \
Expand Down Expand Up @@ -74,13 +76,12 @@ COPY build/update-dcgm.py .
# For the job exporter
ENV NERDCTL_VERSION=2.1.3
RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates
RUN wget https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-amd64.tar.gz && \
RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz && \
mkdir -p /tmp/nerdctl && \
tar -xzvf nerdctl-${NERDCTL_VERSION}-linux-amd64.tar.gz -C /tmp/nerdctl && \
tar -xzvf /tmp/nerdctl.tar.gz -C /tmp/nerdctl && \
mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl && \
mkdir -p /job_exporter && \
rm -rf /tmp/nerdctl && \
rm -rf nerdctl-${NERDCTL_VERSION}-linux-amd64.tar.gz
rm -rf /tmp/nerdctl*

COPY requirements.txt /job_exporter/
RUN pip3 install -r /job_exporter/requirements.txt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ WORKDIR /kube-runtime/src
COPY src/src ./
COPY src/requirements.txt ./

# TODO: update the hard-coded image below for arm64
COPY --from=frameworkcontroller/frameworkbarrier:v1.0.0 $BARRIER_DIR/frameworkbarrier ./init.d
COPY --from=builder ${INSTALL_DIR}/* ./runtime.d/

Expand Down