diff --git a/ai-ml/mldiagnostics-webhook-and-operator/READMDE.md b/ai-ml/mldiagnostics-webhook-and-operator/READMDE.md new file mode 100644 index 0000000000..89ffcf4c0d --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/READMDE.md @@ -0,0 +1,44 @@ +## mldiagnostics-webhook-and-operator + +It provides Helm charts for the mldiagnostics webhook and operator, which are needed for integrating the mldiagnostics SDK in GKE. + + + +### Install cert-manager if not already installed + +Cert-manager is a prerequisite for the injection-webhook. If it’s not installed, run the commands below to install it. After installing cert-manager, it may take up to two minutes for the certificate to become ready. + +```bash +helm repo add jetstack https://charts.jetstack.io +helm repo update + +helm install \ + cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version v1.13.0 \ + --set installCRDs=true \ + --set global.leaderElection.namespace=cert-manager \ + --timeout 10m +``` + +### Install injection-webhook + +```bash +helm install mldiagnostics-injection-webhook \ + --namespace=gke-diagon\ + --create-namespace \ + ./injection-webhook/chart + +``` + + +### Install connection-operator + +```bash +helm install mldiagnostics-connection-operator \ + --namespace=gke-diagon\ + --create-namespace \ + ./connection-operator/chart +``` + diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/.helmignore b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/Chart.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/Chart.yaml new file mode 100644 index 0000000000..a9fb47dfc0 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/Chart.yaml @@ -0,0 +1,21 @@ +apiVersion: v2 +name: mldiagnostics-connection-operator +description: A Helm chart to capture profiler traces based on MLDiagnosticsConnection Custom Resource in frameworks JAX, Pytorch XLA and TensorFlow. +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "0.1.0" diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/_helpers.tpl b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/_helpers.tpl new file mode 100644 index 0000000000..7ba5edc272 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "chart.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "chart.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "chart.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "chart.labels" -}} +helm.sh/chart: {{ include "chart.chart" . }} +{{ include "chart.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "chart.selectorLabels" -}} +app.kubernetes.io/name: {{ include "chart.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "chart.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "chart.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/deployment.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/deployment.yaml new file mode 100644 index 0000000000..91a235c037 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/deployment.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "chart.fullname" . }} + labels: + control-plane: diagon-connection-operator + {{- include "chart.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.diagonConnectionOperator.replicas }} + selector: + matchLabels: + app.kubernetes.io/name: connection-operator + control-plane: diagon-connection-operator + {{- include "chart.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app.kubernetes.io/name: connection-operator + control-plane: diagon-connection-operator + {{- include "chart.selectorLabels" . 
| nindent 8 }} + annotations: + kubectl.kubernetes.io/default-container: controller + spec: + containers: + - args: {{- toYaml .Values.diagonConnectionOperator.controller.args | nindent 8 }} + command: + - /manager + env: + - name: KUBERNETES_CLUSTER_DOMAIN + value: {{ quote .Values.kubernetesClusterDomain }} + image: {{ .Values.diagonConnectionOperator.controller.image.repository }}:{{ .Values.diagonConnectionOperator.controller.image.tag + | default .Chart.AppVersion }} + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: controller + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: {{- toYaml .Values.diagonConnectionOperator.controller.resources | nindent + 10 }} + securityContext: {{- toYaml .Values.diagonConnectionOperator.controller.containerSecurityContext + | nindent 10 }} + - env: + - name: KUBERNETES_CLUSTER_DOMAIN + value: {{ quote .Values.kubernetesClusterDomain }} + image: '{{ .Values.diagonConnectionOperator.googleCloudMldiagnosticsProfiler.image.repository + }}:{{ .Values.diagonConnectionOperator.googleCloudMldiagnosticsProfiler.image.tag + | default .Chart.AppVersion }}' + name: google-cloud-mldiagnostics-profiler + ports: + - containerPort: 5001 + resources: {} + securityContext: {{- toYaml .Values.diagonConnectionOperator.googleCloudMldiagnosticsProfiler.containerSecurityContext + | nindent 10 }} + volumeMounts: + - mountPath: /tmp + name: tmp-volume + securityContext: {{- toYaml .Values.diagonConnectionOperator.podSecurityContext | + nindent 8 }} + serviceAccountName: {{ include "chart.fullname" . 
}} + terminationGracePeriodSeconds: 10 + volumes: + - emptyDir: {} + name: tmp-volume diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/leader-election-rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/leader-election-rbac.yaml new file mode 100644 index 0000000000..b86c51de44 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/leader-election-rbac.yaml @@ -0,0 +1,53 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "chart.fullname" . }}-leader-election-role + labels: + {{- include "chart.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "chart.fullname" . }}-leader-election-rolebinding + labels: + {{- include "chart.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: '{{ include "chart.fullname" . }}-leader-election-role' +subjects: +- kind: ServiceAccount + name: '{{ include "chart.fullname" . }}' + namespace: '{{ .Release.Namespace }}' diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/manager-rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/manager-rbac.yaml new file mode 100644 index 0000000000..ef332bfd49 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/manager-rbac.yaml @@ -0,0 +1,56 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "chart.fullname" . 
}}-manager-role + labels: + {{- include "chart.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections/finalizers + verbs: + - update +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections/status + verbs: + - get + - patch + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "chart.fullname" . }}-manager-rolebinding + labels: + {{- include "chart.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "chart.fullname" . }}-manager-role' +subjects: +- kind: ServiceAccount + name: '{{ include "chart.fullname" . }}' + namespace: '{{ .Release.Namespace }}' diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-auth-rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-auth-rbac.yaml new file mode 100644 index 0000000000..30394e28cc --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-auth-rbac.yaml @@ -0,0 +1,34 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "chart.fullname" . }}-metrics-auth-role + labels: + {{- include "chart.labels" . | nindent 4 }} +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "chart.fullname" . }}-metrics-auth-rolebinding + labels: + {{- include "chart.labels" . 
| nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "chart.fullname" . }}-metrics-auth-role' +subjects: +- kind: ServiceAccount + name: '{{ include "chart.fullname" . }}'  # same name serviceaccount.yaml renders and the Deployment runs as + namespace: '{{ .Release.Namespace }}' diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-reader-rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-reader-rbac.yaml new file mode 100644 index 0000000000..ce5d8b5a13 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-reader-rbac.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "chart.fullname" . }}-metrics-reader + labels: + {{- include "chart.labels" . | nindent 4 }} +rules: +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-service.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-service.yaml new file mode 100644 index 0000000000..df56a44c30 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/metrics-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "chart.fullname" . }}-controller-manager-metrics + labels: + control-plane: controller-manager + {{- include "chart.labels" . | nindent 4 }} +spec: + type: {{ .Values.metricsService.type }} + selector: + app.kubernetes.io/name: connection-operator + control-plane: diagon-connection-operator  # must equal the control-plane label on the Deployment pod template + {{- include "chart.selectorLabels" . 
| nindent 4 }} + ports: + {{- .Values.metricsService.ports | toYaml | nindent 2 }} diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-admin-rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-admin-rbac.yaml new file mode 100644 index 0000000000..87653a7282 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-admin-rbac.yaml @@ -0,0 +1,19 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "chart.fullname" . }}-mldiagnosticsconnection-admin-role + labels: + {{- include "chart.labels" . | nindent 4 }} +rules: +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections + verbs: + - '*' +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections/status + verbs: + - get diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-crd.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-crd.yaml new file mode 100644 index 0000000000..a65b4ea300 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-crd.yaml @@ -0,0 +1,147 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: mldiagnosticsconnections.diagon.gke.io + annotations: + controller-gen.kubebuilder.io/version: v0.18.0 + labels: + {{- include "chart.labels" . 
| nindent 4 }} +spec: + group: diagon.gke.io + names: + kind: MLDiagnosticsConnection + listKind: MLDiagnosticsConnectionList + plural: mldiagnosticsconnections + shortNames: + - mldc + singular: mldiagnosticsconnection + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Status + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: MLDiagnosticsConnection is the Schema for the mldiagnosticsconnections + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: MLDiagnosticsConnectionSpec defines the desired state of MLDiagnosticsConnection + properties: + triggerCapture: + description: TriggerCapture contains the details for triggering a diagnostic + capture. + properties: + capturePort: + description: CapturePort is the port on the target pod to connect + to. + format: int32 + type: integer + durationSeconds: + description: DurationSeconds is the duration of the capture in seconds. + format: int32 + minimum: 1 + type: integer + podName: + description: PodName is a comma-separated string of target pod names + for diagnostics. + type: string + podNamespace: + description: PodNamespace is the namespace of the target pod. 
+ type: string + profiler: + description: Profiler specifies the type of profiler to use. + enum: + - google_cloud_mldiagnostics + type: string + storageLocation: + description: StorageLocation is the location to store the capture + data. This can be a GCS location or a local path on the pod's + local directory. + type: string + required: + - capturePort + - durationSeconds + - podName + - podNamespace + - profiler + - storageLocation + type: object + required: + - triggerCapture + type: object + status: + description: MLDiagnosticsConnectionStatus defines the observed state of + MLDiagnosticsConnection + properties: + captureResult: + description: CaptureResult contains the results of the capture operation. + properties: + errorMessage: + description: A human-readable message about the job's status. + type: string + outputLocation: + description: The final location of the stored capture output. + type: string + status: + description: Status of the capture (e.g., Succeeded, Failed). + enum: + - Succeeded + - Failed + - Unknown + type: string + type: object + completionTime: + description: CompletionTime is the time the job completed. + format: date-time + type: string + message: + description: A human-readable message about the job's status. + type: string + phase: + description: The current phase of the job (e.g., Pending, Running, Succeeded, + Failed). + enum: + - Pending + - Running + - Succeeded + - Failed + type: string + startTime: + description: StartTime is the time the job started. 
+ format: date-time + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-editor-rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-editor-rbac.yaml new file mode 100644 index 0000000000..17884145fb --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-editor-rbac.yaml @@ -0,0 +1,25 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "chart.fullname" . }}-mldiagnosticsconnection-editor-role + labels: + {{- include "chart.labels" . | nindent 4 }} +rules: +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections/status + verbs: + - get diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-viewer-rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-viewer-rbac.yaml new file mode 100644 index 0000000000..19c95fdc29 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/mldiagnosticsconnection-viewer-rbac.yaml @@ -0,0 +1,21 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "chart.fullname" . }}-mldiagnosticsconnection-viewer-role + labels: + {{- include "chart.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections + verbs: + - get + - list + - watch +- apiGroups: + - diagon.gke.io + resources: + - mldiagnosticsconnections/status + verbs: + - get diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/serviceaccount.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/serviceaccount.yaml new file mode 100644 index 0000000000..78458f5b08 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/templates/serviceaccount.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "chart.fullname" . }} + labels: + {{- include "chart.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.diagonConnectionOperator.serviceAccount.annotations | nindent 4 }} diff --git a/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/values.yaml b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/values.yaml new file mode 100644 index 0000000000..646d22b909 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/connection-operator/chart/values.yaml @@ -0,0 +1,49 @@ +diagonConnectionOperator: + controller: + args: + - --leader-elect + - --health-probe-bind-address=:8081 + - --sidecar-timeout=5m + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 65532 + image: + repository: gcr.io/gke-release/connection-operator + tag: v1.0.0-gke.8 + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + googleCloudMldiagnosticsProfiler: + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 1001 + image: + repository: gcr.io/gke-release/connection-operator-google-cloud-mldiagnostics-profiler + tag: v1.0.0-gke.8 + podSecurityContext: + runAsNonRoot: true + 
seccompProfile: + type: RuntimeDefault + replicas: 1 + serviceAccount: + annotations: {} +kubernetesClusterDomain: cluster.local +metricsService: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + type: ClusterIP diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/.helmignore b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/Chart.yaml b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/Chart.yaml new file mode 100644 index 0000000000..19d7754097 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/Chart.yaml @@ -0,0 +1,21 @@ +apiVersion: v2 +name: mldiagnostics-injection-webhook +description: A Helm chart to inject metadata into JobSet/RayJob/LWS pods which are needed by mldiagnostics SDK. +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application +# This is the chart version. 
This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "0.1.0" diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/_helpers.tpl b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/_helpers.tpl new file mode 100644 index 0000000000..7ba5edc272 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "chart.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "chart.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "chart.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "chart.labels" -}} +helm.sh/chart: {{ include "chart.chart" . }} +{{ include "chart.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "chart.selectorLabels" -}} +app.kubernetes.io/name: {{ include "chart.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "chart.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "chart.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/cert.yaml b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/cert.yaml new file mode 100644 index 0000000000..8371a15f75 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/cert.yaml @@ -0,0 +1,24 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "chart.fullname" . }}-selfsigned-issuer + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "chart.fullname" . }}-certificate + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + dnsNames: + - '{{ include "chart.fullname" . }}-service.{{ .Release.Namespace }}.svc' + - '{{ include "chart.fullname" . }}-service.{{ .Release.Namespace }}.svc.{{ .Values.kubernetesClusterDomain }}'  # fully qualified service name; cluster domain comes from values + issuerRef: + group: cert-manager.io + kind: Issuer + name: '{{ include "chart.fullname" . 
}}-selfsigned-issuer' + secretName: diagon-webhook-certificate diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/deployment.yaml b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/deployment.yaml new file mode 100644 index 0000000000..101b690e0f --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/deployment.yaml @@ -0,0 +1,65 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "chart.fullname" . }} + labels: + app: diagon-webhook + {{- include "chart.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.webhookDeployment.replicas }} + selector: + matchLabels: + app: diagon-webhook + {{- include "chart.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app: diagon-webhook + {{- include "chart.selectorLabels" . | nindent 8 }} + spec: + containers: + - env: + - name: CERT_PATH + value: {{ quote .Values.webhookDeployment.diagonWebhookContainer.env.certPath + }} + - name: KEY_PATH + value: {{ quote .Values.webhookDeployment.diagonWebhookContainer.env.keyPath + }} + - name: KUBERNETES_CLUSTER_DOMAIN + value: {{ quote .Values.kubernetesClusterDomain }} + image: '{{ .Values.webhookDeployment.diagonWebhookContainer.image.repository }}:{{ + .Values.webhookDeployment.diagonWebhookContainer.image.tag | default .Chart.AppVersion + }}' + name: diagon-webhook-container + ports: + - containerPort: 8443 + name: webhook-port + readinessProbe: + tcpSocket: + port: webhook-port + initialDelaySeconds: 5 + periodSeconds: 10 + resources: {} + volumeMounts: + - mountPath: /etc/webhook/certs + name: webhook-tls + readOnly: true + volumes: + - name: webhook-tls + secret: + secretName: diagon-webhook-certificate +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "chart.fullname" . }}-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.webhookService.type }} + selector: + app: diagon-webhook + {{- include "chart.selectorLabels" . | nindent 4 }} + ports: + {{- .Values.webhookService.ports | toYaml | nindent 2 }} diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/diagon-webhook.yaml b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/diagon-webhook.yaml new file mode 100644 index 0000000000..6d4d557354 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/diagon-webhook.yaml @@ -0,0 +1,35 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: {{ include "chart.fullname" . }}-mutating-webhook-config + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "chart.fullname" . }}-certificate + labels: + {{- include "chart.labels" . | nindent 4 }} +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: '{{ include "chart.fullname" . }}-service' + namespace: '{{ .Release.Namespace }}' + path: /mutate + failurePolicy: Fail + name: '{{ include "chart.fullname" . }}-service.{{ .Release.Namespace }}.svc' + namespaceSelector: + matchExpressions: + - key: diagon-webhook-exempt + operator: NotIn + values: + - "true" + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: Namespaced + sideEffects: NoneOnDryRun diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/rbac.yaml b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/rbac.yaml new file mode 100644 index 0000000000..43aff65cc6 --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/templates/rbac.yaml @@ -0,0 +1,75 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "chart.fullname" . 
}}-resource-reader + labels: + {{- include "chart.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - namespaces + - nodes + - pods + verbs: + - get + - list + - watch +- apiGroups: + - ray.io + resources: + - rayclusters + - rayjobs + verbs: + - get + - list + - watch +- apiGroups: + - jobset.x-k8s.io + resources: + - jobsets + verbs: + - get + - list + - watch +- apiGroups: + - leaderworkerset.x-k8s.io + resources: + - leaderworkersets + verbs: + - get + - list + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + - replicasets + - deployments + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "chart.fullname" . }}-default-sa-binding + labels: + {{- include "chart.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "chart.fullname" . }}-resource-reader' +subjects: +- kind: ServiceAccount + name: default + namespace: '{{ .Release.Namespace }}' diff --git a/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/values.yaml b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/values.yaml new file mode 100644 index 0000000000..9eb5b4a01c --- /dev/null +++ b/ai-ml/mldiagnostics-webhook-and-operator/injection-webhook/chart/values.yaml @@ -0,0 +1,16 @@ +kubernetesClusterDomain: cluster.local +webhookDeployment: + diagonWebhookContainer: + env: + certPath: /etc/webhook/certs/tls.crt + keyPath: /etc/webhook/certs/tls.key + image: + repository: gcr.io/gke-release/diagon-webhook + tag: v1.0.0-gke.7 + replicas: 1 +webhookService: + ports: + - port: 443 + protocol: TCP + targetPort: webhook-port + type: ClusterIP