Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 24 additions & 43 deletions bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,18 @@ spec:
- watch
- create
- delete
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- gateway.networking.k8s.io
resources:
Expand Down Expand Up @@ -1413,6 +1425,8 @@ spec:
fieldPath: metadata.namespace
- name: ENABLE_WEBHOOKS
value: "true"
- name: OPERATOR_NAME_PREFIX
value: "k8s-nim-operator"
image: 'ghcr.io/nvidia/k8s-nim-operator:main'
imagePullPolicy: Always
livenessProbe:
Expand All @@ -1426,6 +1440,10 @@ spec:
successThreshold: 1
timeoutSeconds: 1
name: manager
volumeMounts:
- name: cert
mountPath: /tmp/k8s-webhook-server/serving-certs
readOnly: true
readinessProbe:
failureThreshold: 3
httpGet:
Expand All @@ -1447,6 +1465,11 @@ spec:
allowPrivilegeEscalation: false
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumes:
- name: cert
secret:
secretName: k8s-nim-operator-webhook-server-cert
defaultMode: 420
dnsPolicy: ClusterFirst
imagePullSecrets: []
restartPolicy: Always
Expand All @@ -1469,46 +1492,4 @@ spec:
- type: MultiNamespace
supported: false
- type: AllNamespaces
supported: true
webhookdefinitions:
- type: ValidatingAdmissionWebhook
admissionReviewVersions:
- v1
containerPort: 9443
targetPort: 9443
deploymentName: k8s-nim-operator
failurePolicy: Fail
generateName: vnimcache-v1alpha1.kb.io
rules:
- apiGroups:
- apps.nvidia.com
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- nimcaches
sideEffects: None
webhookPath: /validate-apps-nvidia-com-v1alpha1-nimcache
- type: ValidatingAdmissionWebhook
admissionReviewVersions:
- v1
containerPort: 9443
targetPort: 9443
deploymentName: k8s-nim-operator
failurePolicy: Fail
generateName: vnimservice-v1alpha1.kb.io
rules:
- apiGroups:
- apps.nvidia.com
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- nimservices
sideEffects: None
webhookPath: /validate-apps-nvidia-com-v1alpha1-nimservice

supported: true
19 changes: 19 additions & 0 deletions bundle/manifests/k8s-nim-operator.webhookservice.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
name: k8s-nim-operator-webhook-service
labels:
app.kubernetes.io/name: k8s-nim-operator
app.kubernetes.io/instance: nim-operator
control-plane: controller-manager
annotations:
service.beta.openshift.io/serving-cert-secret-name: k8s-nim-operator-webhook-server-cert
spec:
selector:
app.kubernetes.io/name: k8s-nim-operator
app.kubernetes.io/instance: nim-operator
control-plane: controller-manager
ports:
- port: 443
targetPort: 9443
protocol: TCP
42 changes: 39 additions & 3 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package main

import (
"context"
"crypto/tls"
"flag"
"os"
Expand All @@ -34,6 +35,7 @@ import (
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth"
ctrl "sigs.k8s.io/controller-runtime"
crclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
Expand All @@ -45,7 +47,9 @@ import (

appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
"github.com/NVIDIA/k8s-nim-operator/internal/conditions"
"github.com/NVIDIA/k8s-nim-operator/internal/config"
"github.com/NVIDIA/k8s-nim-operator/internal/controller"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/render"
webhookappsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/internal/webhook/apps/v1alpha1"
// +kubebuilder:scaffold:imports
Expand Down Expand Up @@ -264,17 +268,38 @@ func main() {

// nolint:goconst
// Parse ENABLE_WEBHOOKS environment variable once as a boolean.
var enableWebhooks bool
if val, ok := os.LookupEnv("ENABLE_WEBHOOKS"); ok {
var err error
enableWebhooks, err = strconv.ParseBool(val)
enableWebhooks, err := strconv.ParseBool(val)
if err != nil {
setupLog.Error(err, "invalid value for ENABLE_WEBHOOKS, expected boolean")
os.Exit(1)
}
config.EnableWebhooks = enableWebhooks
}
if val, ok := os.LookupEnv("OPERATOR_NAME_PREFIX"); ok {
config.OperatorNamePrefix = val
}
if val, ok := os.LookupEnv("OPERATOR_NAMESPACE"); ok {
config.OperatorNamespace = val
}

if enableWebhooks {
cfg := ctrl.GetConfigOrDie()
liveClient, err := crclient.New(cfg, crclient.Options{Scheme: scheme})
if err != nil {
setupLog.Error(err, "unable to construct live client")
os.Exit(1)
}
ctx := context.Background()
orch, err := k8sutil.GetOrchestratorType(ctx, liveClient) // uses direct REST calls
if err != nil {
setupLog.Error(err, "failed to detect orchestrator type")
os.Exit(1)
}
config.OrchestratorType = orch
setupLog.Info("detected orchestrator", "type", orch)

if config.EnableWebhooks {
if err := webhookappsv1alpha1.SetupNIMCacheWebhookWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "NIMCache")
os.Exit(1)
Expand All @@ -284,6 +309,17 @@ func main() {
setupLog.Error(err, "unable to create webhook", "webhook", "NIMService")
os.Exit(1)
}
// Set up cluster-level ValidatingWebhookConfiguration.
if err := webhookappsv1alpha1.EnsureValidatingWebhook(
context.TODO(),
mgr.GetAPIReader(),
mgr.GetClient(),
config.OperatorNamespace,
config.OperatorNamePrefix,
); err != nil {
setupLog.Error(err, "unable to ensure ValidatingWebhookConfiguration")
os.Exit(1)
}
}
// +kubebuilder:scaffold:builder

Expand Down
2 changes: 2 additions & 0 deletions deployments/helm/k8s-nim-operator/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: OPERATOR_NAME_PREFIX
value: {{ include "k8s-nim-operator.fullname" . }}
- name: ENABLE_WEBHOOKS
value: "{{ .Values.operator.admissionController.enabled }}"
livenessProbe:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,8 +582,10 @@ rules:
- get
- list
- watch
- patch
- create
- update
- patch
- delete
- apiGroups:
- gateway.networking.k8s.io
resources:
Expand Down
10 changes: 10 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package config

import "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"

// Process-wide operator settings. They are plain package-level variables that
// cmd/main.go assigns during startup and other packages read afterwards.
var (
// EnableWebhooks holds the parsed boolean value of the ENABLE_WEBHOOKS
// environment variable; it stays false when the variable is unset.
EnableWebhooks bool
// OperatorNamePrefix holds the value of the OPERATOR_NAME_PREFIX environment
// variable. It is passed to EnsureValidatingWebhook as the name prefix for
// the webhook configuration and its Service.
OperatorNamePrefix string
// OperatorNamespace holds the value of the OPERATOR_NAMESPACE environment
// variable. It is passed to EnsureValidatingWebhook as the namespace of the
// webhook Service.
OperatorNamespace string
// OrchestratorType is the result of k8sutil.GetOrchestratorType at startup.
// buildConfigurationSpec reads it to choose the CA-injection annotations and
// labels for the webhook configuration.
OrchestratorType k8sutil.OrchestratorType
)
150 changes: 150 additions & 0 deletions internal/webhook/apps/v1alpha1/configuration.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package v1alpha1

import (
"context"
"encoding/json"

admissionv1 "k8s.io/api/admissionregistration/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/NVIDIA/k8s-nim-operator/internal/config"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
)

// EnsureValidatingWebhook creates or updates the ValidatingWebhookConfiguration
// that used to be templated by Helm. It is a best-effort reconciliation and
// returns an error only when we cannot make the desired state match the spec.
func EnsureValidatingWebhook(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be done using the renderAndSyncResource pattern we follow in the controllers?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VWC is cluster-scoped. Kubernetes doesn't allow setting a namespaced owner as controller of a cluster-scoped object. The current helper calls SetControllerReference, which would fail in that case.

	if err = controllerutil.SetControllerReference(nimService, resource, r.GetScheme()); err != nil {
		logger.Error(err, "failed to set owner", conditionType, namespacedName)
		statusError := r.updater.SetConditionsFailed(ctx, nimService, reason, err.Error())
		if statusError != nil {
			logger.Error(statusError, "failed to update status", "nimservice", nimService.Name)
		}
		return err
	}

To use this exact pattern for the cluster scoped vwc we'd have to either:

  • Add a variant that skips SetControllerReference (and handle cleanup via explicit delete/finalizer), or
  • Manage the webhook from an operator-owned, cluster-scoped controller rather than from a namespaced CR

ctx context.Context,
apiReader client.Reader,
writer client.Client,
namespace string,
fullNamePrefix string,
) error {
// Desired validatingwebhookconfiguration spec.
desired := buildConfigurationSpec(namespace, fullNamePrefix)

// Check if there is already a spec.
existing := &admissionv1.ValidatingWebhookConfiguration{}
err := apiReader.Get(ctx, types.NamespacedName{Name: desired.Name}, existing)
if err != nil && !errors.IsNotFound(err) {
return err
}

if errors.IsNotFound(err) {
return writer.Create(ctx, desired)
}

// Deep-compare; update only if something differs.
cur, _ := json.Marshal(existing.Webhooks)
want, _ := json.Marshal(desired.Webhooks)

if string(cur) == string(want) {
return nil
}

existing.Webhooks = desired.Webhooks
existing.Annotations = desired.Annotations
return writer.Update(ctx, existing)
}

// buildConfigurationSpec returns the ValidatingWebhookConfiguration that used
// to be templated by Helm.
//
// The object is named namePrefix + "-validating-webhook-configuration" and
// holds two webhooks, one for nimcaches and one for nimservices in
// apps.nvidia.com/v1alpha1. Both point at the Service
// namePrefix + "-webhook-service" in the given namespace. The annotations and
// the managed-by label depend on config.OrchestratorType.
func buildConfigurationSpec(namespace, namePrefix string) *admissionv1.ValidatingWebhookConfiguration {
	serviceName := namePrefix + "-webhook-service"

	// newWebhook builds one CREATE/UPDATE validating webhook for a single
	// apps.nvidia.com/v1alpha1 resource. Each call takes the address of its own
	// locals, so the returned webhooks share no pointers.
	newWebhook := func(name, path, resource string) admissionv1.ValidatingWebhook {
		failurePolicy := admissionv1.Fail
		sideEffects := admissionv1.SideEffectClassNone
		return admissionv1.ValidatingWebhook{
			Name:                    name,
			AdmissionReviewVersions: []string{"v1"},
			ClientConfig: admissionv1.WebhookClientConfig{
				Service: &admissionv1.ServiceReference{
					Namespace: namespace,
					Name:      serviceName,
					Path:      &path,
				},
			},
			FailurePolicy: &failurePolicy,
			SideEffects:   &sideEffects,
			Rules: []admissionv1.RuleWithOperations{{
				Operations: []admissionv1.OperationType{
					admissionv1.Create, admissionv1.Update,
				},
				Rule: admissionv1.Rule{
					APIGroups:   []string{"apps.nvidia.com"},
					APIVersions: []string{"v1alpha1"},
					Resources:   []string{resource},
				},
			}},
		}
	}

	// Start from the values used for every orchestrator other than plain
	// Kubernetes, then override them for the plain Kubernetes case.
	managedBy := "openshift"
	annotations := map[string]string{"service.beta.openshift.io/inject-cabundle": "true"}
	if config.OrchestratorType == k8sutil.K8s {
		managedBy = "helm"
		annotations = map[string]string{"cert-manager.io/inject-ca-from": namespace + "/" + namePrefix + "-serving-cert"}
	}
	labels := map[string]string{
		"app.kubernetes.io/name":       "k8s-nim-operator",
		"app.kubernetes.io/managed-by": managedBy,
	}

	return &admissionv1.ValidatingWebhookConfiguration{
		ObjectMeta: metav1.ObjectMeta{
			Name:        namePrefix + "-validating-webhook-configuration",
			Annotations: annotations,
			Labels:      labels,
		},
		Webhooks: []admissionv1.ValidatingWebhook{
			newWebhook("vnimcache-v1alpha1.kb.io", "/validate-apps-nvidia-com-v1alpha1-nimcache", "nimcaches"),
			newWebhook("vnimservice-v1alpha1.kb.io", "/validate-apps-nvidia-com-v1alpha1-nimservice", "nimservices"),
		},
	}
}
Loading