Skip to content

Commit 9decdeb

Browse files
committed
feat: Added Disruption control for Sandbox
feat: added PDB to Sandbox spec updated rbac generated file nit nit refacted pdb into its own controller nit sandbox controller test nit moved PDB implementation to extensions fixed lint
1 parent a72d5c4 commit 9decdeb

File tree

5 files changed

+509
-42
lines changed

5 files changed

+509
-42
lines changed

extensions/api/v1alpha1/sandboxtemplate_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ type SandboxTemplateSpec struct {
4444
// an agent sandbox.
4545
// +kubebuilder:validation:Required
4646
PodTemplate corev1.PodTemplateSpec `json:"podTemplate" protobuf:"bytes,3,opt,name=podTemplate"`
47+
48+
// EnableDisruptionControl, if true, will protect sandboxes created
49+
// from this template with a PodDisruptionBudget and a "safe-to-evict"
50+
// annotation, making them resilient to voluntary disruptions.
51+
// +optional
52+
EnableDisruptionControl bool `json:"enableDisruptionControl,omitempty"`
4753
}
4854

4955
// SandboxTemplateStatus defines the observed state of Sandbox.

extensions/controllers/sandboxclaim_controller.go

Lines changed: 192 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,27 @@ import (
2525
"k8s.io/apimachinery/pkg/api/meta"
2626
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2727
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/apimachinery/pkg/types"
29+
"k8s.io/apimachinery/pkg/util/intstr"
2830
ctrl "sigs.k8s.io/controller-runtime"
2931
"sigs.k8s.io/controller-runtime/pkg/client"
3032
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3133
"sigs.k8s.io/controller-runtime/pkg/log"
3234

35+
policyv1 "k8s.io/api/policy/v1"
3336
"sigs.k8s.io/agent-sandbox/api/v1alpha1"
3437
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
3538
extensionsv1alpha1 "sigs.k8s.io/agent-sandbox/extensions/api/v1alpha1"
3639
)
3740

41+
const (
42+
// These are for our PDB management
43+
pdbFinalizerName = "sandboxclaim.agents.x-k8s.io/pdb-cleanup"
44+
pdbName = "sandbox-highly-available"
45+
pdbLabelKey = "sandbox-disruption-policy"
46+
pdbLabelValue = "HighlyAvailable"
47+
)
48+
3849
// ErrTemplateNotFound is a sentinel error indicating a SandboxTemplate was not found.
3950
var ErrTemplateNotFound = errors.New("SandboxTemplate not found")
4051

@@ -48,33 +59,88 @@ type SandboxClaimReconciler struct {
4859
//+kubebuilder:rbac:groups=extensions.agents.x-k8s.io,resources=sandboxclaims/status,verbs=get;update;patch
4960
//+kubebuilder:rbac:groups=agents.x-k8s.io,resources=sandboxes,verbs=get;list;watch;create;update;patch;delete
5061
//+kubebuilder:rbac:groups=extensions.agents.x-k8s.io,resources=sandboxtemplates,verbs=get;list;watch
62+
//+kubebuilder:rbac:groups=policy,resources=poddisruptionbudgets,verbs=get;list;watch;create;update;patch;delete
5163

5264
// Reconcile is part of the main kubernetes reconciliation loop which aims to
5365
// move the current state of the cluster closer to the desired state.
5466
func (r *SandboxClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
67+
logger := log.FromContext(ctx)
5568
claim := &extensionsv1alpha1.SandboxClaim{}
5669
if err := r.Get(ctx, req.NamespacedName, claim); err != nil {
5770
if k8errors.IsNotFound(err) {
5871
return ctrl.Result{}, nil
5972
}
6073
return ctrl.Result{}, fmt.Errorf("failed to get sandbox claim %q: %w", req.NamespacedName, err)
6174
}
75+
originalClaimStatus := claim.Status.DeepCopy()
76+
77+
template, err := r.getTemplate(ctx, claim)
78+
if err != nil && !k8errors.IsNotFound(err) {
79+
// This is a real error, update status and requeue
80+
r.computeAndSetStatus(claim, nil, err)
81+
// We can't update status if the claim is not found, but we also don't need to.
82+
if !k8errors.IsNotFound(err) {
83+
if updateErr := r.updateStatus(ctx, originalClaimStatus, claim); updateErr != nil {
84+
err = errors.Join(err, updateErr)
85+
}
86+
}
87+
return ctrl.Result{}, err
88+
}
89+
if k8errors.IsNotFound(err) {
90+
// Template not found, set status and stop.
91+
r.computeAndSetStatus(claim, nil, ErrTemplateNotFound)
92+
if updateErr := r.updateStatus(ctx, originalClaimStatus, claim); updateErr != nil {
93+
err = errors.Join(err, updateErr)
94+
}
95+
return ctrl.Result{}, err
96+
}
97+
98+
// Determine if we should manage PDBs for this claim
99+
managePDB := template != nil && template.Spec.EnableDisruptionControl
62100

63101
if !claim.DeletionTimestamp.IsZero() {
102+
// This logic only runs if the claim is being deleted
103+
if managePDB && controllerutil.ContainsFinalizer(claim, pdbFinalizerName) {
104+
logger.Info("Reconciling PDB deletion for deleted claim")
105+
if err := r.reconcilePDBDeletion(ctx, claim); err != nil {
106+
return ctrl.Result{}, err
107+
}
108+
109+
// Cleanup successful, remove the finalizer
110+
controllerutil.RemoveFinalizer(claim, pdbFinalizerName)
111+
if err := r.Update(ctx, claim); err != nil {
112+
logger.Error(err, "Failed to remove finalizer")
113+
return ctrl.Result{}, err
114+
}
115+
}
64116
return ctrl.Result{}, nil
65117
}
66118

119+
// If not deleting, add the finalizer if it's needed and missing
120+
if managePDB && !controllerutil.ContainsFinalizer(claim, pdbFinalizerName) {
121+
logger.Info("Adding PDB finalizer to claim")
122+
controllerutil.AddFinalizer(claim, pdbFinalizerName)
123+
if err := r.Update(ctx, claim); err != nil {
124+
logger.Error(err, "Failed to add finalizer")
125+
return ctrl.Result{}, err
126+
}
127+
return ctrl.Result{Requeue: true}, nil // Requeue to process again with the finalizer
128+
}
129+
67130
// cache the original status from sandboxclaim
68-
originalClaimStatus := claim.Status.DeepCopy()
69-
var err error
70131
var sandbox *v1alpha1.Sandbox
71-
var template *extensionsv1alpha1.SandboxTemplate
72132

73-
// Try getting template
74-
if template, err = r.getTemplate(ctx, claim); err == nil || k8errors.IsNotFound(err) {
75-
// Try getting sandbox even if template is not found
76-
// It is possible that the template was deleted after the sandbox was created
77-
sandbox, err = r.getOrCreateSandbox(ctx, claim, template)
133+
// Try getting or creating the sandbox
134+
// We already fetched the template, so we pass it in.
135+
sandbox, err = r.getOrCreateSandbox(ctx, claim, template)
136+
137+
if err == nil { // Only reconcile children if sandbox was found or created
138+
// Reconcile PDB
139+
if managePDB {
140+
if pdbErr := r.reconcilePDB(ctx, claim); pdbErr != nil {
141+
err = errors.Join(err, pdbErr)
142+
}
143+
}
78144
}
79145

80146
// Update claim status
@@ -179,9 +245,27 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
179245
Name: claim.Name,
180246
},
181247
}
248+
replicas := int32(1)
249+
sandbox.Spec.Replicas = &replicas
250+
182251
sandbox.Spec.PodTemplate.Spec = template.Spec.PodTemplate.Spec
183252
sandbox.Spec.PodTemplate.ObjectMeta.Labels = template.Spec.PodTemplate.Labels
184253
sandbox.Spec.PodTemplate.ObjectMeta.Annotations = template.Spec.PodTemplate.Annotations
254+
255+
if template.Spec.EnableDisruptionControl {
256+
// 1. Inject the PDB label for the shared PDB to select
257+
if sandbox.Spec.PodTemplate.ObjectMeta.Labels == nil {
258+
sandbox.Spec.PodTemplate.ObjectMeta.Labels = make(map[string]string)
259+
}
260+
sandbox.Spec.PodTemplate.ObjectMeta.Labels[pdbLabelKey] = pdbLabelValue
261+
262+
// 2. Inject the safe-to-evict annotation
263+
if sandbox.Spec.PodTemplate.ObjectMeta.Annotations == nil {
264+
sandbox.Spec.PodTemplate.ObjectMeta.Annotations = make(map[string]string)
265+
}
266+
sandbox.Spec.PodTemplate.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
267+
}
268+
185269
if err := controllerutil.SetControllerReference(claim, sandbox, r.Scheme); err != nil {
186270
err = fmt.Errorf("failed to set controller reference for sandbox: %w", err)
187271
logger.Error(err, "Error creating sandbox for claim: %q", claim.Name)
@@ -243,6 +327,106 @@ func (r *SandboxClaimReconciler) getTemplate(ctx context.Context, claim *extensi
243327
return template, nil
244328
}
245329

330+
// reconcilePDB ensures the shared PDB exists in the namespace.
331+
func (r *SandboxClaimReconciler) reconcilePDB(ctx context.Context, claim *extensionsv1alpha1.SandboxClaim) error {
332+
logger := log.FromContext(ctx)
333+
pdb := &policyv1.PodDisruptionBudget{}
334+
pdbKey := types.NamespacedName{Name: pdbName, Namespace: claim.Namespace}
335+
336+
err := r.Client.Get(ctx, pdbKey, pdb)
337+
if err != nil {
338+
if k8errors.IsNotFound(err) {
339+
// PDB does not exist, let's create it.
340+
logger.Info("Creating shared PodDisruptionBudget", "PDB.Name", pdbName, "PDB.Namespace", claim.Namespace)
341+
342+
// This PDB will select ALL pods created by the core sandbox-controller
343+
// that have the disruption policy enabled.
344+
pdbToCreate := &policyv1.PodDisruptionBudget{
345+
ObjectMeta: metav1.ObjectMeta{
346+
Name: pdbName,
347+
Namespace: claim.Namespace,
348+
},
349+
Spec: policyv1.PodDisruptionBudgetSpec{
350+
MaxUnavailable: &intstr.IntOrString{Type: intstr.Int, IntVal: 0},
351+
Selector: &metav1.LabelSelector{
352+
MatchLabels: map[string]string{
353+
// This label is injected into the PodTemplate by createSandbox
354+
// and then propagated to the Pod by the sandbox-controller.
355+
},
356+
},
357+
},
358+
}
359+
360+
// The PDB selector will target this common label.
361+
pdbToCreate.Spec.Selector.MatchLabels = map[string]string{
362+
pdbLabelKey: pdbLabelValue,
363+
}
364+
365+
// We DO NOT set an owner ref, as this PDB is shared and its
366+
// lifecycle is managed by our finalizer.
367+
if err := r.Client.Create(ctx, pdbToCreate); err != nil {
368+
logger.Error(err, "Failed to create PDB")
369+
return err
370+
}
371+
return nil
372+
}
373+
// Some other error occurred when trying to Get the PDB
374+
return err
375+
}
376+
// PDB already exists, do nothing.
377+
return nil
378+
}
379+
380+
// reconcilePDBDeletion handles cleanup of the shared PDB.
381+
func (r *SandboxClaimReconciler) reconcilePDBDeletion(ctx context.Context, claim *extensionsv1alpha1.SandboxClaim) error {
382+
logger := log.FromContext(ctx)
383+
384+
// List all other SandboxClaims in the same namespace
385+
claimList := &extensionsv1alpha1.SandboxClaimList{}
386+
if err := r.Client.List(ctx, claimList, client.InNamespace(claim.Namespace)); err != nil {
387+
logger.Error(err, "Failed to list SandboxClaims for PDB cleanup")
388+
return err
389+
}
390+
391+
// Check if any *other* claims that require PDBs still exist.
392+
otherClaimsNeedPDB := false
393+
for _, otherClaim := range claimList.Items {
394+
if otherClaim.UID == claim.UID || !otherClaim.DeletionTimestamp.IsZero() {
395+
continue // Skip self or claims already being deleted
396+
}
397+
398+
// We must check if the other claim also has PDB enabled
399+
template, err := r.getTemplate(ctx, &otherClaim)
400+
if err == nil && template != nil && template.Spec.EnableDisruptionControl {
401+
otherClaimsNeedPDB = true
402+
break
403+
}
404+
}
405+
406+
if !otherClaimsNeedPDB {
407+
// This is the last claim that needs a PDB. Delete the PDB.
408+
logger.Info("Last SandboxClaim with disruption control deleted. Deleting PDB.", "PDB.Name", pdbName, "PDB.Namespace", claim.Namespace)
409+
pdbToDelete := &policyv1.PodDisruptionBudget{
410+
ObjectMeta: metav1.ObjectMeta{
411+
Name: pdbName,
412+
Namespace: claim.Namespace,
413+
},
414+
}
415+
416+
if err := r.Client.Delete(ctx, pdbToDelete); err != nil {
417+
// Ignore "not found" errors, as it might already be gone.
418+
if !k8errors.IsNotFound(err) {
419+
logger.Error(err, "Failed to delete PDB")
420+
return err
421+
}
422+
}
423+
} else {
424+
logger.Info("Other SandboxClaims still require the PDB, not deleting.")
425+
}
426+
427+
return nil
428+
}
429+
246430
// SetupWithManager sets up the controller with the Manager.
247431
func (r *SandboxClaimReconciler) SetupWithManager(mgr ctrl.Manager) error {
248432
return ctrl.NewControllerManagedBy(mgr).

0 commit comments

Comments
 (0)