@@ -25,16 +25,27 @@ import (
2525 "k8s.io/apimachinery/pkg/api/meta"
2626 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2727 "k8s.io/apimachinery/pkg/runtime"
28+ "k8s.io/apimachinery/pkg/types"
29+ "k8s.io/apimachinery/pkg/util/intstr"
2830 ctrl "sigs.k8s.io/controller-runtime"
2931 "sigs.k8s.io/controller-runtime/pkg/client"
3032 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3133 "sigs.k8s.io/controller-runtime/pkg/log"
3234
35+ policyv1 "k8s.io/api/policy/v1"
3336 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
3437 sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
3538 extensionsv1alpha1 "sigs.k8s.io/agent-sandbox/extensions/api/v1alpha1"
3639)
3740
41+ const (
42+ // These are for our PDB management
43+ pdbFinalizerName = "sandboxclaim.agents.x-k8s.io/pdb-cleanup"
44+ pdbName = "sandbox-highly-available"
45+ pdbLabelKey = "sandbox-disruption-policy"
46+ pdbLabelValue = "HighlyAvailable"
47+ )
48+
3849// ErrTemplateNotFound is a sentinel error indicating a SandboxTemplate was not found.
3950var ErrTemplateNotFound = errors .New ("SandboxTemplate not found" )
4051
@@ -48,33 +59,88 @@ type SandboxClaimReconciler struct {
4859//+kubebuilder:rbac:groups=extensions.agents.x-k8s.io,resources=sandboxclaims/status,verbs=get;update;patch
4960//+kubebuilder:rbac:groups=agents.x-k8s.io,resources=sandboxes,verbs=get;list;watch;create;update;patch;delete
5061//+kubebuilder:rbac:groups=extensions.agents.x-k8s.io,resources=sandboxtemplates,verbs=get;list;watch
62+ //+kubebuilder:rbac:groups=policy,resources=poddisruptionbudgets,verbs=get;list;watch;create;update;patch;delete
5163
5264// Reconcile is part of the main kubernetes reconciliation loop which aims to
5365// move the current state of the cluster closer to the desired state.
5466func (r * SandboxClaimReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
67+ logger := log .FromContext (ctx )
5568 claim := & extensionsv1alpha1.SandboxClaim {}
5669 if err := r .Get (ctx , req .NamespacedName , claim ); err != nil {
5770 if k8errors .IsNotFound (err ) {
5871 return ctrl.Result {}, nil
5972 }
6073 return ctrl.Result {}, fmt .Errorf ("failed to get sandbox claim %q: %w" , req .NamespacedName , err )
6174 }
75+ originalClaimStatus := claim .Status .DeepCopy ()
76+
77+ template , err := r .getTemplate (ctx , claim )
78+ if err != nil && ! k8errors .IsNotFound (err ) {
79+ // This is a real error, update status and requeue
80+ r .computeAndSetStatus (claim , nil , err )
81+ // We can't update status if the claim is not found, but we also don't need to.
82+ if ! k8errors .IsNotFound (err ) {
83+ if updateErr := r .updateStatus (ctx , originalClaimStatus , claim ); updateErr != nil {
84+ err = errors .Join (err , updateErr )
85+ }
86+ }
87+ return ctrl.Result {}, err
88+ }
89+ if k8errors .IsNotFound (err ) {
90+ // Template not found, set status and stop.
91+ r .computeAndSetStatus (claim , nil , ErrTemplateNotFound )
92+ if updateErr := r .updateStatus (ctx , originalClaimStatus , claim ); updateErr != nil {
93+ err = errors .Join (err , updateErr )
94+ }
95+ return ctrl.Result {}, err
96+ }
97+
98+ // Determine if we should manage PDBs for this claim
99+ managePDB := template != nil && template .Spec .EnableDisruptionControl
62100
63101 if ! claim .DeletionTimestamp .IsZero () {
102+ // This logic only runs if the claim is being deleted
103+ if managePDB && controllerutil .ContainsFinalizer (claim , pdbFinalizerName ) {
104+ logger .Info ("Reconciling PDB deletion for deleted claim" )
105+ if err := r .reconcilePDBDeletion (ctx , claim ); err != nil {
106+ return ctrl.Result {}, err
107+ }
108+
109+ // Cleanup successful, remove the finalizer
110+ controllerutil .RemoveFinalizer (claim , pdbFinalizerName )
111+ if err := r .Update (ctx , claim ); err != nil {
112+ logger .Error (err , "Failed to remove finalizer" )
113+ return ctrl.Result {}, err
114+ }
115+ }
64116 return ctrl.Result {}, nil
65117 }
66118
119+ // If not deleting, add the finalizer if it's needed and missing
120+ if managePDB && ! controllerutil .ContainsFinalizer (claim , pdbFinalizerName ) {
121+ logger .Info ("Adding PDB finalizer to claim" )
122+ controllerutil .AddFinalizer (claim , pdbFinalizerName )
123+ if err := r .Update (ctx , claim ); err != nil {
124+ logger .Error (err , "Failed to add finalizer" )
125+ return ctrl.Result {}, err
126+ }
127+ return ctrl.Result {Requeue : true }, nil // Requeue to process again with the finalizer
128+ }
129+
67130 // cache the original status from sandboxclaim
68- originalClaimStatus := claim .Status .DeepCopy ()
69- var err error
70131 var sandbox * v1alpha1.Sandbox
71- var template * extensionsv1alpha1.SandboxTemplate
72132
73- // Try getting template
74- if template , err = r .getTemplate (ctx , claim ); err == nil || k8errors .IsNotFound (err ) {
75- // Try getting sandbox even if template is not found
76- // It is possible that the template was deleted after the sandbox was created
77- sandbox , err = r .getOrCreateSandbox (ctx , claim , template )
133+ // Try getting or creating the sandbox
134+ // We already fetched the template, so we pass it in.
135+ sandbox , err = r .getOrCreateSandbox (ctx , claim , template )
136+
137+ if err == nil { // Only reconcile children if sandbox was found or created
138+ // Reconcile PDB
139+ if managePDB {
140+ if pdbErr := r .reconcilePDB (ctx , claim ); pdbErr != nil {
141+ err = errors .Join (err , pdbErr )
142+ }
143+ }
78144 }
79145
80146 // Update claim status
@@ -179,9 +245,27 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
179245 Name : claim .Name ,
180246 },
181247 }
248+ replicas := int32 (1 )
249+ sandbox .Spec .Replicas = & replicas
250+
182251 sandbox .Spec .PodTemplate .Spec = template .Spec .PodTemplate .Spec
183252 sandbox .Spec .PodTemplate .ObjectMeta .Labels = template .Spec .PodTemplate .Labels
184253 sandbox .Spec .PodTemplate .ObjectMeta .Annotations = template .Spec .PodTemplate .Annotations
254+
255+ if template .Spec .EnableDisruptionControl {
256+ // 1. Inject the PDB label for the shared PDB to select
257+ if sandbox .Spec .PodTemplate .ObjectMeta .Labels == nil {
258+ sandbox .Spec .PodTemplate .ObjectMeta .Labels = make (map [string ]string )
259+ }
260+ sandbox .Spec .PodTemplate .ObjectMeta .Labels [pdbLabelKey ] = pdbLabelValue
261+
262+ // 2. Inject the safe-to-evict annotation
263+ if sandbox .Spec .PodTemplate .ObjectMeta .Annotations == nil {
264+ sandbox .Spec .PodTemplate .ObjectMeta .Annotations = make (map [string ]string )
265+ }
266+ sandbox .Spec .PodTemplate .ObjectMeta .Annotations ["cluster-autoscaler.kubernetes.io/safe-to-evict" ] = "false"
267+ }
268+
185269 if err := controllerutil .SetControllerReference (claim , sandbox , r .Scheme ); err != nil {
186270 err = fmt .Errorf ("failed to set controller reference for sandbox: %w" , err )
187271 logger .Error (err , "Error creating sandbox for claim: %q" , claim .Name )
@@ -243,6 +327,106 @@ func (r *SandboxClaimReconciler) getTemplate(ctx context.Context, claim *extensi
243327 return template , nil
244328}
245329
330+ // reconcilePDB ensures the shared PDB exists in the namespace.
331+ func (r * SandboxClaimReconciler ) reconcilePDB (ctx context.Context , claim * extensionsv1alpha1.SandboxClaim ) error {
332+ logger := log .FromContext (ctx )
333+ pdb := & policyv1.PodDisruptionBudget {}
334+ pdbKey := types.NamespacedName {Name : pdbName , Namespace : claim .Namespace }
335+
336+ err := r .Client .Get (ctx , pdbKey , pdb )
337+ if err != nil {
338+ if k8errors .IsNotFound (err ) {
339+ // PDB does not exist, let's create it.
340+ logger .Info ("Creating shared PodDisruptionBudget" , "PDB.Name" , pdbName , "PDB.Namespace" , claim .Namespace )
341+
342+ // This PDB will select ALL pods created by the core sandbox-controller
343+ // that have the disruption policy enabled.
344+ pdbToCreate := & policyv1.PodDisruptionBudget {
345+ ObjectMeta : metav1.ObjectMeta {
346+ Name : pdbName ,
347+ Namespace : claim .Namespace ,
348+ },
349+ Spec : policyv1.PodDisruptionBudgetSpec {
350+ MaxUnavailable : & intstr.IntOrString {Type : intstr .Int , IntVal : 0 },
351+ Selector : & metav1.LabelSelector {
352+ MatchLabels : map [string ]string {
353+ // This label is injected into the PodTemplate by createSandbox
354+ // and then propagated to the Pod by the sandbox-controller.
355+ },
356+ },
357+ },
358+ }
359+
360+ // The PDB selector will target this common label.
361+ pdbToCreate .Spec .Selector .MatchLabels = map [string ]string {
362+ pdbLabelKey : pdbLabelValue ,
363+ }
364+
365+ // We DO NOT set an owner ref, as this PDB is shared and its
366+ // lifecycle is managed by our finalizer.
367+ if err := r .Client .Create (ctx , pdbToCreate ); err != nil {
368+ logger .Error (err , "Failed to create PDB" )
369+ return err
370+ }
371+ return nil
372+ }
373+ // Some other error occurred when trying to Get the PDB
374+ return err
375+ }
376+ // PDB already exists, do nothing.
377+ return nil
378+ }
379+
380+ // reconcilePDBDeletion handles cleanup of the shared PDB.
381+ func (r * SandboxClaimReconciler ) reconcilePDBDeletion (ctx context.Context , claim * extensionsv1alpha1.SandboxClaim ) error {
382+ logger := log .FromContext (ctx )
383+
384+ // List all other SandboxClaims in the same namespace
385+ claimList := & extensionsv1alpha1.SandboxClaimList {}
386+ if err := r .Client .List (ctx , claimList , client .InNamespace (claim .Namespace )); err != nil {
387+ logger .Error (err , "Failed to list SandboxClaims for PDB cleanup" )
388+ return err
389+ }
390+
391+ // Check if any *other* claims that require PDBs still exist.
392+ otherClaimsNeedPDB := false
393+ for _ , otherClaim := range claimList .Items {
394+ if otherClaim .UID == claim .UID || ! otherClaim .DeletionTimestamp .IsZero () {
395+ continue // Skip self or claims already being deleted
396+ }
397+
398+ // We must check if the other claim also has PDB enabled
399+ template , err := r .getTemplate (ctx , & otherClaim )
400+ if err == nil && template != nil && template .Spec .EnableDisruptionControl {
401+ otherClaimsNeedPDB = true
402+ break
403+ }
404+ }
405+
406+ if ! otherClaimsNeedPDB {
407+ // This is the last claim that needs a PDB. Delete the PDB.
408+ logger .Info ("Last SandboxClaim with disruption control deleted. Deleting PDB." , "PDB.Name" , pdbName , "PDB.Namespace" , claim .Namespace )
409+ pdbToDelete := & policyv1.PodDisruptionBudget {
410+ ObjectMeta : metav1.ObjectMeta {
411+ Name : pdbName ,
412+ Namespace : claim .Namespace ,
413+ },
414+ }
415+
416+ if err := r .Client .Delete (ctx , pdbToDelete ); err != nil {
417+ // Ignore "not found" errors, as it might already be gone.
418+ if ! k8errors .IsNotFound (err ) {
419+ logger .Error (err , "Failed to delete PDB" )
420+ return err
421+ }
422+ }
423+ } else {
424+ logger .Info ("Other SandboxClaims still require the PDB, not deleting." )
425+ }
426+
427+ return nil
428+ }
429+
246430// SetupWithManager sets up the controller with the Manager.
247431func (r * SandboxClaimReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
248432 return ctrl .NewControllerManagedBy (mgr ).
0 commit comments