Skip to content

Commit

Permalink
Separate the rolling update loop from the pod stuck restart
Browse files Browse the repository at this point in the history
Refactor the order of restarts to simplify the reconcile function
and make it more deterministic.

Previously, we might restart stuck pods and then perform a rolling
restart. Now, stuck pods are always restarted first.
Also, stuck pods are restarted before checking the number
of ready replicas, because if we checked first we would never
reach this point.
Finally, we don't process a rolling restart if any stuck pod
was deleted, to avoid performing any dangerous actions.

Signed-off-by: Geoffrey Beausire <[email protected]>
  • Loading branch information
geobeau committed Mar 28, 2024
1 parent 26b337d commit 995ab31
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 17 deletions.
9 changes: 5 additions & 4 deletions opensearch-operator/pkg/helpers/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,16 +455,17 @@ func WorkingPodForRollingRestart(k8sClient k8s.K8sClient, sts *appsv1.StatefulSe
}

// DeleteStuckPodWithOlderRevision deletes the crashed pod only if there is any update in StatefulSet.
func DeleteStuckPodWithOlderRevision(k8sClient k8s.K8sClient, sts *appsv1.StatefulSet) error {
// Returns true if a pod was restarted.
func DeleteStuckPodWithOlderRevision(k8sClient k8s.K8sClient, sts *appsv1.StatefulSet) (bool, error) {
podWithOlderRevision, err := GetPodWithOlderRevision(k8sClient, sts)
if err != nil {
return err
return false, err
}
if podWithOlderRevision != nil {
for _, container := range podWithOlderRevision.Status.ContainerStatuses {
// If any container is getting crashed, restart it by deleting the pod so that new update in sts can take place.
if !container.Ready && container.State.Waiting != nil && container.State.Waiting.Reason == "CrashLoopBackOff" {
return k8sClient.DeletePod(&corev1.Pod{
return true, k8sClient.DeletePod(&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podWithOlderRevision.Name,
Namespace: sts.Namespace,
Expand All @@ -473,7 +474,7 @@ func DeleteStuckPodWithOlderRevision(k8sClient k8s.K8sClient, sts *appsv1.Statef
}
}
}
return nil
return false, nil
}

// GetPodWithOlderRevision fetches the pod that is not having the updated revision.
Expand Down
32 changes: 19 additions & 13 deletions opensearch-operator/pkg/reconcilers/rollingRestart.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,6 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
}

}
if sts.Status.ReadyReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) {
return ctrl.Result{
Requeue: true,
RequeueAfter: 10 * time.Second,
}, nil
}
}

if !pendingUpdate {
Expand All @@ -122,6 +116,24 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
return ctrl.Result{}, nil
}

// Check if there is any crashed pod. Delete it if there is any update in sts.
any_restarted_pod := false
for _, sts := range statefulSets {
restared_pod, err := helpers.DeleteStuckPodWithOlderRevision(r.client, &sts)
if err != nil {
return ctrl.Result{}, err
}
if restared_pod {
any_restarted_pod = true
}
}
if any_restarted_pod {
return ctrl.Result{
Requeue: true,
RequeueAfter: 10 * time.Second,
}, nil
}

// Check that all nodes of all pools are ready before doing work
for _, sts := range statefulSets {
if sts.Status.ReadyReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) {
Expand Down Expand Up @@ -153,8 +165,7 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
return ctrl.Result{}, err
}

// Restart StatefulSet pod. Order is not important So we just pick the first we find

// Restart a single pod of a StatefulSet. Order is not important so we just pick the first we find
for _, nodePool := range r.instance.Spec.NodePools {
sts, err := r.client.GetStatefulSet(builders.StsName(r.instance, &nodePool), r.instance.Namespace)
if err != nil {
Expand All @@ -168,11 +179,6 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
lg.Info(fmt.Sprintf("Starting rolling restart of the StatefulSet %s", sts.Name))
return r.restartStatefulSetPod(&sts)
}
} else { // Check if there is any crashed pod. Delete it if there is any update in sts.
err = helpers.DeleteStuckPodWithOlderRevision(r.client, &sts)
if err != nil {
return ctrl.Result{}, err
}
}
}
}
Expand Down

0 comments on commit 995ab31

Please sign in to comment.