From e16fde678883be2b9d12e1202e2622eba30ea50f Mon Sep 17 00:00:00 2001
From: Geoffrey Beausire
Date: Tue, 2 Jan 2024 14:34:47 +0100
Subject: [PATCH] Alleviate race conditions in roll restart reconciler

When a roll restart is triggered with at least 2 pools, a race
condition can trigger the rolling restart of a pod in each pool. This
can lead to a red cluster.

Normally, to prevent this from happening, there are 3 checks:
- check the status.ReadyReplicas of all sts before moving forward.
- for each nodePool, check that all replicas are ready by listing
  pods directly
- before deleting a pod, a check is made against OpenSearch to see if
  the cluster is healthy

In practice, this is not enough. Consider the rollRestart of 2
nodePools:
- data
- masterData

The following sequence can happen:
- a rollRestart is triggered
- the reconcile function is called
- data and masterData have all their pods ready
- a data pod is deleted; the pod is terminating (NOT terminated yet)
- the reconcile function is called again
- data and masterData still have all their pods ready from the
  status.ReadyReplicas point of view (because the status has not seen
  the change yet)
- data is seen as unhealthy thanks to CountRunningPodsForNodePool
- masterData is seen as healthy because all its pods are ready
- OpenSearch is still healthy, because the deleted pod is not
  terminated yet
- a pod in masterData is restarted
- the cluster is red!

This commit makes sure we check the readiness of all nodePools using
CountRunningPodsForNodePool before trying to restart any pool.

Signed-off-by: Geoffrey Beausire
---
 .../pkg/reconcilers/rollingRestart.go | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/opensearch-operator/pkg/reconcilers/rollingRestart.go b/opensearch-operator/pkg/reconcilers/rollingRestart.go
index 81e8be416..70c071217 100644
--- a/opensearch-operator/pkg/reconcilers/rollingRestart.go
+++ b/opensearch-operator/pkg/reconcilers/rollingRestart.go
@@ -135,13 +135,26 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
 	}
 
 	// Check that all nodes of all pools are ready before doing work
-	for _, sts := range statefulSets {
+	for _, nodePool := range r.instance.Spec.NodePools {
+		sts, err := r.client.GetStatefulSet(builders.StsName(r.instance, &nodePool), r.instance.Namespace)
+		if err != nil {
+			return ctrl.Result{}, err
+		}
 		if sts.Status.ReadyReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) {
 			return ctrl.Result{
 				Requeue:      true,
 				RequeueAfter: 10 * time.Second,
 			}, nil
 		}
+		// CountRunningPodsForNodePool provides a more consistent view of the sts: the status of a
+		// StatefulSet is eventually consistent, while listing pods directly is more up to date.
+		numReadyPods, err := helpers.CountRunningPodsForNodePool(r.client, r.instance, &nodePool)
+		if err != nil || numReadyPods != int(pointer.Int32Deref(sts.Spec.Replicas, 1)) {
+			return ctrl.Result{
+				Requeue:      true,
+				RequeueAfter: 10 * time.Second,
+			}, nil
+		}
 	}
 
 	// Skip a rolling restart if the cluster hasn't finished initializing
@@ -173,13 +186,9 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
 		}
 
 		if sts.Status.UpdateRevision != "" &&
 			sts.Status.UpdatedReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) {
-			// Only restart pods if not all pods are updated and the sts is healthy with no pods terminating
-			if sts.Status.ReadyReplicas == pointer.Int32Deref(sts.Spec.Replicas, 1) {
-				if numReadyPods, err := helpers.CountRunningPodsForNodePool(r.client, r.instance, &nodePool); err == nil && numReadyPods == int(pointer.Int32Deref(sts.Spec.Replicas, 1)) {
-					lg.Info(fmt.Sprintf("Starting rolling restart of the StatefulSet %s", sts.Name))
-					return r.restartStatefulSetPod(&sts)
-				}
-			}
+
+			lg.Info(fmt.Sprintf("Starting rolling restart of the StatefulSet %s", sts.Name))
+			return r.restartStatefulSetPod(&sts)
 		}
 	}
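
For illustration only, outside the patch itself: below is a minimal, hypothetical Go sketch of the pattern the fix relies on, namely counting running, non-terminating pods for every node pool before allowing a restart in any pool, instead of trusting the eventually consistent sts.Status.ReadyReplicas. The package, the functions countRunningPods and allNodePoolsReady, and the label selector are assumptions made up for this sketch; they are not the operator's real helpers (the actual check is helpers.CountRunningPodsForNodePool in the diff above).

package restartcheck

import (
	"context"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/utils/pointer"
)

// countRunningPods lists the pods of one node pool and counts only those that
// are Running and not terminating (DeletionTimestamp unset). A freshly deleted
// pod is therefore excluded immediately, before the StatefulSet status catches up.
func countRunningPods(ctx context.Context, c kubernetes.Interface, namespace, pool string) (int, error) {
	// The label key is an assumption for this sketch; the real operator uses its own pool labels.
	pods, err := c.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: fmt.Sprintf("opensearch-nodepool=%s", pool),
	})
	if err != nil {
		return 0, err
	}
	running := 0
	for _, p := range pods.Items {
		if p.Status.Phase == corev1.PodRunning && p.DeletionTimestamp == nil {
			running++
		}
	}
	return running, nil
}

// allNodePoolsReady returns true only if every pool's running pod count matches
// the desired replica count of its StatefulSet. A pod still terminating in one
// pool therefore blocks restarts in every other pool.
func allNodePoolsReady(ctx context.Context, c kubernetes.Interface, namespace string, stsByPool map[string]*appsv1.StatefulSet) (bool, error) {
	for pool, sts := range stsByPool {
		running, err := countRunningPods(ctx, c, namespace, pool)
		if err != nil {
			return false, err
		}
		if running != int(pointer.Int32Deref(sts.Spec.Replicas, 1)) {
			return false, nil
		}
	}
	return true, nil
}

The point of the design is that the pod list reflects a deletion as soon as the pod gets a DeletionTimestamp, whereas sts.Status.ReadyReplicas is updated asynchronously by the StatefulSet controller and can still report the old value when reconcile runs again.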