From e16fde678883be2b9d12e1202e2622eba30ea50f Mon Sep 17 00:00:00 2001
From: Geoffrey Beausire
Date: Tue, 2 Jan 2024 14:34:47 +0100
Subject: [PATCH] Alleviate race conditions in roll restart reconciler

When a roll restart is triggered with at least 2 pools, a race
condition can trigger the rolling restart of a pod in each pool. This
can lead to a red cluster.

Normally, to prevent this from happening, there are 3 checks:
- check the status.ReadyReplicas of all sts before moving forward.
- for each nodePool, check that all replicas are ready by listing
  pods directly
- before deleting a pod, a check is made against OpenSearch to see if
  the cluster is healthy

In practice, this is not enough. Consider the rollRestart of 2
nodePools:
- data
- masterData

The following sequence can happen:
- a rollRestart is triggered
- the reconcile function is called
- data and masterData have all their pods ready
- a data pod is deleted; the pod is terminating (NOT terminated yet)
- the reconcile function is called again
- data and masterData still have all their pods ready from the
  status.ReadyReplicas point of view (because the status has not seen
  the change yet)
- data is seen as unhealthy thanks to CountRunningPodsForNodePool
- masterData is seen as healthy because all its pods are ready
- OpenSearch is still healthy, because the deleted pod is not
  terminated yet
- a pod in masterData is restarted
- the cluster is red!

This commit makes sure we check the readiness of all nodePools using
CountRunningPodsForNodePool before trying to restart any pool.

Signed-off-by: Geoffrey Beausire
---
 .../pkg/reconcilers/rollingRestart.go | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/opensearch-operator/pkg/reconcilers/rollingRestart.go b/opensearch-operator/pkg/reconcilers/rollingRestart.go
index 81e8be416..70c071217 100644
--- a/opensearch-operator/pkg/reconcilers/rollingRestart.go
+++ b/opensearch-operator/pkg/reconcilers/rollingRestart.go
@@ -135,13 +135,26 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
 	}
 
 	// Check that all nodes of all pools are ready before doing work
-	for _, sts := range statefulSets {
+	for _, nodePool := range r.instance.Spec.NodePools {
+		sts, err := r.client.GetStatefulSet(builders.StsName(r.instance, &nodePool), r.instance.Namespace)
+		if err != nil {
+			return ctrl.Result{}, err
+		}
 		if sts.Status.ReadyReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) {
 			return ctrl.Result{
 				Requeue:      true,
 				RequeueAfter: 10 * time.Second,
 			}, nil
 		}
+		// CountRunningPodsForNodePool provides a more consistent view of the sts: the status of a
+		// StatefulSet is eventually consistent, while listing pods directly is more up to date.
+		numReadyPods, err := helpers.CountRunningPodsForNodePool(r.client, r.instance, &nodePool)
+		if err != nil || numReadyPods != int(pointer.Int32Deref(sts.Spec.Replicas, 1)) {
+			return ctrl.Result{
+				Requeue:      true,
+				RequeueAfter: 10 * time.Second,
+			}, nil
+		}
 	}
 
 	// Skip a rolling restart if the cluster hasn't finished initializing
@@ -173,13 +186,9 @@ func (r *RollingRestartReconciler) Reconcile() (ctrl.Result, error) {
 		}
 
 		if sts.Status.UpdateRevision != "" &&
 			sts.Status.UpdatedReplicas != pointer.Int32Deref(sts.Spec.Replicas, 1) {
-			// Only restart pods if not all pods are updated and the sts is healthy with no pods terminating
-			if sts.Status.ReadyReplicas == pointer.Int32Deref(sts.Spec.Replicas, 1) {
-				if numReadyPods, err := helpers.CountRunningPodsForNodePool(r.client, r.instance, &nodePool); err == nil && numReadyPods == int(pointer.Int32Deref(sts.Spec.Replicas, 1)) {
-					lg.Info(fmt.Sprintf("Starting rolling restart of the StatefulSet %s", sts.Name))
-					return r.restartStatefulSetPod(&sts)
-				}
-			}
+
+			lg.Info(fmt.Sprintf("Starting rolling restart of the StatefulSet %s", sts.Name))
+			return r.restartStatefulSetPod(&sts)
 		}
 	}
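
For illustration only, outside the patch itself: below is a minimal, hypothetical Go sketch of the pattern the fix relies on, namely counting running, non-terminating pods for every node pool before allowing a restart in any pool, instead of trusting the eventually consistent sts.Status.ReadyReplicas. The package, the functions countRunningPods and allNodePoolsReady, and the label selector are assumptions made up for this sketch; they are not the operator's real helpers (the actual check is helpers.CountRunningPodsForNodePool in the diff above).

package restartcheck

import (
	"context"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/utils/pointer"
)

// countRunningPods lists the pods of one node pool and counts only those that
// are Running and not terminating (DeletionTimestamp unset). A freshly deleted
// pod is therefore excluded immediately, before the StatefulSet status catches up.
func countRunningPods(ctx context.Context, c kubernetes.Interface, namespace, pool string) (int, error) {
	// The label key is an assumption for this sketch; the real operator uses its own pool labels.
	pods, err := c.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: fmt.Sprintf("opensearch-nodepool=%s", pool),
	})
	if err != nil {
		return 0, err
	}
	running := 0
	for _, p := range pods.Items {
		if p.Status.Phase == corev1.PodRunning && p.DeletionTimestamp == nil {
			running++
		}
	}
	return running, nil
}

// allNodePoolsReady returns true only if every pool's running pod count matches
// the desired replica count of its StatefulSet. A pod still terminating in one
// pool therefore blocks restarts in every other pool.
func allNodePoolsReady(ctx context.Context, c kubernetes.Interface, namespace string, stsByPool map[string]*appsv1.StatefulSet) (bool, error) {
	for pool, sts := range stsByPool {
		running, err := countRunningPods(ctx, c, namespace, pool)
		if err != nil {
			return false, err
		}
		if running != int(pointer.Int32Deref(sts.Spec.Replicas, 1)) {
			return false, nil
		}
	}
	return true, nil
}

The point of the design is that the pod list reflects a deletion as soon as the pod gets a DeletionTimestamp, whereas sts.Status.ReadyReplicas is updated asynchronously by the StatefulSet controller and can still report the old value when reconcile runs again.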