
[Serve] Deflake test_metrics #47750

Closed · GeneDer wants to merge 36 commits into master from deflak-test-metrics

Changes from all commits · 36 commits
a0d6bd0  right size tests (GeneDer, Sep 19, 2024)
20bb4b5  trigger another build (GeneDer, Sep 20, 2024)
e688c20  factor out test_metrics on its own and use large-sized test (GeneDer, Sep 20, 2024)
ecbc2f6  fix (GeneDer, Sep 20, 2024)
320d1ba  fix tag (GeneDer, Sep 20, 2024)
1870bdc  Merge branch 'master' into deflak-test-metrics (GeneDer, Sep 20, 2024)
70fbb9d  fix kwargs (GeneDer, Sep 20, 2024)
9165510  try again (GeneDer, Sep 20, 2024)
d15d0d0  test again (GeneDer, Sep 20, 2024)
2ffefce  test again (GeneDer, Sep 20, 2024)
8e22021  test again (GeneDer, Sep 20, 2024)
efceb03  revert change and add logic to clean up metrics between tests (GeneDer, Sep 24, 2024)
eb15873  lint (GeneDer, Sep 24, 2024)
9502dd4  check health for prometheus before cleanup (GeneDer, Sep 25, 2024)
71d1336  refactor clean up metrics as a fixture (GeneDer, Sep 25, 2024)
dab214c  test again (GeneDer, Sep 25, 2024)
22f9145  test again (GeneDer, Sep 25, 2024)
46ce3a0  test again (GeneDer, Sep 25, 2024)
3e63207  test again (GeneDer, Sep 25, 2024)
0a5cbf0  clean up serve and ray before and after the tests (GeneDer, Sep 28, 2024)
04ee77c  try again (GeneDer, Sep 28, 2024)
869594b  try again (GeneDer, Sep 28, 2024)
0838d2a  Merge branch 'master' into deflak-test-metrics (GeneDer, Sep 30, 2024)
25cc12a  Merge branch 'master' into deflak-test-metrics (GeneDer, Sep 30, 2024)
ff1a839  Merge branch 'master' into deflak-test-metrics (GeneDer, Oct 1, 2024)
1482c03  try again (GeneDer, Oct 1, 2024)
e9a88c6  try again (GeneDer, Oct 2, 2024)
747d479  only decrement num_scheduling_tasks_in_backoff if it's greater than 0 (GeneDer, Oct 2, 2024)
79fce0e  try again (GeneDer, Oct 2, 2024)
d924b7e  try again (GeneDer, Oct 3, 2024)
650452c  try again (GeneDer, Oct 3, 2024)
e0aa69c  wait for proxies to be healthy before starting any tests (GeneDer, Oct 3, 2024)
64d88a7  try again (GeneDer, Oct 3, 2024)
853662d  Merge branch 'master' into deflak-test-metrics (GeneDer, Oct 4, 2024)
709a0a9  try again (GeneDer, Oct 4, 2024)
00a45bf  try again (GeneDer, Oct 4, 2024)
1 change: 0 additions & 1 deletion python/ray/serve/tests/BUILD
@@ -443,4 +443,3 @@ py_test_module_list(
"//python/ray/serve:serve_lib",
],
)

87 changes: 64 additions & 23 deletions python/ray/serve/tests/test_metrics.py
@@ -16,6 +16,7 @@
fetch_prometheus_metrics,
wait_for_condition,
)
from ray.serve._private.common import ProxyStatus
from ray.serve._private.constants import DEFAULT_LATENCY_BUCKET_MS
from ray.serve._private.long_poll import LongPollHost, UpdatedObject
from ray.serve._private.test_utils import (
@@ -35,6 +36,10 @@
@pytest.fixture
def serve_start_shutdown():
"""Fixture provides a fresh Ray cluster to prevent metrics state sharing."""
Contributor: Hm, shouldn't this be sufficient on its own? The Prometheus endpoint is the raylet, so if Ray is shut down between runs there should be no state sharing. What am I missing?

Contributor (author): My expectation is that something isn't being cleaned up between those tests, and in fact adding those calls seems to have helped. Thinking it through again, maybe adding some sleep in between would help in the same way, and maybe the issue is that Serve and/or Ray wasn't completely shut down before the next test started? 🤔 Let me do some more experiments.

Contributor: We should not add any sleeps -- if we need to wait for anything to clean up, then explicitly wait for the cleanup to happen. Sleeps are what make things flaky in the first place.
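
A minimal sketch of that explicit-wait approach (hypothetical helper name; assumes wait_for_condition comes from ray._private.test_utils, matching the import at the top of this file). The fixture below ended up using plain while loops instead:

import ray
from ray import serve
from ray._private.test_utils import wait_for_condition


def shutdown_serve_and_ray():
    # Serve tears down its controller, proxies, and applications
    # asynchronously, so poll the status report until nothing is left
    # rather than sleeping for a fixed interval.
    serve.shutdown()
    wait_for_condition(
        lambda: len(serve.status().proxies) == 0
        and len(serve.status().applications) == 0,
        timeout=30,
    )
    # ray.shutdown() disconnects the driver from the local Ray instance;
    # verify it took effect before the next test calls ray.init().
    ray.shutdown()
    assert not ray.is_initialized()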

    while len(serve.status().proxies) > 0 or len(serve.status().applications) > 0:
        serve.shutdown()
    while ray.is_initialized():
        ray.shutdown()
    ray.init(
        _metrics_export_port=TEST_METRICS_EXPORT_PORT,
        _system_config={
@@ -53,11 +58,27 @@ def serve_start_shutdown():
            grpc_servicer_functions=grpc_servicer_functions,
        ),
    )
    serve.shutdown()
    ray.shutdown()
    while len(serve.status().proxies) > 0 or len(serve.status().applications) > 0:
        serve.shutdown()
    while ray.is_initialized():
        ray.shutdown()
    ray._private.utils.reset_ray_address()


@pytest.fixture
def wait_for_health_proxies():
    def check():
        return all(
            [
                status == ProxyStatus.HEALTHY
                for status in serve.status().proxies.values()
            ]
        )

    wait_for_condition(check)
    yield
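
Note: pytest runs this fixture before each test that lists it as an argument, so the test body only starts once every proxy reports ProxyStatus.HEALTHY; the bare yield means the fixture does no teardown work after the test.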


def extract_tags(line: str) -> Dict[str, str]:
"""Extracts any tags from the metrics line."""

@@ -193,7 +214,9 @@ def metric_available() -> bool:
return metric_dicts


def test_serve_metrics_for_successful_connection(serve_start_shutdown):
def test_serve_metrics_for_successful_connection(
    serve_start_shutdown, wait_for_health_proxies
):
@serve.deployment(name="metrics")
async def f(request):
return "hello"
@@ -258,7 +281,7 @@ def verify_metrics(do_assert=False):
verify_metrics(do_assert=True)


def test_http_replica_gauge_metrics(serve_start_shutdown):
def test_http_replica_gauge_metrics(serve_start_shutdown, wait_for_health_proxies):
"""Test http replica gauge metrics"""
signal = SignalActor.remote()

@@ -291,7 +314,7 @@ def ensure_request_processing():
wait_for_condition(ensure_request_processing, timeout=5)


def test_proxy_metrics_not_found(serve_start_shutdown):
def test_proxy_metrics_not_found(serve_start_shutdown, wait_for_health_proxies):
# NOTE: These metrics should be documented at
# https://docs.ray.io/en/latest/serve/monitoring.html#metrics
# Any updates here should be reflected there too.
@@ -333,7 +356,7 @@ def verify_metrics(_expected_metrics, do_assert=False):
verify_metrics,
retry_interval_ms=1000,
timeout=10,
expected_metrics=expected_metrics,
_expected_metrics=expected_metrics,
)
except RuntimeError:
verify_metrics(expected_metrics, True)
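
For context on the _expected_metrics= rename (the "fix kwargs" commit): wait_for_condition forwards any extra keyword arguments to the predicate it polls, so the keyword must match the predicate's parameter name exactly. A minimal sketch of the pattern, with a placeholder predicate and a hypothetical metric list, again assuming wait_for_condition from ray._private.test_utils:

from ray._private.test_utils import wait_for_condition


def verify_metrics(_expected_metrics, do_assert=False):
    # Placeholder predicate; the real test scrapes Prometheus output and
    # checks that every name in _expected_metrics appears there.
    return True


# Extra kwargs are passed through to the predicate on every retry. With the
# old spelling expected_metrics=, each call would fail with a TypeError, the
# retry loop would swallow it, and the wait would time out with the
# RuntimeError handled above.
wait_for_condition(
    verify_metrics,
    retry_interval_ms=1000,
    timeout=10,
    _expected_metrics=["serve_num_http_requests"],  # hypothetical value
)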
@@ -383,7 +406,7 @@ def verify_error_count(do_assert=False):
verify_error_count(do_assert=True)


def test_proxy_metrics_internal_error(serve_start_shutdown):
def test_proxy_metrics_internal_error(serve_start_shutdown, wait_for_health_proxies):
# NOTE: These metrics should be documented at
# https://docs.ray.io/en/latest/serve/monitoring.html#metrics
# Any updates here should be reflected there too.
@@ -434,7 +457,7 @@ async def __call__(self, *args):
verify_metrics,
retry_interval_ms=1000,
timeout=10,
expected_metrics=expected_metrics,
_expected_metrics=expected_metrics,
)
except RuntimeError:
verify_metrics(expected_metrics, True)
@@ -478,7 +501,7 @@ def verify_error_count(do_assert=False):
verify_error_count(do_assert=True)


def test_proxy_metrics_fields_not_found(serve_start_shutdown):
def test_proxy_metrics_fields_not_found(serve_start_shutdown, wait_for_health_proxies):
"""Tests the proxy metrics' fields' behavior for not found."""

# Should generate 404 responses
@@ -522,7 +545,9 @@ def test_proxy_metrics_fields_not_found(serve_start_shutdown):
print("serve_num_grpc_error_requests working as expected.")


def test_proxy_metrics_fields_internal_error(serve_start_shutdown):
def test_proxy_metrics_fields_internal_error(
    serve_start_shutdown, wait_for_health_proxies
):
"""Tests the proxy metrics' fields' behavior for internal error."""

@serve.deployment()
@@ -583,7 +608,7 @@ def f(*args):
print("serve_grpc_request_latency_ms_sum working as expected.")


def test_replica_metrics_fields(serve_start_shutdown):
def test_replica_metrics_fields(serve_start_shutdown, wait_for_health_proxies):
"""Test replica metrics fields"""

@serve.deployment
@@ -722,7 +747,9 @@ def verify_metrics(self, metric, expected_output):
for key in expected_output:
assert metric[key] == expected_output[key]

def test_request_context_pass_for_http_proxy(self, serve_start_shutdown):
def test_request_context_pass_for_http_proxy(
    self, serve_start_shutdown, wait_for_health_proxies
):
"""Test HTTP proxy passing request context"""

@serve.deployment(graceful_shutdown_timeout_s=0.001)
@@ -816,7 +843,9 @@ def check():
assert metrics_app_name["g"] == "app2", msg
assert metrics_app_name["h"] == "app3", msg

def test_request_context_pass_for_grpc_proxy(self, serve_start_shutdown):
def test_request_context_pass_for_grpc_proxy(
    self, serve_start_shutdown, wait_for_health_proxies
):
"""Test gRPC proxy passing request context"""

@serve.deployment(graceful_shutdown_timeout_s=0.001)
@@ -912,7 +941,9 @@ def check():
assert metrics_app_name[depl_name2] == "app2", msg
assert metrics_app_name[depl_name3] == "app3", msg

def test_request_context_pass_for_handle_passing(self, serve_start_shutdown):
def test_request_context_pass_for_handle_passing(
    self, serve_start_shutdown, wait_for_health_proxies
):
"""Test handle passing contexts between replicas"""

@serve.deployment
@@ -970,7 +1001,9 @@ async def app2(self):
assert requests_metrics_app_name["g1"] == "app"
assert requests_metrics_app_name["g2"] == "app"

def test_customer_metrics_with_context(self, serve_start_shutdown):
def test_customer_metrics_with_context(
    self, serve_start_shutdown, wait_for_health_proxies
):
@serve.deployment
class Model:
def __init__(self):
@@ -1062,7 +1095,9 @@ def __call__(self):
self.verify_metrics(histogram_metrics[0], expected_metrics)

@pytest.mark.parametrize("use_actor", [False, True])
def test_serve_metrics_outside_serve(self, use_actor, serve_start_shutdown):
def test_serve_metrics_outside_serve(
    self, use_actor, serve_start_shutdown, wait_for_health_proxies
):
"""Make sure ray.serve.metrics work in ray actor"""
if use_actor:

@@ -1186,7 +1221,7 @@ async def __call__(self):
self.verify_metrics(histogram_metrics[0], expected_metrics)


def test_multiplexed_metrics(serve_start_shutdown):
def test_multiplexed_metrics(serve_start_shutdown, wait_for_health_proxies):
"""Tests multiplexed API corresponding metrics."""

@serve.deployment
@@ -1261,7 +1296,7 @@ async def call(self, *args):


class TestHandleMetrics:
def test_queued_queries_basic(self, serve_start_shutdown):
def test_queued_queries_basic(self, serve_start_shutdown, wait_for_health_proxies):
signal = SignalActor.options(name="signal123").remote()
serve.run(WaitForSignal.options(max_ongoing_requests=1).bind(), name="app1")

@@ -1290,7 +1325,9 @@ def test_queued_queries_basic(self, serve_start_shutdown):
expected=0,
)

def test_queued_queries_multiple_handles(self, serve_start_shutdown):
def test_queued_queries_multiple_handles(
    self, serve_start_shutdown, wait_for_health_proxies
):
signal = SignalActor.options(name="signal123").remote()
serve.run(WaitForSignal.options(max_ongoing_requests=1).bind(), name="app1")

@@ -1330,7 +1367,9 @@ def test_queued_queries_multiple_handles(self, serve_start_shutdown):
expected=0,
)

def test_queued_queries_disconnected(self, serve_start_shutdown):
def test_queued_queries_disconnected(
    self, serve_start_shutdown, wait_for_health_proxies
):
"""Check that disconnected queued queries are tracked correctly."""

signal = SignalActor.remote()
@@ -1471,7 +1510,9 @@ def do_request():
# Unblock hanging request.
ray.get(signal.send.remote())

def test_running_requests_gauge(self, serve_start_shutdown):
def test_running_requests_gauge(
    self, serve_start_shutdown, wait_for_health_proxies
):
signal = SignalActor.options(name="signal123").remote()
serve.run(
Router.options(num_replicas=2, ray_actor_options={"num_cpus": 0}).bind(
@@ -1531,7 +1572,7 @@ def test_running_requests_gauge(self, serve_start_shutdown):
)


def test_long_poll_host_sends_counted(serve_instance):
def test_long_poll_host_sends_counted(serve_start_shutdown, wait_for_health_proxies):
"""Check that the transmissions by the long_poll are counted."""

host = ray.remote(LongPollHost).remote(
@@ -1588,7 +1629,7 @@ def test_long_poll_host_sends_counted(serve_instance):
)


def test_actor_summary(serve_instance):
def test_actor_summary(serve_start_shutdown, wait_for_health_proxies):
@serve.deployment
def f():
pass
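
Note: test_long_poll_host_sends_counted and test_actor_summary previously used the shared serve_instance fixture; switching them to serve_start_shutdown (with wait_for_health_proxies) gives each test a fresh Ray cluster, so metrics emitted by earlier tests cannot leak into their assertions.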