Merge remote-tracking branch 'origin/master' into flamegraph-on-lag
Signed-off-by: Ruiyang Wang <[email protected]>
Showing 507 changed files with 13,171 additions and 6,670 deletions.
doc/source/cluster/kubernetes/examples/vllm-rayservice.md (117 additions, 0 deletions)
(kuberay-vllm-rayservice-example)=

# Serve a Large Language Model with vLLM on Kubernetes

This guide demonstrates how to [Serve a Large Language Model with vLLM](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html) on Kubernetes using KubeRay. The example in this guide deploys the `meta-llama/Meta-Llama-3-8B-Instruct` model from Hugging Face on Google Kubernetes Engine (GKE).

## Prerequisites

This example downloads the model weights from Hugging Face. Complete the following prerequisites before starting this guide:
* A [Hugging Face account](https://huggingface.co/)
* A Hugging Face [access token](https://huggingface.co/docs/hub/security-tokens) with read access to gated repos (see the optional check after this list).
* Access to the Llama 3 8B model, which usually requires signing an agreement on Hugging Face. See the [Llama 3 model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B) for more details.

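Optionally, you can confirm that your token can see the gated repo before continuing. This is a minimal check that assumes the `HF_TOKEN` environment variable holds your access token and uses the public Hugging Face Hub REST API:

```sh
# Returns model metadata if the token has access to the gated repo; an error message otherwise.
curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
  https://huggingface.co/api/models/meta-llama/Meta-Llama-3-8B-Instruct
```
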
## Create a Kubernetes cluster on GKE

Create a GKE cluster with a GPU node pool:
```sh
gcloud container clusters create kuberay-gpu-cluster \
    --machine-type=g2-standard-24 \
    --location=us-east4-c \
    --num-nodes=2 \
    --accelerator=type=nvidia-l4,count=2,gpu-driver-version=latest
```

This example uses L4 GPUs. Each model replica uses 2 L4 GPUs via vLLM's tensor parallelism.
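
If `kubectl` isn't already pointing at the new cluster, fetch its credentials. This is a minimal sketch that reuses the cluster name and location from the command above, so adjust them if you chose different values:

```sh
# Configure kubectl to target the newly created GKE cluster.
gcloud container clusters get-credentials kuberay-gpu-cluster --location=us-east4-c
```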

## Install the KubeRay Operator

Follow [Deploy a KubeRay operator](kuberay-operator-deploy) to install the latest stable KubeRay operator from the Helm repository.
If the taint on the GPU node pool is set up correctly, the KubeRay operator Pod schedules onto a CPU node.
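
For reference, the Helm-based install from that guide typically looks like the following sketch; check the linked page for the current chart version before running it:

```sh
# Add the KubeRay Helm repository and install the operator chart.
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm repo update
# Replace 1.1.1 with the latest stable KubeRay release.
helm install kuberay-operator kuberay/kuberay-operator --version 1.1.1
```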

## Create a Kubernetes Secret containing your Hugging Face access token

Create a Kubernetes Secret containing your Hugging Face access token:
```sh
export HF_TOKEN=<Hugging Face access token>
kubectl create secret generic hf-secret --from-literal=hf_api_token=${HF_TOKEN} --dry-run=client -o yaml | kubectl apply -f -
```

This guide references this Secret as an environment variable in the RayCluster that the next steps create.
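
To double-check that the Secret holds the value you expect, you can decode it; note that this prints the token to your terminal:

```sh
# Decode the stored token to confirm the Secret was created correctly.
kubectl get secret hf-secret -o jsonpath='{.data.hf_api_token}' | base64 --decode
```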

## Deploy a RayService

Create a RayService custom resource:
```sh
kubectl apply -f https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/vllm/ray-service.vllm.yaml
```

This step configures RayService to deploy a Ray Serve app, running vLLM as the serving engine for the Llama 3 8B Instruct model. You can find the code for this example [on GitHub](https://github.com/ray-project/kuberay/blob/master/ray-operator/config/samples/vllm/serve.py).
You can inspect the Serve config for more details about the Serve deployment:
```yaml
serveConfigV2: |
  applications:
  - name: llm
    route_prefix: /
    import_path: ray-operator.config.samples.vllm.serve:model
    deployments:
    - name: VLLMDeployment
      num_replicas: 1
      ray_actor_options:
        num_cpus: 8
        # NOTE: num_gpus is set automatically based on TENSOR_PARALLELISM
    runtime_env:
      working_dir: "https://github.com/ray-project/kuberay/archive/master.zip"
      pip: ["vllm==0.5.4"]
      env_vars:
        MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
        TENSOR_PARALLELISM: "2"
```
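
While the RayService starts up, the head and worker Pods pull images and download the model weights, which can take a while. You can watch them come up with a standard watch:

```sh
# Watch the Ray head and worker Pods created for the RayService.
kubectl get pods -w
```
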
Wait for the RayService resource to be ready. You can inspect its status by running the following command:
```sh
$ kubectl get rayservice llama-3-8b -o yaml
```

The output should contain the following:
```yaml
status:
  activeServiceStatus:
    applicationStatuses:
      llm:
        healthLastUpdateTime: "2024-08-08T22:56:50Z"
        serveDeploymentStatuses:
          VLLMDeployment:
            healthLastUpdateTime: "2024-08-08T22:56:50Z"
            status: HEALTHY
        status: RUNNING
```

## Send a prompt

Confirm that the Ray Serve deployment is healthy, then establish a port-forwarding session for the Serve app:

```sh
$ kubectl port-forward svc/llama-3-8b-serve-svc 8000
```

Note that KubeRay creates this Kubernetes Service after the Serve apps are ready and running.
This process may take several minutes after all Pods in the RayCluster are running.
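
If the port-forward command fails because the Service doesn't exist yet, you can poll for it:

```sh
# The llama-3-8b-serve-svc Service appears once the Serve app reports ready.
kubectl get service llama-3-8b-serve-svc
```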

Now you can send a prompt to the model:
```sh
$ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Provide a brief sentence describing the Ray open-source project."}
      ],
      "temperature": 0.7
    }'
```

The output should be similar to the following, containing the generated response from the model:
```json
{"id":"cmpl-ce6585cd69ed47638b36ddc87930fded","object":"chat.completion","created":1723161873,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"The Ray open-source project is a high-performance distributed computing framework that allows users to scale Python applications and machine learning models to thousands of nodes, supporting distributed data processing, distributed machine learning, and distributed analytics."},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":32,"total_tokens":74,"completion_tokens":42}}
```