From bc31e524105dd1f7992a25686c458291e4632cfe Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Mon, 1 Apr 2024 23:08:37 +0800
Subject: [PATCH 1/3] fix incorrect gpu ids if placement group bundle index
 specified

Signed-off-by: wuxibin
---
 python/ray/_private/worker.py            | 13 +++++--
 python/ray/runtime_context.py            |  8 ++++-
 python/ray/tests/test_actor_resources.py | 44 ++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py
index 3b376e194fccd..93bfd9accaaa8 100644
--- a/python/ray/_private/worker.py
+++ b/python/ray/_private/worker.py
@@ -994,6 +994,13 @@ def get_gpu_ids() -> Union[List[int], List[str]]:
     """
     worker = global_worker
     worker.check_connected()
+    # respect placement_group_bundle_index if specified.
+    gpu_ids = worker.get_accelerator_ids_for_accelerator_resource(
+        ray_constants.GPU, f"^{ray_constants.GPU}_group_[0-9]+_[0-9A-Za-z]+$"
+    )
+    if len(gpu_ids) != 0:
+        return gpu_ids
+
     return worker.get_accelerator_ids_for_accelerator_resource(
         ray_constants.GPU, f"^{ray_constants.GPU}_group_[0-9A-Za-z]+$"
     )
@@ -2671,9 +2678,9 @@ def get(
             port=None,
             patch_stdstreams=False,
             quiet=None,
-            breakpoint_uuid=debugger_breakpoint.decode()
-            if debugger_breakpoint
-            else None,
+            breakpoint_uuid=(
+                debugger_breakpoint.decode() if debugger_breakpoint else None
+            ),
             debugger_external=worker.ray_debugger_external,
         )
         rdb.set_trace(frame=frame)
diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py
index e1ee610fbf458..83ebfa5639e2e 100644
--- a/python/ray/runtime_context.py
+++ b/python/ray/runtime_context.py
@@ -420,10 +420,16 @@ def get_accelerator_ids(self) -> Dict[str, List[str]]:
         for (
             accelerator_resource_name
         ) in ray._private.accelerators.get_all_accelerator_resource_names():
+            # respect placement_group_bundle_index if specified.
             accelerator_ids = worker.get_accelerator_ids_for_accelerator_resource(
                 accelerator_resource_name,
-                f"^{accelerator_resource_name}_group_[0-9A-Za-z]+$",
+                f"^{accelerator_resource_name}_group_[0-9]+_[0-9A-Za-z]+$",
             )
+            if len(accelerator_ids) == 0:
+                accelerator_ids = worker.get_accelerator_ids_for_accelerator_resource(
+                    accelerator_resource_name,
+                    f"^{accelerator_resource_name}_group_[0-9A-Za-z]+$",
+                )
             ids_dict[accelerator_resource_name] = [str(id) for id in accelerator_ids]
         return ids_dict
diff --git a/python/ray/tests/test_actor_resources.py b/python/ray/tests/test_actor_resources.py
index 27243052779f1..f9c3c7f86095d 100644
--- a/python/ray/tests/test_actor_resources.py
+++ b/python/ray/tests/test_actor_resources.py
@@ -7,6 +7,7 @@
 
 import ray
 import ray.cluster_utils
+from ray.util.placement_group import placement_group, PlacementGroupSchedulingStrategy
 
 
 def test_actor_deletion_with_gpus(shutdown_only):
@@ -678,6 +679,49 @@ def get_cuda_visible_devices(self):
     assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1"
 
 
+def test_actor_cuda_visible_devices_placement_group_bundle_index(shutdown_only):
+    ray.init(num_cpus=4, num_gpus=2)
+
+    @ray.remote
+    class Actor:
+        def __init__(self) -> None:
+            self.gpu_ids = ray.get_gpu_ids()
+
+        def get_cuda_visible_devices(self):
+            gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"]
+            assert int(gpu_ids) == self.gpu_ids[0]
+            return gpu_ids
+
+    bundles = [{"GPU": 1, "CPU": 2}] * 2
+    pg = placement_group(bundles)
+    pg.ready()
+
+    m1 = [
+        Actor.options(
+            num_gpus=0.1,
+            scheduling_strategy=PlacementGroupSchedulingStrategy(
+                placement_group=pg, placement_group_bundle_index=i
+            ),
+        ).remote()
+        for i in range(2)
+    ]
+
+    m2 = [
+        Actor.options(
+            num_gpus=0.1,
+            scheduling_strategy=PlacementGroupSchedulingStrategy(
+                placement_group=pg, placement_group_bundle_index=i
+            ),
+        ).remote()
+        for i in range(2)
+    ]
+
+    assert ray.get(m1[0].get_cuda_visible_devices.remote()) == "0"
+    assert ray.get(m1[1].get_cuda_visible_devices.remote()) == "1"
+    assert ray.get(m2[0].get_cuda_visible_devices.remote()) == "0"
+    assert ray.get(m2[1].get_cuda_visible_devices.remote()) == "1"
+
+
 if __name__ == "__main__":
     import pytest
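
For context on what PATCH 1/3 fixes: when actors are pinned to specific bundles
of a placement group, each actor should see the GPU ids of its own bundle. A
minimal sketch of the scenario (the actor class, resource amounts, and bundle
shapes here are illustrative, not taken from the patch):

import os

import ray
from ray.util.placement_group import (
    placement_group,
    PlacementGroupSchedulingStrategy,
)

ray.init(num_gpus=2)

@ray.remote(num_gpus=1)
class Worker:
    def report(self):
        # Before the fix, the bundle-indexed placement group resource was
        # ignored, so replicas could report GPU ids that did not correspond
        # to their own bundle.
        return ray.get_gpu_ids(), os.environ.get("CUDA_VISIBLE_DEVICES")

pg = placement_group([{"GPU": 1}, {"GPU": 1}])
ray.get(pg.ready())

workers = [
    Worker.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg, placement_group_bundle_index=i
        )
    ).remote()
    for i in range(2)
]
# After the fix, each worker reports the GPU of its own bundle.
print(ray.get([w.report.remote() for w in workers]))
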
From 5225b5284ed77cfe4ee5e1363113bcfc2f9ee64c Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Mon, 15 Apr 2024 11:58:36 +0800
Subject: [PATCH 2/3] fix comment

---
 python/ray/_private/worker.py | 2 +-
 python/ray/runtime_context.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py
index 93bfd9accaaa8..feb87ac17d675 100644
--- a/python/ray/_private/worker.py
+++ b/python/ray/_private/worker.py
@@ -998,7 +998,7 @@ def get_gpu_ids() -> Union[List[int], List[str]]:
     gpu_ids = worker.get_accelerator_ids_for_accelerator_resource(
         ray_constants.GPU, f"^{ray_constants.GPU}_group_[0-9]+_[0-9A-Za-z]+$"
     )
-    if len(gpu_ids) != 0:
+    if not gpu_ids:
         return gpu_ids
 
     return worker.get_accelerator_ids_for_accelerator_resource(
diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py
index 83ebfa5639e2e..944845fa23193 100644
--- a/python/ray/runtime_context.py
+++ b/python/ray/runtime_context.py
@@ -425,7 +425,7 @@ def get_accelerator_ids(self) -> Dict[str, List[str]]:
                 accelerator_resource_name,
                 f"^{accelerator_resource_name}_group_[0-9]+_[0-9A-Za-z]+$",
             )
-            if len(accelerator_ids) == 0:
+            if not accelerator_ids:
                 accelerator_ids = worker.get_accelerator_ids_for_accelerator_resource(
                     accelerator_resource_name,
                     f"^{accelerator_resource_name}_group_[0-9A-Za-z]+$",
                 )
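
A note on the two resource-name patterns the patches distinguish: a placement
group registers a bundle-indexed resource of the form
{resource}_group_{bundle_index}_{pg_id} and a wildcard resource of the form
{resource}_group_{pg_id}. Because the character class [0-9A-Za-z] excludes
underscores, the two regexes are mutually exclusive, which is what makes the
"indexed first, wildcard as fallback" lookup safe. A quick self-contained
check (the pg_id value below is made up for illustration):

import re

GPU = "GPU"
# Bundle-indexed placement group resource: GPU_group_<bundle_index>_<pg_id>.
indexed = f"^{GPU}_group_[0-9]+_[0-9A-Za-z]+$"
# Wildcard placement group resource: GPU_group_<pg_id>.
wildcard = f"^{GPU}_group_[0-9A-Za-z]+$"

pg_id = "4482dec0faaf5ed1a1b1b2aeee233500"  # illustrative id
assert re.match(indexed, f"GPU_group_0_{pg_id}")
assert not re.match(wildcard, f"GPU_group_0_{pg_id}")  # "_" not in [0-9A-Za-z]
assert re.match(wildcard, f"GPU_group_{pg_id}")
assert not re.match(indexed, f"GPU_group_{pg_id}")
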
From b634954669559aa51e7827b7387814853b19fc7d Mon Sep 17 00:00:00 2001
From: wuxibin
Date: Mon, 15 Apr 2024 12:11:08 +0800
Subject: [PATCH 3/3] fix comment

---
 python/ray/_private/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py
index feb87ac17d675..20317baf553d9 100644
--- a/python/ray/_private/worker.py
+++ b/python/ray/_private/worker.py
@@ -998,7 +998,7 @@ def get_gpu_ids() -> Union[List[int], List[str]]:
     gpu_ids = worker.get_accelerator_ids_for_accelerator_resource(
         ray_constants.GPU, f"^{ray_constants.GPU}_group_[0-9]+_[0-9A-Za-z]+$"
    )
-    if not gpu_ids:
+    if gpu_ids:
         return gpu_ids
 
     return worker.get_accelerator_ids_for_accelerator_resource(
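
Taken together, the three patches leave get_gpu_ids with the control flow
sketched below (a paraphrase of the final state, not the literal Ray source).
Note that PATCH 2/3 briefly inverted the guard while switching to a truthiness
check — "if not gpu_ids: return gpu_ids" would have returned the empty list
whenever the bundle-indexed lookup missed — and PATCH 3/3 restores the
intended short-circuit:

def lookup_gpu_ids(worker, GPU="GPU"):
    # Prefer ids assigned through a bundle-indexed placement group resource
    # (GPU_group_<bundle_index>_<pg_id>) so that a specified
    # placement_group_bundle_index is respected.
    gpu_ids = worker.get_accelerator_ids_for_accelerator_resource(
        GPU, f"^{GPU}_group_[0-9]+_[0-9A-Za-z]+$"
    )
    if gpu_ids:
        return gpu_ids
    # Otherwise fall back to the wildcard placement group resource
    # pattern (GPU_group_<pg_id>).
    return worker.get_accelerator_ids_for_accelerator_resource(
        GPU, f"^{GPU}_group_[0-9A-Za-z]+$"
    )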