Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable usage of multiple CUDA GPUs in PyKokkos #85

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a392821
Merge pull request #72 from kokkos/develop
NaderAlAwar Aug 29, 2022
0a10c60
PyKokkos: enable multi gpu usage in pykokkos
NaderAlAwar Sep 19, 2022
4f699bf
Examples: add multi gpu usage example
NaderAlAwar Sep 19, 2022
bc7661e
BUG: uint32/64 wrapping
tylerjereddy Sep 20, 2022
1c9ba1b
Merge pull request #87 from tylerjereddy/treddy_issue_86
NaderAlAwar Sep 21, 2022
1eaecd9
ENH, TST: more 0-D handling
tylerjereddy Aug 26, 2022
9458ced
MAINT: PR 67 revisions
tylerjereddy Sep 21, 2022
50cc53b
Merge pull request #67 from tylerjereddy/treddy_issue_66
NaderAlAwar Sep 21, 2022
a602359
MAINT, BUG: typing cleanups
tylerjereddy Sep 22, 2022
35726f5
Merge pull request #88 from tylerjereddy/treddy_type_cleanups
NaderAlAwar Sep 22, 2022
52daba8
ENH: pk.ones() to API standard
tylerjereddy Aug 29, 2022
e47c773
MAINT: cleanup after rebase
tylerjereddy Sep 6, 2022
170a053
MAINT: PR 73 revisions
tylerjereddy Sep 22, 2022
b9e54d9
TST, MAINT: test durations
tylerjereddy Sep 23, 2022
b3b927f
Merge pull request #73 from tylerjereddy/treddy_issue_70
NaderAlAwar Sep 23, 2022
251bf08
Merge pull request #89 from tylerjereddy/treddy_runtests_durations
tylerjereddy Sep 23, 2022
5878590
ENH: pk.ones() to API standard
tylerjereddy Aug 29, 2022
cf996b1
ENH: more creation API
tylerjereddy Sep 6, 2022
d19586e
MAINT: PR 81 revisions
tylerjereddy Sep 23, 2022
5ea6a6b
Merge pull request #81 from tylerjereddy/treddy_ones_like_api
NaderAlAwar Sep 26, 2022
e289d48
PyKokkos: enable multi gpu usage in pykokkos
NaderAlAwar Sep 19, 2022
0ea0589
Examples: add multi gpu usage example
NaderAlAwar Sep 19, 2022
5c73042
CppSetup: pass compiler path to script and account for differences in…
NaderAlAwar Sep 27, 2022
eac7334
kokkos_manager: fix constant values when multi gpu is not enabled
NaderAlAwar Sep 27, 2022
15837b0
Runtime: check if multi gpu is enabled before selecting kokkos gpu mo…
NaderAlAwar Sep 27, 2022
399d9cb
Merge branch 'kokkos:main' into multi_gpu
NaderAlAwar Sep 27, 2022
9a708bd
Merge branch 'multi_gpu' of https://github.com/NaderAlAwar/pykokkos i…
NaderAlAwar Sep 27, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/array_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ jobs:
pip install -r requirements.txt
export ARRAY_API_TESTS_MODULE=pykokkos
# only run a subset of the conformance tests to get started
pytest array_api_tests/meta/test_broadcasting.py array_api_tests/meta/test_equality_mapping.py array_api_tests/meta/test_signatures.py array_api_tests/meta/test_special_cases.py array_api_tests/test_constants.py
pytest array_api_tests/meta/test_broadcasting.py array_api_tests/meta/test_equality_mapping.py array_api_tests/meta/test_signatures.py array_api_tests/meta/test_special_cases.py array_api_tests/test_constants.py array_api_tests/meta/test_utils.py array_api_tests/test_creation_functions.py::test_ones array_api_tests/test_creation_functions.py::test_ones_like
60 changes: 60 additions & 0 deletions examples/pykokkos/multi_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pykokkos as pk

import numpy as np
import cupy as cp

# Use CUDA as the default execution space for every kernel in this example.
pk.set_default_space(pk.Cuda)

size = 10000

# Select a device before each allocation so that the two CuPy arrays are
# created while a different device is selected.
# NOTE(review): presumably pk.set_device_id() also changes the device CuPy
# allocates on; the two prints below show where each array actually lives.
pk.set_device_id(0)
cp_arr_0 = cp.arange(size).astype(np.int32)

pk.set_device_id(1)
cp_arr_1 = cp.arange(size).astype(np.int32)

print(cp_arr_0.device)
print(cp_arr_1.device)


# Sum reduction over a view that wraps a CuPy array.
@pk.workunit(cp_arr = pk.ViewTypeInfo(space=pk.CudaSpace))
def reduction_cp(i: int, acc: pk.Acc[int], cp_arr: pk.View1D[int]):
    acc += cp_arr[i]


# Reduce each CuPy array with the matching device selected. The numeric
# suffix of every name is the device id, so the labels printed below
# describe the array that was actually reduced.
pk.set_device_id(1)
cp_view_1 = pk.from_cupy(cp_arr_1)
result_1 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduction_cp, cp_arr=cp_view_1)

pk.set_device_id(0)
cp_view_0 = pk.from_cupy(cp_arr_0)
result_0 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduction_cp, cp_arr=cp_view_0)

print(f"Reducing array 0: {result_0}")
print(f"Reducing array 1: {result_1}")
print(f"Sum: {result_0 + result_1}")

# Same experiment with views allocated by PyKokkos itself, one per device.
pk.set_device_id(0)
view_0 = pk.View((size,), dtype=int)

pk.set_device_id(1)
view_1 = pk.View((size,), dtype=int)


# Fill the view with its own indices: view[i] = i.
@pk.workunit
def init_view(i: int, view: pk.View1D[int]):
    view[i] = i


# Sum reduction over a PyKokkos view.
@pk.workunit
def reduce_view(i: int, acc: pk.Acc[int], view: pk.View1D[int]):
    acc += view[i]


# Initialize and reduce each view with the device it was allocated under
# selected.
pk.set_device_id(0)
pk.parallel_for(pk.RangePolicy(pk.Cuda, 0, size), init_view, view=view_0)
result_0 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduce_view, view=view_0)

pk.set_device_id(1)
pk.parallel_for(pk.RangePolicy(pk.Cuda, 0, size), init_view, view=view_1)
result_1 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduce_view, view=view_1)

print(f"Reducing view 0: {result_0}")
print(f"Reducing view 1: {result_1}")
print(f"Sum: {result_0 + result_1}")
13 changes: 10 additions & 3 deletions pykokkos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
initialize, finalize,
get_default_space, set_default_space,
get_default_precision, set_default_precision,
is_uvm_enabled, enable_uvm, disable_uvm
is_uvm_enabled, enable_uvm, disable_uvm,
set_device_id
)

initialize()
Expand Down Expand Up @@ -46,9 +47,15 @@
exp,
exp2,
isinf,
isnan)
isnan,
equal,
isfinite)
from pykokkos.lib.info import iinfo, finfo
from pykokkos.lib.create import zeros
from pykokkos.lib.create import (zeros,
ones,
ones_like,
full)
from pykokkos.lib.manipulate import reshape
from pykokkos.lib.util import all, any
from pykokkos.lib.constants import e, pi, inf, nan

Expand Down
18 changes: 12 additions & 6 deletions pykokkos/core/compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ PK_REAL="${6}"
KOKKOS_LIB_PATH="${7}"
KOKKOS_INCLUDE_PATH="${8}"
COMPUTE_CAPABILITY="${9}"
LIB_SUFFIX="${10}"
COMPILER_PATH="${11}"
SRC=$(find -name "*.cpp")


Expand All @@ -34,11 +36,11 @@ if [ "${COMPILER}" == "g++" ]; then
-shared \
-fopenmp \
"${SRC}".o -o "${MODULE}" \
"${KOKKOS_LIB_PATH}/libkokkoscontainers.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore.so"
"${KOKKOS_LIB_PATH}/libkokkoscontainers${LIB_SUFFIX}.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore${LIB_SUFFIX}.so"

elif [ "${COMPILER}" == "nvcc" ]; then
"${KOKKOS_LIB_PATH}/../bin/nvcc_wrapper" \
"${COMPILER_PATH}" \
`python3 -m pybind11 --includes` \
-I.. \
-O3 \
Expand All @@ -54,14 +56,18 @@ elif [ "${COMPILER}" == "nvcc" ]; then
-Dpk_exec_space="Kokkos::${EXEC_SPACE}" \
-Dpk_real="${PK_REAL}"

"${KOKKOS_LIB_PATH}/../bin/nvcc_wrapper" \
"${COMPILER_PATH}" \
-I.. \
-O3 \
-shared \
-arch="${COMPUTE_CAPABILITY}" \
--expt-extended-lambda \
-fopenmp \
"${SRC}".o -o "${MODULE}" \
"${KOKKOS_LIB_PATH}/libkokkoscontainers.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore.so"
"${KOKKOS_LIB_PATH}/libkokkoscontainers${LIB_SUFFIX}.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore${LIB_SUFFIX}.so"
fi
2 changes: 1 addition & 1 deletion pykokkos/core/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def compile_entity(
if module_setup.is_compiled():
return

cpp_setup = CppSetup(module_setup.module_file, self.functor_file, self.bindings_file)
cpp_setup = CppSetup(module_setup.module_file, module_setup.gpu_module_files, self.functor_file, self.bindings_file)
translator = StaticTranslator(module_setup.name, self.functor_file, members)

t_start: float = time.perf_counter()
Expand Down
132 changes: 114 additions & 18 deletions pykokkos/core/cpp_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
import shutil
import subprocess
import sys
from types import ModuleType
from typing import List, Tuple


from pykokkos.interface import ExecutionSpace, get_default_layout, get_default_memory_space
from pykokkos.interface import (
ExecutionSpace, get_default_layout, get_default_memory_space,
is_host_execution_space
)
import pykokkos.kokkos_manager as km


Expand All @@ -15,16 +18,18 @@ class CppSetup:
Creates the directory to hold the translation and invokes the compiler
"""

def __init__(self, module_file: str, functor: str, bindings: str):
def __init__(self, module_file: str, gpu_module_files: List[str], functor: str, bindings: str):
"""
CppSetup constructor

:param module_file: the name of the file containing the compiled Python module
:param gpu_module_files: the list of names of the files containing the compiled module for each gpu
:param functor: the name of the generated functor file
:param bindings: the name of the generated bindings file
"""

self.module_file: str = module_file
self.gpu_module_files: List[str] = gpu_module_files
self.functor_file: str = functor
self.bindings_file: str = bindings

Expand Down Expand Up @@ -58,6 +63,8 @@ def compile(
self.write_source(output_dir, functor, bindings)
self.copy_script(output_dir)
self.invoke_script(output_dir, space, enable_uvm, compiler)
if space is ExecutionSpace.Cuda and km.is_multi_gpu_enabled():
self.copy_multi_gpu_kernel(output_dir)


def initialize_directory(self, name: Path) -> None:
Expand Down Expand Up @@ -115,15 +122,17 @@ def copy_script(self, output_dir: Path) -> None:
print(f"Exception while copying views and makefile: {ex}")
sys.exit(1)

def get_kokkos_paths(self) -> Tuple[Path, Path]:
def get_kokkos_paths(self, space: ExecutionSpace, compiler: str) -> Tuple[Path, Path, Path]:
"""
Get the paths of the Kokkos install lib and include
directories. If the environment variable is set, use that
Kokkos install. If not, fall back to installed pykokkos-base
package.
Kokkos install. If not, fall back to the installed
pykokkos-base package.

:returns: a tuple of paths to the Kokkos lib/ and include/
directories respectively
:param space: the execution space to compile for
:param compiler: what compiler to use
:returns: a tuple of paths to the Kokkos lib/, include/,
and compiler to be used
"""

lib_path: Path
Expand All @@ -139,20 +148,60 @@ def get_kokkos_paths(self) -> Tuple[Path, Path]:

return lib_path, include_path

from pykokkos.bindings import kokkos
install_path = Path(kokkos.__path__[0]).parent
is_cpu: bool = is_host_execution_space(space)
kokkos_lib: ModuleType = km.get_kokkos_module(is_cpu)
install_path = Path(kokkos_lib.__path__[0])
lib_parent_path: Path
if km.is_multi_gpu_enabled():
lib_parent_path = install_path
else:
lib_parent_path = install_path.parent

if (install_path / "lib").is_dir():
lib_path = install_path / "lib"
elif (install_path / "lib64").is_dir():
lib_path = install_path / "lib64"
if (lib_parent_path / "lib").is_dir():
lib_path = lib_parent_path / "lib"
elif (lib_parent_path / "lib64").is_dir():
lib_path = lib_parent_path / "lib64"
else:
raise RuntimeError("lib/ or lib64/ directories not found in installed pykokkos-base package."
f" Try setting {self.lib_path_env} instead.")

include_path = lib_path.parent / "include/kokkos"
include_path = install_path.parent / "include/kokkos"

compiler_path: Path
if compiler != "nvcc":
compiler_path = Path("g++")
else:
compiler_path = install_path.parent / "bin/nvcc_wrapper"

return lib_path, include_path, compiler_path

def get_kokkos_lib_suffix(self, space: ExecutionSpace) -> str:
    """
    Get the suffix of the libkokkoscore and libkokkoscontainers
    libraries corresponding to the enabled device

    :param space: the execution space to compile for
    :returns: the suffix as a string; empty for host execution
        spaces and whenever multi gpu support is not enabled
    """

    # Only the per-device builds used with multi gpu support carry a
    # "_<device id>" suffix in their library names
    if is_host_execution_space(space) or not km.is_multi_gpu_enabled():
        return ""

    return f"_{km.get_device_id()}"

def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: bool, compiler: str) -> None:
"""
Expand All @@ -176,8 +225,10 @@ def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: boo
precision: str = km.get_default_precision().__name__.split(".")[-1]
lib_path: Path
include_path: Path
lib_path, include_path = self.get_kokkos_paths()
compiler_path: Path
lib_path, include_path, compiler_path = self.get_kokkos_paths(space, compiler)
compute_capability: str = self.get_cuda_compute_capability(compiler)
lib_suffix: str = self.get_kokkos_lib_suffix(space)

command: List[str] = [f"./{self.script}",
compiler, # What compiler to use
Expand All @@ -188,7 +239,9 @@ def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: boo
precision, # Default real precision
str(lib_path), # Path to Kokkos install lib/ directory
str(include_path), # Path to Kokkos install include/ directory
compute_capability] # Device compute capability
compute_capability, # Device compute capability
lib_suffix, # The libkokkos* suffix identifying the gpu
str(compiler_path)] # The path to the compiler to use
compile_result = subprocess.run(command, cwd=output_dir, capture_output=True, check=False)

if compile_result.returncode != 0:
Expand All @@ -207,6 +260,49 @@ def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: boo
print(f"patchelf failed")
sys.exit(1)

def copy_multi_gpu_kernel(self, output_dir: Path) -> None:
    """
    Copy the kernel .so file once for each device and run patchelf
    so that every copy points to the Kokkos libraries of its device

    Any failure while copying or while running patchelf is printed
    and terminates the process with exit status 1.

    :param output_dir: the base directory
    """

    def run_patchelf(arguments: List[str]) -> str:
        # Run patchelf inside output_dir. On failure print its stderr and
        # exit, otherwise hand back its decoded stdout.
        result = subprocess.run(["patchelf", *arguments], cwd=output_dir, capture_output=True, check=False)
        if result.returncode != 0:
            print(result.stderr.decode("utf-8"))
            print("patchelf failed")
            sys.exit(1)

        return result.stdout.decode("utf-8")

    original_module: Path = output_dir / self.module_file
    gpu_modules = zip(self.gpu_module_files, km.get_kokkos_gpu_modules())

    # The position in the list is used as the device id of the copy
    for device_id, (kernel_filename, kokkos_gpu_module) in enumerate(gpu_modules):
        kernel_path: Path = output_dir / kernel_filename

        try:
            shutil.copy(original_module, kernel_path)
        except Exception as ex:
            print(f"Exception while copying kernel: {ex}")
            sys.exit(1)

        # Prefer lib/ and only fall back to lib64/ when lib/ is missing
        module_root: Path = Path(kokkos_gpu_module.__path__[0])
        lib_path: Path = module_root / "lib"
        if not lib_path.is_dir() and (module_root / "lib64").is_dir():
            lib_path = module_root / "lib64"

        run_patchelf(["--set-rpath", str(lib_path), kernel_filename])

        # Now replace the needed libkokkos* libraries with the correct version
        needed_libraries: str = run_patchelf(["--print-needed", kernel_filename])

        for line in needed_libraries.splitlines():
            library: str = line.strip()
            if "libkokkoscore" not in library and "libkokkoscontainers" not in library:
                continue

            # Entries are expected to be of the form f"libkokkoscore_{id}.so.3.4"
            stem, _, remainder = library.partition("_")
            current_id, dot, extension = remainder.partition(".")
            if not current_id.isdigit():
                # No device suffix in this entry, so there is nothing to retarget
                continue

            to_add: str = f"{stem}_{device_id}{dot}{extension}"
            if to_add != library:
                run_patchelf(["--replace-needed", library, to_add, kernel_filename])

def get_cuda_compute_capability(self, compiler: str) -> str:
"""
Get the compute capability of an Nvidia GPU
Expand Down
8 changes: 7 additions & 1 deletion pykokkos/core/module_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
import sysconfig
import time
from typing import Callable, Optional, Union
from typing import Callable, List, Optional, Union

from pykokkos.interface import ExecutionSpace
import pykokkos.kokkos_manager as km
Expand Down Expand Up @@ -105,9 +105,15 @@ def __init__(

self.main: Path = self.get_main_path()
self.output_dir: Optional[Path] = self.get_output_dir(self.main, self.metadata, space)
self.gpu_module_files: List[str] = []
if km.is_multi_gpu_enabled():
self.gpu_module_files = [f"kernel{device_id}{suffix}" for device_id in range(km.get_num_gpus())]

if self.output_dir is not None:
self.path: str = os.path.join(self.output_dir, self.module_file)
if km.is_multi_gpu_enabled():
self.gpu_module_paths: str = [os.path.join(self.output_dir, module_file) for module_file in self.gpu_module_files]

self.name: str = self.path.replace("/", "_")
self.name: str = self.name.replace("-", "_")
self.name: str = self.name.replace(".", "_")
Expand Down
Loading