Commit 69a532c

mfbalin and yaox12 authored
[Feature] Gpu cache for node and edge data (#4341)
Co-authored-by: xiny <[email protected]>
1 parent 7ec78bb commit 69a532c

22 files changed: +4686 -62 lines changed

.lintrunner.toml (+1)

@@ -39,6 +39,7 @@ include_patterns = [
     '**/*.cu',
 ]
 exclude_patterns = [
+    'third_party/**',
 ]
 init_command = [
     'python3',

CMakeLists.txt (+15)

@@ -227,6 +227,21 @@ if((NOT MSVC) AND (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin"))
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--exclude-libs,ALL")
 endif()

+# Compile gpu_cache
+if(USE_CUDA)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GPU_CACHE")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU_CACHE")
+  # Manually build gpu_cache because CMake always builds it as shared
+  file(GLOB gpu_cache_src
+    third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu
+  )
+  cuda_add_library(gpu_cache STATIC ${gpu_cache_src})
+  target_include_directories(gpu_cache PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  target_include_directories(dgl PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  list(APPEND DGL_LINKER_LIBS gpu_cache)
+  message(STATUS "Build with HugeCTR GPU embedding cache.")
+endif(USE_CUDA)
+
 # support PARALLEL_ALGORITHMS
 if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
   add_definitions(-DPARALLEL_ALGORITHMS)

cmake/modules/CUDA.cmake (+2 -62)

@@ -202,59 +202,6 @@ function(dgl_select_nvcc_arch_flags out_variable)
   set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE)
 endfunction()

-################################################################################################
-# Short command for cuda compilation
-# Usage:
-#   dgl_cuda_compile(<objlist_variable> <cuda_files>)
-macro(dgl_cuda_compile objlist_variable)
-  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-    set(${var}_backup_in_cuda_compile_ "${${var}}")
-
-    # we remove /EHa as it generates warnings under windows
-    string(REPLACE "/EHa" "" ${var} "${${var}}")
-
-  endforeach()
-  if(UNIX OR APPLE)
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC --std=c++14)
-  endif()
-
-  if(APPLE)
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
-  endif()
-
-  set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G")
-
-  if(MSVC)
-    # disable noisy warnings:
-    # 4819: The file contains a character that cannot be represented in the current code page (number).
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-  endif()
-
-  # If the build system is a container, make sure the nvcc intermediate files
-  # go into the build output area rather than in /tmp, which may run out of space
-  if(IS_CONTAINER_BUILD)
-    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
-    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
-  endif()
-
-  cuda_compile(cuda_objcs ${ARGN})
-
-  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-    set(${var} "${${var}_backup_in_cuda_compile_}")
-    unset(${var}_backup_in_cuda_compile_)
-  endforeach()
-
-  set(${objlist_variable} ${cuda_objcs})
-endmacro()
-
 ################################################################################################
 # Config cuda compilation.
 # Usage:
@@ -289,7 +236,7 @@ macro(dgl_config_cuda out_variable)
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)

   # 0. Add host flags
-  message(STATUS "${CMAKE_CXX_FLAGS}")
+  message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
   string(REGEX REPLACE "[ \t\n\r]" "," CXX_HOST_FLAGS "${CMAKE_CXX_FLAGS}")
   if(MSVC AND NOT USE_MSVC_MT)
     string(CONCAT CXX_HOST_FLAGS ${CXX_HOST_FLAGS} ",/MD")
@@ -303,14 +250,7 @@ macro(dgl_config_cuda out_variable)
   # 2. flags in third_party/moderngpu
   list(APPEND CUDA_NVCC_FLAGS "--expt-extended-lambda;-Wno-deprecated-declarations")

-
-  # 3. CUDA 11 requires c++14 by default
-  include(CheckCXXCompilerFlag)
-  check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
-  string(REPLACE "-std=c++11" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-  list(APPEND CUDA_NVCC_FLAGS "-std=c++14")
-
-  message(STATUS "CUDA flags: ${CUDA_NVCC_FLAGS}")
+  message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")

   list(APPEND DGL_LINKER_LIBS
     ${CUDA_CUDART_LIBRARY}

python/dgl/cuda/__init__.py (+2)

@@ -1,5 +1,7 @@
 """ CUDA wrappers """
 from .. import backend as F

+from .gpu_cache import GPUCache
+
 if F.get_preferred_backend() == "pytorch":
     from . import nccl

python/dgl/cuda/gpu_cache.py (+86, new file)

@@ -0,0 +1,86 @@
+"""API wrapping HugeCTR gpu_cache."""
+# Copyright (c) 2022, NVIDIA Corporation
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# @file gpu_cache.py
+# @brief API for managing a GPU Cache
+
+from .. import backend as F
+from .._ffi.function import _init_api
+
+
+class GPUCache(object):
+    """High-level wrapper for GPU embedding cache"""
+
+    def __init__(self, num_items, num_feats, idtype=F.int64):
+        assert idtype in [F.int32, F.int64]
+        self._cache = _CAPI_DGLGpuCacheCreate(
+            num_items, num_feats, 32 if idtype == F.int32 else 64
+        )
+        self.idtype = idtype
+        self.total_miss = 0
+        self.total_queries = 0
+
+    def query(self, keys):
+        """Queries the GPU cache.
+
+        Parameters
+        ----------
+        keys : Tensor
+            The keys to query the GPU cache with.
+
+        Returns
+        -------
+        tuple(Tensor, Tensor, Tensor)
+            A tuple containing (values, missing_indices, missing_keys) where
+            values[missing_indices] corresponds to cache misses that should be
+            filled by querying another source with missing_keys.
+        """
+        self.total_queries += keys.shape[0]
+        keys = F.astype(keys, self.idtype)
+        values, missing_index, missing_keys = _CAPI_DGLGpuCacheQuery(
+            self._cache, F.to_dgl_nd(keys)
+        )
+        self.total_miss += missing_keys.shape[0]
+        return (
+            F.from_dgl_nd(values),
+            F.from_dgl_nd(missing_index),
+            F.from_dgl_nd(missing_keys),
+        )
+
+    def replace(self, keys, values):
+        """Inserts key-value pairs into the GPU cache using the Least-Recently
+        Used (LRU) algorithm to remove old key-value pairs if it is full.
+
+        Parameters
+        ----------
+        keys : Tensor
+            The keys to insert into the GPU cache.
+        values : Tensor
+            The values to insert into the GPU cache.
+        """
+        keys = F.astype(keys, self.idtype)
+        values = F.astype(values, F.float32)
+        _CAPI_DGLGpuCacheReplace(
+            self._cache, F.to_dgl_nd(keys), F.to_dgl_nd(values)
+        )
+
+    @property
+    def miss_rate(self):
+        """Returns the cache miss rate since creation."""
+        return self.total_miss / self.total_queries
+
+
+_init_api("dgl.cuda", __name__)
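
Not part of the commit: a minimal usage sketch of the new Python API under the PyTorch backend, following the query/replace cycle described in the docstrings above. Here cpu_features is a hypothetical host-side feature store that backs the cache.

    import torch
    import dgl

    # Hypothetical host-side feature store: 1M rows of 128-dim features.
    cpu_features = torch.rand(1000000, 128)

    # Cache room for roughly 100k rows of 128-dim float32 features (int64 keys).
    cache = dgl.cuda.GPUCache(100000, 128)

    keys = torch.randint(0, 1000000, (4096,), device="cuda")
    values, missing_index, missing_keys = cache.query(keys)

    # Fill the cache misses from the host store and insert them into the cache.
    missing_values = cpu_features[missing_keys.cpu()].to("cuda")
    values[missing_index] = missing_values
    cache.replace(missing_keys, missing_values)

    print("miss rate so far:", cache.miss_rate)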

src/runtime/cuda/gpu_cache.cu (+189, new file)

@@ -0,0 +1,189 @@
+/*!
+ * Copyright (c) 2022 by Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * \file gpu_cache.cu
+ * \brief Implementation of wrapper HugeCTR gpu_cache routines.
+ */
+
+#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
+#define DGL_RUNTIME_CUDA_GPU_CACHE_H_
+
+#include <cuda_runtime.h>
+#include <dgl/array.h>
+#include <dgl/aten/array_ops.h>
+#include <dgl/packed_func_ext.h>
+#include <dgl/runtime/container.h>
+#include <dgl/runtime/device_api.h>
+#include <dgl/runtime/object.h>
+#include <dgl/runtime/registry.h>
+
+#include <nv_gpu_cache.hpp>
+
+#include "../../runtime/cuda/cuda_common.h"
+
+namespace dgl {
+namespace runtime {
+namespace cuda {
+
+template <typename key_t>
+class GpuCache : public runtime::Object {
+  constexpr static int set_associativity = 2;
+  constexpr static int WARP_SIZE = 32;
+  constexpr static int bucket_size = WARP_SIZE * set_associativity;
+  using gpu_cache_t = gpu_cache::gpu_cache<
+      key_t, uint64_t, std::numeric_limits<key_t>::max(), set_associativity,
+      WARP_SIZE>;
+
+ public:
+  static constexpr const char *_type_key =
+      sizeof(key_t) == 4 ? "cuda.GpuCache32" : "cuda.GpuCache64";
+  DGL_DECLARE_OBJECT_TYPE_INFO(GpuCache, Object);
+
+  GpuCache(size_t num_items, size_t num_feats)
+      : num_feats(num_feats),
+        cache(std::make_unique<gpu_cache_t>(
+            (num_items + bucket_size - 1) / bucket_size, num_feats)) {
+    CUDA_CALL(cudaGetDevice(&cuda_device));
+  }
+
+  std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
+    const auto &ctx = keys->ctx;
+    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    auto device = dgl::runtime::DeviceAPI::Get(ctx);
+    CHECK_EQ(ctx.device_type, kDGLCUDA)
+        << "The keys should be on a CUDA device";
+    CHECK_EQ(ctx.device_id, cuda_device)
+        << "The keys should be on the correct CUDA device";
+    CHECK_EQ(keys->ndim, 1)
+        << "The tensor of requested indices must be of dimension one.";
+    NDArray values = NDArray::Empty(
+        {keys->shape[0], (int64_t)num_feats}, DGLDataType{kDGLFloat, 32, 1},
+        ctx);
+    IdArray missing_index = aten::NewIdArray(keys->shape[0], ctx, 64);
+    IdArray missing_keys =
+        aten::NewIdArray(keys->shape[0], ctx, sizeof(key_t) * 8);
+    size_t *missing_len =
+        static_cast<size_t *>(device->AllocWorkspace(ctx, sizeof(size_t)));
+    cache->Query(
+        static_cast<const key_t *>(keys->data), keys->shape[0],
+        static_cast<float *>(values->data),
+        static_cast<uint64_t *>(missing_index->data),
+        static_cast<key_t *>(missing_keys->data), missing_len, stream);
+    size_t missing_len_host;
+    device->CopyDataFromTo(
+        missing_len, 0, &missing_len_host, 0, sizeof(missing_len_host), ctx,
+        DGLContext{kDGLCPU, 0}, keys->dtype);
+    device->FreeWorkspace(ctx, missing_len);
+    missing_index = missing_index.CreateView(
+        {(int64_t)missing_len_host}, missing_index->dtype);
+    missing_keys =
+        missing_keys.CreateView({(int64_t)missing_len_host}, keys->dtype);
+    return std::make_tuple(values, missing_index, missing_keys);
+  }
+
+  void Replace(IdArray keys, NDArray values) {
+    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
+        << "The keys should be on a CUDA device";
+    CHECK_EQ(keys->ctx.device_id, cuda_device)
+        << "The keys should be on the correct CUDA device";
+    CHECK_EQ(values->ctx.device_type, kDGLCUDA)
+        << "The values should be on a CUDA device";
+    CHECK_EQ(values->ctx.device_id, cuda_device)
+        << "The values should be on the correct CUDA device";
+    CHECK_EQ(keys->shape[0], values->shape[0])
+        << "First dimensions of keys and values must match";
+    CHECK_EQ(values->shape[1], num_feats) << "Embedding dimension must match";
+    cache->Replace(
+        static_cast<const key_t *>(keys->data), keys->shape[0],
+        static_cast<const float *>(values->data), stream);
+  }
+
+ private:
+  size_t num_feats;
+  std::unique_ptr<gpu_cache_t> cache;
+  int cuda_device;
+};
+
+static_assert(sizeof(unsigned int) == 4);
+DGL_DEFINE_OBJECT_REF(GpuCacheRef32, GpuCache<unsigned int>);
+// The cu file in HugeCTR gpu cache uses unsigned int and long long.
+// Changing to int64_t results in a mismatch of template arguments.
+static_assert(sizeof(long long) == 8);  // NOLINT
+DGL_DEFINE_OBJECT_REF(GpuCacheRef64, GpuCache<long long>);  // NOLINT
+
+/* CAPI **********************************************************************/
+
+using namespace dgl::runtime;
+
+DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheCreate")
+    .set_body([](DGLArgs args, DGLRetValue *rv) {
+      const size_t num_items = args[0];
+      const size_t num_feats = args[1];
+      const int num_bits = args[2];
+
+      if (num_bits == 32)
+        *rv = GpuCacheRef32(
+            std::make_shared<GpuCache<unsigned int>>(num_items, num_feats));
+      else
+        *rv = GpuCacheRef64(std::make_shared<GpuCache<long long>>(  // NOLINT
+            num_items, num_feats));
+    });
+
+DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheQuery")
+    .set_body([](DGLArgs args, DGLRetValue *rv) {
+      IdArray keys = args[1];
+
+      List<ObjectRef> ret;
+      if (keys->dtype.bits == 32) {
+        GpuCacheRef32 cache = args[0];
+        auto result = cache->Query(keys);
+
+        ret.push_back(Value(MakeValue(std::get<0>(result))));
+        ret.push_back(Value(MakeValue(std::get<1>(result))));
+        ret.push_back(Value(MakeValue(std::get<2>(result))));
+      } else {
+        GpuCacheRef64 cache = args[0];
+        auto result = cache->Query(keys);
+
+        ret.push_back(Value(MakeValue(std::get<0>(result))));
+        ret.push_back(Value(MakeValue(std::get<1>(result))));
+        ret.push_back(Value(MakeValue(std::get<2>(result))));
+      }
+
+      *rv = ret;
+    });
+
+DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheReplace")
+    .set_body([](DGLArgs args, DGLRetValue *rv) {
+      IdArray keys = args[1];
+      NDArray values = args[2];
+
+      if (keys->dtype.bits == 32) {
+        GpuCacheRef32 cache = args[0];
+        cache->Replace(keys, values);
+      } else {
+        GpuCacheRef64 cache = args[0];
+        cache->Replace(keys, values);
+      }
+
+      *rv = List<ObjectRef>{};
+    });
+
+}  // namespace cuda
+}  // namespace runtime
+}  // namespace dgl
+
+#endif
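
The C API dispatches on key width: a cache created with 32-bit ids is backed by GpuCache<unsigned int> via GpuCacheRef32, otherwise by GpuCache<long long> via GpuCacheRef64, and Query trims missing_index and missing_keys to the reported miss count. A small sketch exercising the 32-bit path from Python (not from the commit; PyTorch backend assumed):

    import torch
    import dgl

    # 32-bit keys route through GpuCacheRef32 / GpuCache<unsigned int>.
    cache32 = dgl.cuda.GPUCache(1024, 16, idtype=torch.int32)

    keys = torch.arange(256, dtype=torch.int32, device="cuda")
    feats = torch.rand(256, 16, device="cuda")
    cache32.replace(keys, feats)  # warm the cache

    values, missing_index, missing_keys = cache32.query(keys)
    # missing_index / missing_keys are views trimmed to the number of misses.
    assert missing_keys.shape[0] <= keys.shape[0]
    assert values.shape == (256, 16)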
