Commit 69a532c

mfbalin and yaox12 authored
[Feature] Gpu cache for node and edge data (#4341)
Co-authored-by: xiny <[email protected]>
1 parent 7ec78bb commit 69a532c

22 files changed: +4686 -62 lines changed

.lintrunner.toml (+1)

@@ -39,6 +39,7 @@ include_patterns = [
     '**/*.cu',
 ]
 exclude_patterns = [
+    'third_party/**',
 ]
 init_command = [
     'python3',

CMakeLists.txt (+15)

@@ -227,6 +227,21 @@ if((NOT MSVC) AND (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin"))
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--exclude-libs,ALL")
 endif()

+# Compile gpu_cache
+if(USE_CUDA)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GPU_CACHE")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU_CACHE")
+  # Manually build gpu_cache because CMake always builds it as shared
+  file(GLOB gpu_cache_src
+    third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu
+  )
+  cuda_add_library(gpu_cache STATIC ${gpu_cache_src})
+  target_include_directories(gpu_cache PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  target_include_directories(dgl PRIVATE "third_party/HugeCTR/gpu_cache/include")
+  list(APPEND DGL_LINKER_LIBS gpu_cache)
+  message(STATUS "Build with HugeCTR GPU embedding cache.")
+endif(USE_CUDA)
+
 # support PARALLEL_ALGORITHMS
 if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
   add_definitions(-DPARALLEL_ALGORITHMS)

cmake/modules/CUDA.cmake (+2 -62)

@@ -202,59 +202,6 @@ function(dgl_select_nvcc_arch_flags out_variable)
   set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE)
 endfunction()

-################################################################################################
-# Short command for cuda compilation
-# Usage:
-#   dgl_cuda_compile(<objlist_variable> <cuda_files>)
-macro(dgl_cuda_compile objlist_variable)
-  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-    set(${var}_backup_in_cuda_compile_ "${${var}}")
-
-    # we remove /EHa as it generates warnings under windows
-    string(REPLACE "/EHa" "" ${var} "${${var}}")
-
-  endforeach()
-  if(UNIX OR APPLE)
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC --std=c++14)
-  endif()
-
-  if(APPLE)
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
-  endif()
-
-  set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G")
-
-  if(MSVC)
-    # disable noisy warnings:
-    # 4819: The file contains a character that cannot be represented in the current code page (number).
-    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-  endif()
-
-  # If the build system is a container, make sure the nvcc intermediate files
-  # go into the build output area rather than in /tmp, which may run out of space
-  if(IS_CONTAINER_BUILD)
-    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
-    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
-  endif()
-
-  cuda_compile(cuda_objcs ${ARGN})
-
-  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-    set(${var} "${${var}_backup_in_cuda_compile_}")
-    unset(${var}_backup_in_cuda_compile_)
-  endforeach()
-
-  set(${objlist_variable} ${cuda_objcs})
-endmacro()
-
 ################################################################################################
 # Config cuda compilation.
 # Usage:
@@ -289,7 +236,7 @@ macro(dgl_config_cuda out_variable)
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)

   # 0. Add host flags
-  message(STATUS "${CMAKE_CXX_FLAGS}")
+  message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
   string(REGEX REPLACE "[ \t\n\r]" "," CXX_HOST_FLAGS "${CMAKE_CXX_FLAGS}")
   if(MSVC AND NOT USE_MSVC_MT)
     string(CONCAT CXX_HOST_FLAGS ${CXX_HOST_FLAGS} ",/MD")
@@ -303,14 +250,7 @@ macro(dgl_config_cuda out_variable)
   # 2. flags in third_party/moderngpu
   list(APPEND CUDA_NVCC_FLAGS "--expt-extended-lambda;-Wno-deprecated-declarations")

-
-  # 3. CUDA 11 requires c++14 by default
-  include(CheckCXXCompilerFlag)
-  check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
-  string(REPLACE "-std=c++11" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-  list(APPEND CUDA_NVCC_FLAGS "-std=c++14")
-
-  message(STATUS "CUDA flags: ${CUDA_NVCC_FLAGS}")
+  message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")

   list(APPEND DGL_LINKER_LIBS
     ${CUDA_CUDART_LIBRARY}

python/dgl/cuda/__init__.py (+2)

@@ -1,5 +1,7 @@
 """ CUDA wrappers """
 from .. import backend as F

+from .gpu_cache import GPUCache
+
 if F.get_preferred_backend() == "pytorch":
     from . import nccl

python/dgl/cuda/gpu_cache.py (+86, new file)

@@ -0,0 +1,86 @@
+"""API wrapping HugeCTR gpu_cache."""
+# Copyright (c) 2022, NVIDIA Corporation
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# @file gpu_cache.py
+# @brief API for managing a GPU Cache
+
+from .. import backend as F
+from .._ffi.function import _init_api
+
+
+class GPUCache(object):
+    """High-level wrapper for GPU embedding cache"""
+
+    def __init__(self, num_items, num_feats, idtype=F.int64):
+        assert idtype in [F.int32, F.int64]
+        self._cache = _CAPI_DGLGpuCacheCreate(
+            num_items, num_feats, 32 if idtype == F.int32 else 64
+        )
+        self.idtype = idtype
+        self.total_miss = 0
+        self.total_queries = 0
+
+    def query(self, keys):
+        """Queries the GPU cache.
+
+        Parameters
+        ----------
+        keys : Tensor
+            The keys to query the GPU cache with.
+
+        Returns
+        -------
+        tuple(Tensor, Tensor, Tensor)
+            A tuple containing (values, missing_indices, missing_keys) where
+            values[missing_indices] corresponds to cache misses that should be
+            filled by querying another source with missing_keys.
+        """
+        self.total_queries += keys.shape[0]
+        keys = F.astype(keys, self.idtype)
+        values, missing_index, missing_keys = _CAPI_DGLGpuCacheQuery(
+            self._cache, F.to_dgl_nd(keys)
+        )
+        self.total_miss += missing_keys.shape[0]
+        return (
+            F.from_dgl_nd(values),
+            F.from_dgl_nd(missing_index),
+            F.from_dgl_nd(missing_keys),
+        )
+
+    def replace(self, keys, values):
+        """Inserts key-value pairs into the GPU cache using the Least-Recently
+        Used (LRU) algorithm to remove old key-value pairs if it is full.
+
+        Parameters
+        ----------
+        keys : Tensor
+            The keys to insert into the GPU cache.
+        values : Tensor
+            The values to insert into the GPU cache.
+        """
+        keys = F.astype(keys, self.idtype)
+        values = F.astype(values, F.float32)
+        _CAPI_DGLGpuCacheReplace(
+            self._cache, F.to_dgl_nd(keys), F.to_dgl_nd(values)
+        )
+
+    @property
+    def miss_rate(self):
+        """Returns the cache miss rate since creation."""
+        return self.total_miss / self.total_queries
+
+
+_init_api("dgl.cuda", __name__)
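
Not part of the commit: a minimal usage sketch of the new Python API under the PyTorch backend, following the query/replace cycle described in the docstrings above. Here cpu_features is a hypothetical host-side feature store that backs the cache.

    import torch
    import dgl

    # Hypothetical host-side feature store: 1M rows of 128-dim features.
    cpu_features = torch.rand(1000000, 128)

    # Cache room for roughly 100k rows of 128-dim float32 features (int64 keys).
    cache = dgl.cuda.GPUCache(100000, 128)

    keys = torch.randint(0, 1000000, (4096,), device="cuda")
    values, missing_index, missing_keys = cache.query(keys)

    # Fill the cache misses from the host store and insert them into the cache.
    missing_values = cpu_features[missing_keys.cpu()].to("cuda")
    values[missing_index] = missing_values
    cache.replace(missing_keys, missing_values)

    print("miss rate so far:", cache.miss_rate)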

src/runtime/cuda/gpu_cache.cu (+189, new file)

@@ -0,0 +1,189 @@
+/*!
+ * Copyright (c) 2022 by Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * \file gpu_cache.cu
+ * \brief Implementation of wrapper HugeCTR gpu_cache routines.
+ */
+
+#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_
+#define DGL_RUNTIME_CUDA_GPU_CACHE_H_
+
+#include <cuda_runtime.h>
+#include <dgl/array.h>
+#include <dgl/aten/array_ops.h>
+#include <dgl/packed_func_ext.h>
+#include <dgl/runtime/container.h>
+#include <dgl/runtime/device_api.h>
+#include <dgl/runtime/object.h>
+#include <dgl/runtime/registry.h>
+
+#include <nv_gpu_cache.hpp>
+
+#include "../../runtime/cuda/cuda_common.h"
+
+namespace dgl {
+namespace runtime {
+namespace cuda {
+
+template <typename key_t>
+class GpuCache : public runtime::Object {
+  constexpr static int set_associativity = 2;
+  constexpr static int WARP_SIZE = 32;
+  constexpr static int bucket_size = WARP_SIZE * set_associativity;
+  using gpu_cache_t = gpu_cache::gpu_cache<
+      key_t, uint64_t, std::numeric_limits<key_t>::max(), set_associativity,
+      WARP_SIZE>;
+
+ public:
+  static constexpr const char *_type_key =
+      sizeof(key_t) == 4 ? "cuda.GpuCache32" : "cuda.GpuCache64";
+  DGL_DECLARE_OBJECT_TYPE_INFO(GpuCache, Object);
+
+  GpuCache(size_t num_items, size_t num_feats)
+      : num_feats(num_feats),
+        cache(std::make_unique<gpu_cache_t>(
+            (num_items + bucket_size - 1) / bucket_size, num_feats)) {
+    CUDA_CALL(cudaGetDevice(&cuda_device));
+  }
+
+  std::tuple<NDArray, IdArray, IdArray> Query(IdArray keys) {
+    const auto &ctx = keys->ctx;
+    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    auto device = dgl::runtime::DeviceAPI::Get(ctx);
+    CHECK_EQ(ctx.device_type, kDGLCUDA)
+        << "The keys should be on a CUDA device";
+    CHECK_EQ(ctx.device_id, cuda_device)
+        << "The keys should be on the correct CUDA device";
+    CHECK_EQ(keys->ndim, 1)
+        << "The tensor of requested indices must be of dimension one.";
+    NDArray values = NDArray::Empty(
+        {keys->shape[0], (int64_t)num_feats}, DGLDataType{kDGLFloat, 32, 1},
+        ctx);
+    IdArray missing_index = aten::NewIdArray(keys->shape[0], ctx, 64);
+    IdArray missing_keys =
+        aten::NewIdArray(keys->shape[0], ctx, sizeof(key_t) * 8);
+    size_t *missing_len =
+        static_cast<size_t *>(device->AllocWorkspace(ctx, sizeof(size_t)));
+    cache->Query(
+        static_cast<const key_t *>(keys->data), keys->shape[0],
+        static_cast<float *>(values->data),
+        static_cast<uint64_t *>(missing_index->data),
+        static_cast<key_t *>(missing_keys->data), missing_len, stream);
+    size_t missing_len_host;
+    device->CopyDataFromTo(
+        missing_len, 0, &missing_len_host, 0, sizeof(missing_len_host), ctx,
+        DGLContext{kDGLCPU, 0}, keys->dtype);
+    device->FreeWorkspace(ctx, missing_len);
+    missing_index = missing_index.CreateView(
+        {(int64_t)missing_len_host}, missing_index->dtype);
+    missing_keys =
+        missing_keys.CreateView({(int64_t)missing_len_host}, keys->dtype);
+    return std::make_tuple(values, missing_index, missing_keys);
+  }
+
+  void Replace(IdArray keys, NDArray values) {
+    cudaStream_t stream = dgl::runtime::getCurrentCUDAStream();
+    CHECK_EQ(keys->ctx.device_type, kDGLCUDA)
+        << "The keys should be on a CUDA device";
+    CHECK_EQ(keys->ctx.device_id, cuda_device)
+        << "The keys should be on the correct CUDA device";
+    CHECK_EQ(values->ctx.device_type, kDGLCUDA)
+        << "The values should be on a CUDA device";
+    CHECK_EQ(values->ctx.device_id, cuda_device)
+        << "The values should be on the correct CUDA device";
+    CHECK_EQ(keys->shape[0], values->shape[0])
+        << "First dimensions of keys and values must match";
+    CHECK_EQ(values->shape[1], num_feats) << "Embedding dimension must match";
+    cache->Replace(
+        static_cast<const key_t *>(keys->data), keys->shape[0],
+        static_cast<const float *>(values->data), stream);
+  }
+
+ private:
+  size_t num_feats;
+  std::unique_ptr<gpu_cache_t> cache;
+  int cuda_device;
+};
+
+static_assert(sizeof(unsigned int) == 4);
+DGL_DEFINE_OBJECT_REF(GpuCacheRef32, GpuCache<unsigned int>);
+// The cu file in HugeCTR gpu cache uses unsigned int and long long.
+// Changing to int64_t results in a mismatch of template arguments.
+static_assert(sizeof(long long) == 8);  // NOLINT
+DGL_DEFINE_OBJECT_REF(GpuCacheRef64, GpuCache<long long>);  // NOLINT
+
+/* CAPI **********************************************************************/
+
+using namespace dgl::runtime;
+
+DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheCreate")
+    .set_body([](DGLArgs args, DGLRetValue *rv) {
+      const size_t num_items = args[0];
+      const size_t num_feats = args[1];
+      const int num_bits = args[2];
+
+      if (num_bits == 32)
+        *rv = GpuCacheRef32(
+            std::make_shared<GpuCache<unsigned int>>(num_items, num_feats));
+      else
+        *rv = GpuCacheRef64(std::make_shared<GpuCache<long long>>(  // NOLINT
+            num_items, num_feats));
+    });
+
+DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheQuery")
+    .set_body([](DGLArgs args, DGLRetValue *rv) {
+      IdArray keys = args[1];
+
+      List<ObjectRef> ret;
+      if (keys->dtype.bits == 32) {
+        GpuCacheRef32 cache = args[0];
+        auto result = cache->Query(keys);
+
+        ret.push_back(Value(MakeValue(std::get<0>(result))));
+        ret.push_back(Value(MakeValue(std::get<1>(result))));
+        ret.push_back(Value(MakeValue(std::get<2>(result))));
+      } else {
+        GpuCacheRef64 cache = args[0];
+        auto result = cache->Query(keys);
+
+        ret.push_back(Value(MakeValue(std::get<0>(result))));
+        ret.push_back(Value(MakeValue(std::get<1>(result))));
+        ret.push_back(Value(MakeValue(std::get<2>(result))));
+      }
+
+      *rv = ret;
+    });
+
+DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheReplace")
+    .set_body([](DGLArgs args, DGLRetValue *rv) {
+      IdArray keys = args[1];
+      NDArray values = args[2];
+
+      if (keys->dtype.bits == 32) {
+        GpuCacheRef32 cache = args[0];
+        cache->Replace(keys, values);
+      } else {
+        GpuCacheRef64 cache = args[0];
+        cache->Replace(keys, values);
+      }
+
+      *rv = List<ObjectRef>{};
+    });
+
+}  // namespace cuda
+}  // namespace runtime
+}  // namespace dgl
+
+#endif
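
The C API dispatches on key width: a cache created with 32-bit ids is backed by GpuCache<unsigned int> via GpuCacheRef32, otherwise by GpuCache<long long> via GpuCacheRef64, and Query trims missing_index and missing_keys to the reported miss count. A small sketch exercising the 32-bit path from Python (not from the commit; PyTorch backend assumed):

    import torch
    import dgl

    # 32-bit keys route through GpuCacheRef32 / GpuCache<unsigned int>.
    cache32 = dgl.cuda.GPUCache(1024, 16, idtype=torch.int32)

    keys = torch.arange(256, dtype=torch.int32, device="cuda")
    feats = torch.rand(256, 16, device="cuda")
    cache32.replace(keys, feats)  # warm the cache

    values, missing_index, missing_keys = cache32.query(keys)
    # missing_index / missing_keys are views trimmed to the number of misses.
    assert missing_keys.shape[0] <= keys.shape[0]
    assert values.shape == (256, 16)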
