Skip to content

Commit 52ccbf4

Browse files
Elias Ellison authored and pytorchmergebot committed
Lock thread/block computation (pytorch#73800)
Summary: Pull Request resolved: pytorch#73800. Test Plan: Imported from OSS. Reviewed By: navahgar. Differential Revision: D34647281. Pulled By: eellison. fbshipit-source-id: adbdaf24191c4c1b85e0b62564388f2481002ed2 (cherry picked from commit 6cf3801)
1 parent 4a74285 commit 52ccbf4

File tree

3 files changed

+74
-53
lines changed

3 files changed

+74
-53
lines changed

test/cpp/tensorexpr/test_dynamic_shapes.cpp

+61-50
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include <gtest/gtest.h>
22

3+
#include <ATen/code_template.h>
4+
#include <c10/core/DeviceType.h>
35
#include <test/cpp/tensorexpr/test_base.h>
46
#include <torch/csrc/jit/ir/ir.h>
57
#include <torch/csrc/jit/ir/irparser.h>
@@ -629,59 +631,68 @@ TEST(DynamicShapes, GraphFromModel) {
629631

630632
TEST(DynamicShapes, MultiThreadedExecution) {
631633
#ifdef TORCH_ENABLE_LLVM
632-
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
633-
const auto graph_string = R"IR(
634-
graph(%x : Float(SS(-2), SS(-3), requires_grad=0, device=cpu),
635-
%y : Float(SS(-2), SS(-3), requires_grad=0, device=cpu),
634+
const auto graph_template = R"IR(
635+
graph(%x : Float(SS(-2), SS(-3), requires_grad=0, device=${device}),
636+
%y : Float(SS(-2), SS(-3), requires_grad=0, device=${device}),
636637
%SS_2 : int,
637638
%SS_3 : int):
638-
%3 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::tanh(%x)
639-
%4 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::erf(%3)
640-
%5 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::mul(%4, %y)
639+
%3 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::tanh(%x)
640+
%4 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::erf(%3)
641+
%5 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::mul(%4, %y)
641642
return (%5))IR";
642-
torch::jit::parseIR(graph_string, graph.get());
643-
644-
std::vector<int64_t> symbolic_shape_inputs = {-2, -3};
645-
646-
std::vector<torch::jit::StrideInput> input_desc = {
647-
torch::jit::StrideInput::TENSOR_CONT};
648-
std::unordered_map<
649-
const torch::jit::Value*,
650-
std::vector<torch::jit::StrideInput>>
651-
symbolic_strides;
652-
symbolic_strides[graph->inputs().at(0)] = input_desc;
653-
symbolic_strides[graph->inputs().at(1)] = input_desc;
654-
symbolic_strides[graph->outputs().at(0)] = input_desc;
655-
656-
TensorExprKernel kernel(
657-
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
658-
659-
auto run_kernel = [&](int dim1, int dim2) {
660-
auto a =
661-
at::rand({dim1, dim2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
662-
auto b =
663-
at::rand({dim1, dim2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
664-
665-
auto ref = at::mul(at::erf(at::tanh(a)), b);
666-
667-
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
668-
stack.emplace_back(dim1);
669-
stack.emplace_back(dim2);
670-
kernel.run(stack);
671-
672-
auto o = stack[0].toTensor();
673-
ASSERT_TRUE(at::allclose(o, ref));
674-
};
675-
676-
// Run the kernel in parallel to ensure that the run() method calls in
677-
// TensorExprKernel are not changing any state.
678-
constexpr size_t kNumThreads = 4;
679-
std::vector<std::thread> threads;
680-
for (size_t id = 0; id < kNumThreads; ++id) {
681-
threads.emplace_back(run_kernel, id + 5, id + 20);
682-
}
683-
for (auto& t : threads) {
684-
t.join();
643+
for (bool use_cuda : {false, true}) {
644+
if (!torch::cuda::is_available() && use_cuda) {
645+
continue;
646+
}
647+
auto device = use_cuda ? at::kCUDA : at::kCPU;
648+
at::jit::TemplateEnv env;
649+
env.s("device", use_cuda ? "cuda:0" : "cpu");
650+
const auto graph_string = format(graph_template, env);
651+
std::shared_ptr<Graph> graph = std::make_shared<Graph>();
652+
torch::jit::parseIR(graph_string, graph.get());
653+
654+
std::vector<int64_t> symbolic_shape_inputs = {-2, -3};
655+
656+
std::vector<torch::jit::StrideInput> input_desc = {
657+
torch::jit::StrideInput::TENSOR_CONT};
658+
std::unordered_map<
659+
const torch::jit::Value*,
660+
std::vector<torch::jit::StrideInput>>
661+
symbolic_strides;
662+
symbolic_strides[graph->inputs().at(0)] = input_desc;
663+
symbolic_strides[graph->inputs().at(1)] = input_desc;
664+
symbolic_strides[graph->outputs().at(0)] = input_desc;
665+
666+
TensorExprKernel kernel(
667+
graph, {}, symbolic_shape_inputs, false, symbolic_strides);
668+
669+
auto run_kernel = [&](int dim1, int dim2) {
670+
auto a =
671+
at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat));
672+
auto b =
673+
at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat));
674+
675+
auto ref = at::mul(at::erf(at::tanh(a)), b);
676+
677+
std::vector<IValue> stack = fmap<IValue>(std::vector<at::Tensor>({a, b}));
678+
stack.emplace_back(dim1);
679+
stack.emplace_back(dim2);
680+
kernel.run(stack);
681+
682+
auto o = stack[0].toTensor();
683+
ASSERT_TRUE(at::allclose(o, ref));
684+
};
685+
686+
// Run the kernel in parallel to ensure that the run() method calls in
687+
// TensorExprKernel are not changing any state.
688+
constexpr size_t kNumThreads = 4;
689+
std::vector<std::thread> threads;
690+
for (size_t id = 0; id < kNumThreads; ++id) {
691+
threads.emplace_back(run_kernel, id + 5, id + 20);
692+
}
693+
for (auto& t : threads) {
694+
t.join();
695+
}
685696
}
686697
#endif
687698
}

torch/csrc/jit/tensorexpr/cuda_codegen.cpp

+12-3
Original file line numberDiff line numberDiff line change
@@ -1170,15 +1170,24 @@ void CudaCodeGen::call_raw(const std::vector<void*>& raw_args) {
11701170
gpu_block_extents_v[i] = immediateAs<int64_t>(gpu_block_extents[i]);
11711171
continue;
11721172
}
1173-
gpu_block_extents_v[i] = block_extents_eval_[i].value<int64_t>(extent_args);
1173+
{
1174+
// invocation of block_extents_eval_ isn't thread safe and this function
1175+
// may be invoked by multiple threads
1176+
std::lock_guard<std::mutex> guard(eval_lock_);
1177+
gpu_block_extents_v[i] =
1178+
block_extents_eval_[i].value<int64_t>(extent_args);
1179+
}
11741180
}
11751181
for (size_t i = 0; i < gpu_thread_extents.size(); i++) {
11761182
if (gpu_thread_extents[i]->isConstant()) {
11771183
gpu_thread_extents_v[i] = immediateAs<int64_t>(gpu_thread_extents[i]);
11781184
continue;
11791185
}
1180-
gpu_thread_extents_v[i] =
1181-
thread_extents_eval_[i].value<int64_t>(extent_args);
1186+
{
1187+
std::lock_guard<std::mutex> guard(eval_lock_);
1188+
gpu_thread_extents_v[i] =
1189+
thread_extents_eval_[i].value<int64_t>(extent_args);
1190+
}
11821191
}
11831192

11841193
// Skip launching the kernel if there are no elements to process.

torch/csrc/jit/tensorexpr/cuda_codegen.h

+1
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ class TORCH_CUDA_CU_API CudaCodeGen : public CodeGen {
273273
std::unique_ptr<CudaAnalysis> cuda_analysis_;
274274
std::unique_ptr<GPUMetaVarRewriter> metavar_rewriter_;
275275
std::unordered_set<std::string> taken_func_names;
276+
std::mutex eval_lock_;
276277
CUfunction function_;
277278
bool has_random_ = false;
278279
int thread_block_size_ = -1;

0 commit comments

Comments (0)