
Commit 1855b14

Mikhail Zolotukhin authored and pytorchmergebot committed
[TensorExpr] Delete DimArg class. (pytorch#72390)
Summary: Pull Request resolved: pytorch#72390

This class didn't add much value and only caused more boilerplate code. This change removes the class and updates all of its use sites to take `ExprHandle` instead. A side effect of this change is different names for loop variables, which caused massive mechanical changes in our tests.

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D34030296

Pulled By: ZolotukhinM

fbshipit-source-id: 2ba4e313506a43ab129a10d99e72b638b7d40108
(cherry picked from commit c2ec46a)
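To make the API change concrete, here is a minimal before/after sketch distilled from the call sites below; the names `t`, `a`, `M`, and `N` are illustrative placeholders, not taken from any one file:

// Before: each dimension was a DimArg, pairing an extent with an explicit
// loop-variable name.
Tensor t = Compute(
    "t",
    {{M, "m"}, {N, "n"}},
    [&](const VarHandle& m, const VarHandle& n) { return a.load(m, n); });

// After: dimensions are plain ExprHandles; loop-variable names are generated
// automatically, hence the mechanical renames in the tests.
Tensor t = Compute(
    "t",
    {M, N},
    [&](const VarHandle& m, const VarHandle& n) { return a.load(m, n); });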
1 parent 9123e9b · commit 1855b14

39 files changed: +948 −1204 lines

benchmarks/cpp/tensorexpr/bench_batchnorm.cpp (+4 −8)

@@ -82,10 +82,8 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
   VarHandle eps("eps", kFloat);

   using axis = const VarHandle&;
-  Tensor output = Compute(
-      "output",
-      {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
-      [&](axis n, axis c, axis h, axis w) {
+  Tensor output =
+      Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
         // Compute affine terms.
         auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
         auto weight_v = weight.load(c);
@@ -143,10 +141,8 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
   VarHandle eps("eps", kFloat);

   using axis = const VarHandle&;
-  Tensor output = Compute(
-      "output",
-      {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
-      [&](axis n, axis c, axis h, axis w) {
+  Tensor output =
+      Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) {
         // Compute affine terms.
         auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps);
         auto weight_v = weight.load(c);

benchmarks/cpp/tensorexpr/bench_compile.cpp (+29 −40)

@@ -12,26 +12,21 @@ static void BM_CompileSwish(benchmark::State& state) {
   constexpr int N = 512;
   te::VarHandle n("n", te::kInt);
   te::BufHandle A("A", {N}, te::kFloat);
-  te::Tensor relu =
-      te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return te::Max::make(A.load(i), 0.f, false);
-      });
-  te::Tensor min6 =
-      te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return te::Min::make(relu.load(i), 6.f, false);
-      });
-  te::Tensor plus3 =
-      te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return min6.load(i) + 3.f;
-      });
-  te::Tensor times =
-      te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return A.load(i) * plus3.load(i);
-      });
-  te::Tensor sixth =
-      te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return times.load(i) * 1.f / 6.f;
-      });
+  te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
+    return te::Max::make(A.load(i), 0.f, false);
+  });
+  te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
+    return te::Min::make(relu.load(i), 6.f, false);
+  });
+  te::Tensor plus3 = te::Compute("plus3", {n}, [&](const te::VarHandle& i) {
+    return min6.load(i) + 3.f;
+  });
+  te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
+    return A.load(i) * plus3.load(i);
+  });
+  te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
+    return times.load(i) * 1.f / 6.f;
+  });
   te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
   for (auto tensor : {relu, min6, plus3, times}) {
     nest.computeInline(tensor.buf());
@@ -46,26 +41,20 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
   constexpr int N = 512;
   te::VarHandle n("n", te::kInt);
   te::BufHandle A("A", {N}, te::kFloat);
-  te::Tensor relu =
-      te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return te::Max::make(A.load(i), 0.f, false);
-      });
-  te::Tensor min6 =
-      te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return te::Min::make(relu.load(i), 6.f, false);
-      });
-  te::Tensor plus3 =
-      te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return min6.load(i) + 3.f;
-      });
-  te::Tensor times =
-      te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return A.load(i) * plus3.load(i);
-      });
-  te::Tensor sixth =
-      te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
-        return times.load(i) * 1.f / 6.f;
-      });
+  te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) {
+    return te::Max::make(A.load(i), 0.f, false);
+  });
+  te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) {
+    return te::Min::make(relu.load(i), 6.f, false);
+  });
+  te::Tensor plus3 = te::Compute(
+      "plus3", {n}, [&](const te::VarHandle& i) { return min6.load(i) + 3.f; });
+  te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) {
+    return A.load(i) * plus3.load(i);
+  });
+  te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) {
+    return times.load(i) * 1.f / 6.f;
+  });
   te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
   for (auto tensor : {relu, min6, plus3, times}) {
     nest.computeInline(tensor.buf());

benchmarks/cpp/tensorexpr/bench_concat.cpp (+1 −1)

@@ -61,7 +61,7 @@ class ConcatBench : public benchmark::Fixture {

   Tensor output = Compute(
       "aten_cat",
-      {{output_size_[0], "M"}, {output_size_[1], "N"}},
+      {output_size_[0], output_size_[1]},
       [&](const VarHandle& m, const VarHandle& n) {
         int d = 0;
         std::vector<int> cumulative_concat_dim_sizes(num_inputs);

benchmarks/cpp/tensorexpr/bench_gemm.cpp (+10 −10)

@@ -44,12 +44,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
   te::BufHandle BP("B", {K, N}, te::kFloat);
   te::Tensor CT = te::Reduce(
       "gemm",
-      {{M, "M"}, {N, "N"}},
+      {M, N},
       te::Sum(),
       [&](const te::ExprHandle& m,
           const te::ExprHandle& n,
           const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {{K, "K"}});
+      {K});
   te::LoopNest loop({CT});
   loop.prepareForCodegen();
   te::StmtPtr s = loop.root_stmt();
@@ -66,12 +66,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
   te::BufHandle BP("B", {K, N}, te::kFloat);
   te::Tensor CT = te::Reduce(
       "gemm",
-      {{M, "M"}, {N, "N"}},
+      {M, N},
       te::Sum(),
       [&](const te::ExprHandle& m,
           const te::ExprHandle& n,
           const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {{K, "K"}});
+      {K});
   te::LoopNest loop({CT});

   {
@@ -124,12 +124,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
   te::BufHandle BP("B", {K, N}, te::kFloat);
   te::Tensor CT = te::Reduce(
       "gemm",
-      {{M, "M"}, {N, "N"}},
+      {M, N},
       te::Sum(),
       [&](const te::ExprHandle& m,
           const te::ExprHandle& n,
           const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {{K, "K"}});
+      {K});
   te::LoopNest loop({CT});

   {
@@ -182,12 +182,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
   te::BufHandle BP("B", {K, N}, te::kFloat);
   te::Tensor CT = te::Reduce(
       "gemm",
-      {{M, "M"}, {N, "N"}},
+      {M, N},
       te::Sum(),
       [&](const te::ExprHandle& m,
           const te::ExprHandle& n,
           const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {{K, "K"}});
+      {K});
   te::LoopNest loop({CT});

   {
@@ -248,12 +248,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
   te::BufHandle BP("B", {K, N}, te::kFloat);
   te::Tensor CT = te::Reduce(
       "gemm",
-      {{M, "M"}, {N, "N"}},
+      {M, N},
       te::Sum(),
       [&](const te::ExprHandle& m,
           const te::ExprHandle& n,
           const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
-      {{K, "K"}});
+      {K});
   te::LoopNest loop({CT});

   {
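The Reduce changes above follow the same pattern as Compute: both the output dimensions and the reduction dimensions become plain `ExprHandle` lists. A sketch of the new call shape, mirroring the gemm call sites above:

// Output dims come first ({M, N}, formerly {{M, "M"}, {N, "N"}}) and the
// reduction dims come last ({K}, formerly {{K, "K"}}); the body lambda, which
// receives one handle per output dim plus one per reduction dim, is unchanged.
te::Tensor CT = te::Reduce(
    "gemm",
    {M, N},
    te::Sum(),
    [&](const te::ExprHandle& m,
        const te::ExprHandle& n,
        const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); },
    {K});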

benchmarks/cpp/tensorexpr/bench_parallel.cpp (+1 −1)

@@ -38,7 +38,7 @@ class ParallelAdd : public benchmark::Fixture {
 BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
   BufHandle a_buf("a", {M}, kFloat);
   BufHandle b_buf("b", {M}, kFloat);
-  Tensor c_tensor = Compute("c", {{M, "m"}}, [&](const VarHandle& m) {
+  Tensor c_tensor = Compute("c", {M}, [&](const VarHandle& m) {
     return a_buf.load(m) + b_buf.load(m);
   });
   LoopNest loop_nest({c_tensor});

benchmarks/cpp/tensorexpr/bench_reduce.cpp (+7 −7)

@@ -235,12 +235,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
   te::BufHandle AP("A", {M}, te::kFloat);
   te::Tensor BT = te::Reduce(
       "reduce_full",
-      {{1, "N"}},
+      {1},
       te::Sum(),
       [&](const te::ExprHandle& n, const te::ExprHandle& m) {
         return AP.load(m);
       },
-      {{M, "M"}});
+      {M});

   te::LoopNest loop({BT});
   loop.prepareForCodegen();
@@ -266,12 +266,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
   te::BufHandle AP("A", {M}, te::kFloat);
   te::Tensor BT = te::Reduce(
       "reduce_full",
-      {{1, "N"}},
+      {1},
       te::Sum(),
       [&](const te::ExprHandle& n, const te::ExprHandle& m) {
         return AP.load(m);
       },
-      {{M, "M"}});
+      {M});

   te::LoopNest loop({BT});
   const int kChunkSize = 8;
@@ -305,12 +305,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
   te::BufHandle AP("A", {M}, te::kFloat);
   te::Tensor BT = te::Reduce(
       "reduce_full",
-      {{1, "N"}},
+      {1},
       te::Sum(),
       [&](const te::ExprHandle& n, const te::ExprHandle& m) {
         return AP.load(m);
       },
-      {{M, "M"}});
+      {M});

   te::LoopNest loop({BT});
   const int kChunkSize = 8;
@@ -349,7 +349,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
       {},
       te::Sum(),
       [&](const te::ExprHandle& m) { return AP.load(m); },
-      {{M, "M"}});
+      {M});

   te::LoopNest loop({BT});
   te::BufPtr rfac_buf;
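For full (scalar) reductions such as TeRfactorV1 above, the output-dims argument is simply an empty list. A minimal sketch of that shape, with names taken from the hunks above:

// Reduce an M-element buffer AP down to a scalar: no output dims, one
// reduction dim of extent M (formerly the DimArg list {{M, "M"}}).
te::Tensor BT = te::Reduce(
    "reduce_full",
    {},       // empty output dims: the result is a scalar
    te::Sum(),
    [&](const te::ExprHandle& m) { return AP.load(m); },
    {M});     // reduce over m in [0, M)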

benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp (+6 −6)

@@ -46,21 +46,21 @@ class SignedLog1pBench : public benchmark::Fixture {
       "input", {input_size_int_[0], input_size_int_[1]}, kFloat);
   Tensor abs_result = Compute(
       "aten_abs",
-      {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+      {input_size_int_[0], input_size_int_[1]},
       [&](const VarHandle& m, const VarHandle& n) {
         return abs(input_ph.load(m, n));
       });
   Tensor log1p_result = Compute(
       "aten_log1p",
-      {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+      {input_size_int_[0], input_size_int_[1]},
       [&](const VarHandle& m, const VarHandle& n) {
         return log1p(abs_result.load(m, n));
       });
   Tensor sign_result =
       computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
   Tensor output = Compute(
       "aten_mul",
-      {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+      {input_size_int_[0], input_size_int_[1]},
       [&](const VarHandle& m, const VarHandle& n) {
         return sign_result.load(m, n) * log1p_result.load(m, n);
       });
@@ -94,21 +94,21 @@ class SignedLog1pBench : public benchmark::Fixture {
       "input", {input_size_int_[0], input_size_int_[1]}, kFloat);
   Tensor abs_result = Compute(
       "aten_abs",
-      {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+      {input_size_int_[0], input_size_int_[1]},
       [&](const VarHandle& m, const VarHandle& n) {
         return abs(input_ph.load(m, n));
       });
   Tensor log_vml_result = Compute(
       "aten_log1p",
-      {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+      {input_size_int_[0], input_size_int_[1]},
       [&](const VarHandle& m, const VarHandle& n) {
         return log_vml(abs_result.load(m, n) + ExprHandle(1));
       });
   Tensor sign_result =
       computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
   Tensor output = Compute(
       "aten_mul",
-      {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+      {input_size_int_[0], input_size_int_[1]},
       [&](const VarHandle& m, const VarHandle& n) {
         return sign_result.load(m, n) * log_vml_result.load(m, n);
       });
