google
diff --git a/‎CONTRIBUTORS
+1 b/‎CONTRIBUTORS
+1
diff --git a/‎fixedpoint/fixedpoint.h
+4-4 b/‎fixedpoint/fixedpoint.h
+4-4
diff --git a/‎fixedpoint/fixedpoint_neon.h
+26 b/‎fixedpoint/fixedpoint_neon.h
+26
diff --git a/‎internal/dispatch_gemm_shape.h
+16 b/‎internal/dispatch_gemm_shape.h
+16
diff --git a/‎internal/kernel.h
+21-4 b/‎internal/kernel.h
+21-4
diff --git a/‎internal/kernel_default.h
+39-29 b/‎internal/kernel_default.h
+39-29
diff --git a/‎internal/kernel_neon.h
+22 b/‎internal/kernel_neon.h
+22
diff --git a/‎internal/output.h
+66-2 b/‎internal/output.h
+66-2
@@ -18,6 +18,7 @@ Maciek Chociej <[email protected]>
 Justine Tunney <[email protected]>
 Mark J. Matthews <[email protected]>
 Marie White <[email protected]>
+Suharsh Sivakumar <[email protected]>
 
 Intel:
 Sagi Marcovich <[email protected]>
 
@@ -121,8 +121,8 @@ tIntegerType Neg(tIntegerType a) {
 // in the overflow case, we just want to avoid undefined behavior.
 //
 // tIntegerType may be int32 or any narrower signed type.
-template <typename tIntegerType>
-tIntegerType ShiftLeft(tIntegerType a, int offset) {
+template <typename tIntegerType, typename OffsetType>
+tIntegerType ShiftLeft(tIntegerType a, OffsetType offset) {
   const std::int64_t wide_a = static_cast<std::int64_t>(a);
   const std::int64_t wide_shifted = wide_a * (1 << offset);
   const auto min = std::numeric_limits<tIntegerType>::min();
@@ -353,8 +353,8 @@ inline std::int16_t SaturatingRoundingDoublingHighMul(std::int16_t a,
 
 // Correctly-rounded-to-nearest division by a power-of-two.
 // Also known as a rounding arithmetic right shift.
-template <typename IntegerType>
-inline IntegerType RoundingDivideByPOT(IntegerType x, int exponent) {
+template <typename IntegerType, typename ExponentType>
+inline IntegerType RoundingDivideByPOT(IntegerType x, ExponentType exponent) {
   assert(exponent >= 0);
   assert(exponent <= 31);
   const IntegerType mask = Dup<IntegerType>((1ll << exponent) - 1);
 
@@ -114,6 +114,16 @@ inline int16x8_t ShiftLeft(int16x8_t a, int offset) {
   return vshlq_s16(a, vdupq_n_s16(offset));
 }
 
+template <>
+inline int32x4_t ShiftLeft(int32x4_t a, int32x4_t offset) {
+  return vshlq_s32(a, offset);  // offset supplies one shift count per lane
+}
+
+template <>
+inline int16x8_t ShiftLeft(int16x8_t a, int16x8_t offset) {
+  return vshlq_s16(a, offset);  // per-lane shift on 16-bit lanes
+}
+
 template <>
 inline int32x4_t ShiftRight(int32x4_t a, int offset) {
   return vshlq_s32(a, vdupq_n_s32(-offset));
@@ -282,6 +292,22 @@ inline int16x8_t RoundingDivideByPOT(int16x8_t x, int exponent) {
   return vrshlq_s16(fixed_up_x, shift_vec);
 }
 
+template <>
+inline int32x4_t RoundingDivideByPOT(int32x4_t x, int32x4_t exponent) {
+  const int32x4_t shift_vec = vnegq_s32(exponent);  // negate: shift right
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);  // 0 or -1
+  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);  // saturating add
+  return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
+template <>
+inline int16x8_t RoundingDivideByPOT(int16x8_t x, int16x8_t exponent) {
+  const int16x8_t shift_vec = vnegq_s16(exponent);  // negate: shift right
+  const int16x8_t fixup = vshrq_n_s16(vandq_s16(x, shift_vec), 15);  // 0 or -1
+  const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);  // saturating add
+  return vrshlq_s16(fixed_up_x, shift_vec);
+}
+
 template <int Exponent>
 struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
   static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
 
@@ -85,6 +85,22 @@ struct TransposeImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>> {
   }
 };
 
+template <VectorShape Shape>
+struct TransposeImpl<OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>> {
+  typedef OutputStageScaleInt32ByFixedPointAndExponentPC<Shape> SrcType;
+  static const VectorShape TransposedShape = TransposeVectorShape<Shape>::Value;
+  typedef OutputStageScaleInt32ByFixedPointAndExponentPC<TransposedShape>
+      DstType;  // Same output stage, with the transposed vector shape.
+  static DstType Run(const SrcType& src) {
+    DstType dst;  // Multiplier/exponent are transposed; offset is copied.
+    dst.result_fixedpoint_multiplier =
+        Transpose(src.result_fixedpoint_multiplier);
+    dst.result_exponent = Transpose(src.result_exponent);
+    dst.result_offset_after_shift = src.result_offset_after_shift;
+    return dst;
+  }
+};
+
 template <typename VectorMapType>
 struct TransposeImpl<OutputStageBiasAddition<VectorMapType>> {
   typedef OutputStageBiasAddition<VectorMapType> SrcType;
 
@@ -145,12 +145,24 @@ struct KernelSideFormat {
   static const int kCells = tCells;
   static const int kWidth = kCells * Cell::kWidth;
   static const int kDepth = Cell::kDepth;
-  typedef std::uint8_t Scalar;
+  typedef std::uint8_t Scalar;       // The scalar type of the Format.
+  typedef std::uint8_t InputScalar;  // The scalar type of the original input.
 };
 
+// KernelSideFormat for the int8 fast kernel trick. The original input is uint8,
+// but the pack step converts it to int8.
 template <typename tCellFormat, int tCells>
 struct KernelSideFormatInt8 : KernelSideFormat<tCellFormat, tCells> {
   typedef std::int8_t Scalar;
+  typedef std::uint8_t InputScalar;
+};
+
+// KernelSideFormat for int8 inputs, enabling the int8 fast kernel trick without
+// any conversion in the pack step.
+template <typename tCellFormat, int tCells>
+struct KernelSideFormatInt8Inputs : KernelSideFormat<tCellFormat, tCells> {
+  typedef std::int8_t Scalar;       // The scalar type of the Format.
+  typedef std::int8_t InputScalar;  // The scalar type of the original input.
 };
 
 // KernelFormat describes fully the input data layout that a kernel expects.
@@ -216,19 +228,24 @@ struct KernelBase {
   virtual ~KernelBase() {}
 };
 
-template <typename KernelScalarType>
+template <typename InputKernelScalarType, typename KernelScalarType>
 struct ZeroPointInputValue {};
 
 template <>
-struct ZeroPointInputValue<std::uint8_t> {
+struct ZeroPointInputValue<std::uint8_t, std::uint8_t> {  // uint8 in, uint8 kernel
   static constexpr std::uint8_t kValue = 0;
 };
 
 template <>
-struct ZeroPointInputValue<std::int8_t> {
+struct ZeroPointInputValue<std::uint8_t, std::int8_t> {  // uint8 in, int8 kernel
   static constexpr std::uint8_t kValue = 128;
 };
 
+template <>
+struct ZeroPointInputValue<std::int8_t, std::int8_t> {  // int8 in, int8 kernel
+  static constexpr std::uint8_t kValue = 0;
+};
+
 }  // namespace gemmlowp
 
 #endif  // GEMMLOWP_INTERNAL_KERNEL_H_
@@ -20,74 +20,84 @@
 
 #include "../public/bit_depth.h"
 #include "common.h"
+#include "kernel.h"
 #include "kernel_reference.h"
 
 namespace gemmlowp {
 
-template <bool MaxProductIsLessThan4096, bool LhsAlwaysNonzero>
+template <bool MaxProductIsLessThan4096, bool IsUnsigned, bool LhsNonZero>
 struct DefaultKernelImpl {};
 
-// Partial specialization implementing the logic that if we want to use
-// a kernel for LhsAlwaysNonzero but do not have such a kernel, then we fall
-// back to a generic kernel not taking advantage of LhsAlwaysNonzero.
-template <bool LhsAlwaysNonzero>
-struct DefaultKernelImpl<true, LhsAlwaysNonzero>
-    : DefaultKernelImpl<false, LhsAlwaysNonzero> {};
-
 // Partial specialization implementing the logic that if we want to use
 // a kernel for MaxProductIsLessThan4096 but do not have such a kernel, then we
 // fall back to a generic kernel not taking advantage of
 // MaxProductIsLessThan4096.
+template <bool LhsNonZero>
+struct DefaultKernelImpl<true, true, LhsNonZero>
+    : DefaultKernelImpl<false, true, LhsNonZero> {};
+
+// Partial specialization implementing the logic that if we want to use
+// a kernel for LhsNonZero but do not have such a kernel, then we fall
+// back to a generic kernel not taking advantage of LhsNonZero.
 template <bool MaxProductIsLessThan4096>
-struct DefaultKernelImpl<MaxProductIsLessThan4096, true>
-    : DefaultKernelImpl<MaxProductIsLessThan4096, false> {};
+struct DefaultKernelImpl<MaxProductIsLessThan4096, true, true>
+    : DefaultKernelImpl<MaxProductIsLessThan4096, true, false> {};
 
 template <typename BitDepthParams>
 struct DefaultKernel
     : DefaultKernelImpl<(BitDepthParams::LhsRange::kMaxValue *
                              BitDepthParams::RhsRange::kMaxValue <
                          4096),
-                        (BitDepthParams::LhsRange::kMinValue > 0)> {};
+                        (BitDepthParams::LhsRange::kMinValue >= 0),  // IsUnsigned
+                        (BitDepthParams::LhsRange::kMinValue > 0 ||  // LhsNonZero
+                         (BitDepthParams::LhsRange::kMaxValue <= 127 &&
+                          BitDepthParams::LhsRange::kMinValue > -128))> {};
 
 }  // end namespace gemmlowp
 
-#define GEMMLOWP_SET_DEFAULT_KERNEL(MaxProductIsLessThan4096,          \
-                                    LhsAlwaysNonzero, Kernel)          \
-  namespace gemmlowp {                                                 \
-  template <>                                                          \
-  struct DefaultKernelImpl<MaxProductIsLessThan4096, LhsAlwaysNonzero> \
-      : Kernel {};                                                     \
+#define GEMMLOWP_SET_DEFAULT_KERNEL(MaxProductIsLessThan4096, IsUnsigned, \
+                                    LhsAlwaysNonZero, Kernel)             \
+  namespace gemmlowp {                                                    \
+  template <>                                                             \
+  struct DefaultKernelImpl<MaxProductIsLessThan4096, IsUnsigned,          \
+                           LhsAlwaysNonZero> : Kernel {};                 \
   }
 
+// User-provided int8 inputs are only supported in the NEON path currently.
 #if defined GEMMLOWP_NEON_32
 #include "kernel_neon.h"
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, NEON_32_Kernel12x4Depth2)
-GEMMLOWP_SET_DEFAULT_KERNEL(true, false,
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, NEON_32_Kernel12x4Depth2)
+GEMMLOWP_SET_DEFAULT_KERNEL(true, true, false,
                             NEON_32_Kernel12x4Depth2Assuming12BitProducts)
-GEMMLOWP_SET_DEFAULT_KERNEL(false, true,
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true,
                             NEON_32bit_GEMM_Int8Operands_LhsNonzero)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, false, true,
+                            NEON_32bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs)
 #elif defined GEMMLOWP_NEON_64
 #include "kernel_neon.h"
 #if defined GEMMLOWP_DOTPROD_KERNEL
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, NEON_64_Kernel12x8Depth4_dotprod)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false,
+                            NEON_64_Kernel12x8Depth4_dotprod)
 #else
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, NEON_64_Kernel12x8Depth2)
-GEMMLOWP_SET_DEFAULT_KERNEL(false, true,
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, NEON_64_Kernel12x8Depth2)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true,
                             NEON_64bit_GEMM_Int8Operands_LhsNonzero)
 #endif
+GEMMLOWP_SET_DEFAULT_KERNEL(false, false, true,
+                            NEON_64bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs)
 #elif defined(GEMMLOWP_MSA)
 #include "kernel_msa.h"
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, MSA_Kernel12x8Depth2)
-GEMMLOWP_SET_DEFAULT_KERNEL(false, true, MSA_GEMM_Int8Operands_LhsNonzero)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, MSA_Kernel12x8Depth2)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true, MSA_GEMM_Int8Operands_LhsNonzero)
 #elif defined GEMMLOWP_SSE4_32
 #include "kernel_sse.h"
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, SSE4_32_Kernel4x4Depth2)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, SSE4_32_Kernel4x4Depth2)
 #elif defined GEMMLOWP_SSE4_64
 #include "kernel_sse.h"
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, SSE4_64_Kernel12x4Depth2)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, SSE4_64_Kernel12x4Depth2)
 #elif defined GEMMLOWP_AVX2_64
 #include "kernel_avx.h"
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, AVX2_64_Kernel24x8Depth2)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, AVX2_64_Kernel24x8Depth2)
 #else
 #include "kernel_reference.h"
 namespace gemmlowp {
@@ -96,7 +106,7 @@ typedef ReferenceKernel<KernelFormat<
     KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> > >
     DefaultReferenceKernel;
 }
-GEMMLOWP_SET_DEFAULT_KERNEL(false, false, DefaultReferenceKernel)
+GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, DefaultReferenceKernel)
 #endif
 
 #endif  // GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_
@@ -924,6 +924,17 @@ struct NEON_32bit_GEMM_Int8Operands_LhsNonzero : KernelBase {
   }
 };
 
+// Inherits NEON_32bit_GEMM_Int8Operands_LhsNonzero and only declares a Format
+// built from KernelSideFormatInt8Inputs, i.e. user inputs must already be
+// int8. This avoids the uint8->int8 conversion in the pack step.
+struct NEON_32bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs
+    : NEON_32bit_GEMM_Int8Operands_LhsNonzero {
+  typedef KernelFormat<
+      KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
+      KernelSideFormatInt8Inputs<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
+      Format;
+};
+
 #endif  // GEMMLOWP_NEON_32
 
 // The kernels here are specifically arm 64bit assembly, not arm 32bit.
@@ -1265,6 +1276,17 @@ struct NEON_64bit_GEMM_Int8Operands_LhsNonzero : KernelBase {
   }
 };
 
+// Inherits NEON_64bit_GEMM_Int8Operands_LhsNonzero and only declares a Format
+// built from KernelSideFormatInt8Inputs, i.e. user inputs must already be
+// int8. This avoids the uint8->int8 conversion in the pack step.
+struct NEON_64bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs
+    : NEON_64bit_GEMM_Int8Operands_LhsNonzero {
+  typedef KernelFormat<
+      KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
+      KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
+      Format;
+};
+
 // Our main GEMM kernel.
 struct NEON_64_Kernel12x8Depth2 : KernelBase {
   typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
 
@@ -22,6 +22,7 @@
 #include <cmath>
 #include <tuple>
 #include <type_traits>
+#include <typeinfo>
 
 #include "../fixedpoint/fixedpoint.h"
 #include "../public/output_stages.h"
@@ -179,7 +180,47 @@ struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent,
   int right_shift;
 };
 
-// Implementation of OutputStageSaturatingCastToUint8 for scalar data
+template <int Rows, int Cols, VectorShape Shape>
+struct OutputStageEvalImpl<
+    OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>,
+    RegisterBlock<std::int32_t, Rows, Cols>> {  // int32 block in, int32 out
+  typedef RegisterBlock<std::int32_t, Rows, Cols> InputType;
+  typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType;
+
+  typedef OutputStageScaleInt32ByFixedPointAndExponentPC<Shape> OutputStage;
+
+  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
+
+  OutputType Eval(InputType input, int row, int col) const {
+    OutputType output;
+    const int pos = Shape == VectorShape::Row ? col : row;
+    using RegisterType = typename InputType::RegisterType;
+    const RegisterType result_offset_after_shift =
+        Dup<RegisterType>(output_stage.result_offset_after_shift);
+    auto left_shift =  // Both shifts are derived from result_exponent at pos.
+        LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
+    auto right_shift =
+        LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
+    const auto result_fixedpoint_multiplier = LoadForBroadcasting<InputType>(
+        output_stage.result_fixedpoint_multiplier, pos);
+    for (int i = 0; i < decltype(left_shift)::kRegisterCount; i++) {
+      left_shift.buf.reg[i] = Max(left_shift.buf.reg[i], 0);  // max(exp, 0)
+      right_shift.buf.reg[i] = Max(-right_shift.buf.reg[i], 0);  // max(-exp, 0)
+    }
+    const auto mulhigh_val = BroadcastSaturatingRoundingDoublingHighMul(
+        BroadcastShiftLeft(input, left_shift), result_fixedpoint_multiplier);
+    const auto rdpot_val =
+        BroadcastRoundingDivideByPOT(mulhigh_val, right_shift);
+    for (int i = 0; i < InputType::kRegisterCount; i++) {
+      output.buf.reg[i] = Add(rdpot_val.buf.reg[i], result_offset_after_shift);
+    }
+    return output;  // rdpot(mulhigh(input << left, mult), right) + offset
+  }
+
+  const OutputStage& output_stage;  // Held by reference, not copied.
+};
+
+// Implementation of OutputStageSaturatingCastToUint8 for scalar data.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
                                  RegisterBuffer<std::int32_t, Size>> {
@@ -202,7 +243,30 @@ struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
   }
 };
 
-// Implementation of OutputStageSaturatingCastToInt16 for scalar data
+// Implementation of OutputStageSaturatingCastToInt8 for scalar data.
+template <int Size>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8,
+                                 RegisterBuffer<std::int32_t, Size>> {
+  typedef RegisterBuffer<std::int32_t, Size> InputType;
+  typedef RegisterBuffer<std::int8_t, Size> OutputType;
+  static_assert(InputType::kRegisterLanes == 1,
+                "This path is only for scalar values");
+
+  typedef OutputStageSaturatingCastToInt8 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}  // Argument is unused.
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    for (int i = 0; i < InputType::kRegisterCount; i++) {
+      std::int32_t data = input.reg[i];
+      output.reg[i] = data > 127 ? 127 : data < -128 ? -128 : data;  // clamp
+    }
+    return output;
+  }
+};
+
+// Implementation of OutputStageSaturatingCastToInt16 for scalar data.
 template <int Size>
 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
                                  RegisterBuffer<std::int32_t, Size>> {