Remove batch argument for FILTERBANK-ACCUMULATE microkernels

PiperOrigin-RevId: 469378551
author: Marat Dukhan <maratek@google.com> 2022-08-22 23:46:05 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-22 23:47:00 -0700
commit: 0f51d3524b7d5ea4d40f24ab3aa9a5c0215bea50 (patch)
tree: 5ccb81ccd296a1ccd19278be7bc3ecb4dbd1e8ce
parent: 4bd21df6ac8c62bae76a6cf9ddd5e2bd9513f373 (diff)
download: XNNPACK-0f51d3524b7d5ea4d40f24ab3aa9a5c0215bea50.tar.gz
11 files changed, 39 insertions, 128 deletions
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc
index c899f65ba..a5c0a9a52 100644
--- a/bench/u32-filterbank-accumulate.cc
+++ b/bench/u32-filterbank-accumulate.cc
@@ -40,7 +40,7 @@ void filterbank_accumulate(
   std::iota(output.begin(), output.end(), 0);
 
   for (auto _ : state) {
-    filterbank_accumulate(rows, batch, input.data(), weight_widths.data(), weights.data(), output.data());
+    filterbank_accumulate(rows, input.data(), weight_widths.data(), weights.data(), output.data());
   }
 
   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
index 1181f3e3d..d57abef42 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x1.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -19,14 +19,12 @@
 
 void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
     size_t rows,
-    size_t batch_size,
     const uint32_t* input,
     const uint8_t* weight_widths,
     const uint16_t* weights,
     uint64_t* output) {
 
   assert(rows != 0);
-  assert(batch_size != 0);
   assert(input != NULL);
   assert(weight_widths != NULL);
   assert(weights != NULL);
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index 375ba5f08..155516885 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -19,14 +19,12 @@
 
 void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
     size_t rows,
-    size_t batch_size,
     const uint32_t* input,
     const uint8_t* weight_widths,
     const uint16_t* weights,
     uint64_t* output) {
 
   assert(rows != 0);
-  assert(batch_size != 0);
   assert(input != NULL);
   assert(weight_widths != NULL);
   assert(weights != NULL);
diff --git a/src/u32-filterbank-accumulate/gen/scalar-x1.c b/src/u32-filterbank-accumulate/gen/scalar-x1.c
index fdca24be5..b0e8e5749 100644
--- a/src/u32-filterbank-accumulate/gen/scalar-x1.c
+++ b/src/u32-filterbank-accumulate/gen/scalar-x1.c
@@ -17,14 +17,12 @@
 
 void xnn_u32_filterbank_accumulate_ukernel__scalar_x1(
     size_t rows,
-    size_t batch_size,
     const uint32_t* input,
     const uint8_t* weight_widths,
     const uint16_t* weights,
     uint64_t* output) {
 
   assert(rows != 0);
-  assert(batch_size != 0);
   assert(input != NULL);
   assert(weight_widths != NULL);
   assert(weights != NULL);
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index ea7e67aac..b51ac5a65 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -15,14 +15,12 @@
 
 void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
     size_t rows,
-    size_t batch_size,
     const uint32_t* input,
     const uint8_t* weight_widths,
     const uint16_t* weights,
     uint64_t* output) {
 
   assert(rows != 0);
-  assert(batch_size != 0);
   assert(input != NULL);
   assert(weight_widths != NULL);
   assert(weights != NULL);
diff --git a/src/u32-filterbank-accumulate/scalar.c.in b/src/u32-filterbank-accumulate/scalar.c.in
index e2d30374e..cdcf3bb4c 100644
--- a/src/u32-filterbank-accumulate/scalar.c.in
+++ b/src/u32-filterbank-accumulate/scalar.c.in
@@ -14,14 +14,12 @@ $assert BATCH_TILE == 1
 
 void xnn_u32_filterbank_accumulate_ukernel__scalar_x${BATCH_TILE}(
     size_t rows,
-    size_t batch_size,
     const uint32_t* input,
     const uint8_t* weight_widths,
     const uint16_t* weights,
     uint64_t* output) {
 
   assert(rows != 0);
-  assert(batch_size != 0);
   assert(input != NULL);
   assert(weight_widths != NULL);
   assert(weights != NULL);
diff --git a/src/xnnpack/filterbank.h b/src/xnnpack/filterbank.h
index 5349cf42c..90ce94051 100644
--- a/src/xnnpack/filterbank.h
+++ b/src/xnnpack/filterbank.h
@@ -18,7 +18,6 @@ extern "C" {
 #define DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                                        \
     size_t rows,                                                    \
-    size_t batch_size,                                              \
     const uint32_t* input,                                          \
     const uint8_t* weight_widths,                                   \
     const uint16_t* weights,                                        \
diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h
index bc66c5e49..8555ea44c 100644
--- a/src/xnnpack/microfnptr.h
+++ b/src/xnnpack/microfnptr.h
@@ -1641,7 +1641,6 @@ typedef void (*xnn_s16_window_ukernel_function)(
 
 typedef void (*xnn_u32_filterbank_accumulate_ukernel_function)(
     size_t rows,
-    size_t batch_size,
     const uint32_t* input,
     const uint8_t* weight_widths,
     const uint16_t* weights,
diff --git a/test/filterbank-accumulate-microkernel-tester.h b/test/filterbank-accumulate-microkernel-tester.h
index cda264807..6a85e81df 100644
--- a/test/filterbank-accumulate-microkernel-tester.h
+++ b/test/filterbank-accumulate-microkernel-tester.h
@@ -32,16 +32,6 @@ class FilterbankAccumulateMicrokernelTester {
     return this->rows_;
   }
 
-  inline FilterbankAccumulateMicrokernelTester& batch(size_t batch) {
-    assert(batch != 0);
-    this->batch_ = batch;
-    return *this;
-  }
-
-  inline size_t batch() const {
-    return this->batch_;
-  }
-
   inline FilterbankAccumulateMicrokernelTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
@@ -54,37 +44,42 @@ class FilterbankAccumulateMicrokernelTester {
   void Test(xnn_u32_filterbank_accumulate_ukernel_function filterbank_accumulate) const {
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
+    auto u8rng = std::bind(std::uniform_int_distribution<uint16_t>(1, 10), std::ref(rng));
     auto u16rng = std::bind(std::uniform_int_distribution<uint16_t>(), std::ref(rng));
     auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), std::ref(rng));
 
-    std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> input(batch() + XNN_EXTRA_BYTES / sizeof(int16_t));
-    std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> weight_widths(rows());
-    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(batch() * 2 + XNN_EXTRA_BYTES / sizeof(uint16_t));
-    std::vector<uint64_t, AlignedAllocator<uint64_t, 64>> output(rows());
-    std::vector<uint64_t, AlignedAllocator<uint64_t, 64>> output_ref(rows());
+    std::vector<uint8_t> filterbank_widths(rows());
+    std::vector<uint64_t> output(rows());
+    std::vector<uint64_t> output_ref(rows());
 
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(filterbank_widths.begin(), filterbank_widths.end(), std::ref(u8rng));
+      const size_t num_channels = std::accumulate(filterbank_widths.cbegin(), filterbank_widths.cend(), 0);
+
+      std::vector<uint32_t> input(num_channels + XNN_EXTRA_BYTES / sizeof(int16_t));
       std::generate(input.begin(), input.end(), std::ref(u32rng));
-      std::fill(weight_widths.begin(), weight_widths.end(), rows());
+
+      std::vector<uint16_t> weights(num_channels * 2 + XNN_EXTRA_BYTES / sizeof(uint16_t));
       std::generate(weights.begin(), weights.end(), std::ref(u16rng));
-      std::iota(output.begin(), output.end(), 0);
-      std::iota(output_ref.begin(), output_ref.end(), 1);
+
+      std::fill(output.begin(), output.end(), UINT64_C(0xCAFEB0BADEADBEAF));
 
       uint64_t weight_accumulator = 0;
       uint64_t unweight_accumulator = 0;
       size_t i = 0;
       for (size_t m = 0; m < rows(); m++) {
-        const size_t weight_width = (size_t) weight_widths[m];
-        for (size_t n = 0; n < weight_width; n++, i++) {
-          weight_accumulator += (uint64_t) input[i] * (uint64_t) weights[i * 2];
-          unweight_accumulator += (uint64_t) input[i] * (uint64_t) weights[i * 2 + 1];
+        const size_t weight_width = filterbank_widths[m];
+        for (size_t n = 0; n < weight_width; n++) {
+          weight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2]);
+          unweight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2 + 1]);
+          i += 1;
         }
         output_ref[m] = weight_accumulator;
         weight_accumulator = unweight_accumulator;
       }
 
       // Call optimized micro-kernel.
-      filterbank_accumulate(rows(), batch(), input.data(), weight_widths.data(), weights.data(), output.data());
+      filterbank_accumulate(rows(), input.data(), filterbank_widths.data(), weights.data(), output.data());
 
       // Verify results.
       for (size_t m = 0; m < rows(); m++) {
@@ -96,6 +91,5 @@ class FilterbankAccumulateMicrokernelTester {
 
  private:
   size_t rows_{1};
-  size_t batch_{1};
   size_t iterations_{15};
 };
diff --git a/test/u32-filterbank-accumulate.cc b/test/u32-filterbank-accumulate.cc
index 7a87cdae3..a1c0fd5d8 100644
--- a/test/u32-filterbank-accumulate.cc
+++ b/test/u32-filterbank-accumulate.cc
@@ -18,94 +18,53 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, batch_eq_1) {
+  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_eq_1) {
     TEST_REQUIRES_ARM_NEON;
     FilterbankAccumulateMicrokernelTester()
-      .batch(1)
+      .rows(1)
       .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1);
   }
 
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, batch_gt_1) {
+  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_gt_1) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t batch = 2; batch < 10; batch++) {
+    for (size_t rows = 2; rows <= 10; rows++) {
       FilterbankAccumulateMicrokernelTester()
-        .batch(batch)
+        .rows(rows)
         .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1);
     }
   }
-
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_eq_2) {
-    TEST_REQUIRES_ARM_NEON;
-    FilterbankAccumulateMicrokernelTester()
-      .rows(2)
-      .batch(1)
-      .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1);
-  }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, batch_eq_2) {
+  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_eq_1) {
     TEST_REQUIRES_ARM_NEON;
     FilterbankAccumulateMicrokernelTester()
-      .batch(2)
+      .rows(1)
       .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
   }
 
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, batch_div_2) {
+  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_gt_1) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t batch = 4; batch < 20; batch += 2) {
+    for (size_t rows = 2; rows <= 10; rows++) {
       FilterbankAccumulateMicrokernelTester()
-        .batch(batch)
+        .rows(rows)
         .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
     }
   }
-
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, batch_lt_2) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t batch = 1; batch < 2; batch++) {
-      FilterbankAccumulateMicrokernelTester()
-        .batch(batch)
-        .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
-    }
-  }
-
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, batch_gt_2) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t batch = 3; batch < 4; batch++) {
-      FilterbankAccumulateMicrokernelTester()
-        .batch(batch)
-        .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
-    }
-  }
-
-  TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_eq_2) {
-    TEST_REQUIRES_ARM_NEON;
-    FilterbankAccumulateMicrokernelTester()
-      .rows(2)
-      .batch(2)
-      .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
-  }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, batch_eq_1) {
+TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_eq_1) {
   FilterbankAccumulateMicrokernelTester()
-    .batch(1)
+    .rows(1)
     .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1);
 }
 
-TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, batch_gt_1) {
-  for (size_t batch = 2; batch < 10; batch++) {
+TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_gt_1) {
+  for (size_t rows = 2; rows <= 10; rows++) {
     FilterbankAccumulateMicrokernelTester()
-      .batch(batch)
+      .rows(rows)
       .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1);
   }
 }
-
-TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_eq_2) {
-  FilterbankAccumulateMicrokernelTester()
-    .rows(2)
-    .batch(1)
-    .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1);
-}
diff --git a/tools/generate-filterbank-accumulate-test.py b/tools/generate-filterbank-accumulate-test.py
index 41bf97009..2a06a9396 100755
--- a/tools/generate-filterbank-accumulate-test.py
+++ b/tools/generate-filterbank-accumulate-test.py
@@ -37,54 +37,24 @@ def split_ukernel_name(name):
 
 
 FILTERBANK_ACCUMULATE_TEST_TEMPLATE = """\
-TEST(${TEST_NAME}, batch_eq_${BATCH_TILE}) {
+TEST(${TEST_NAME}, rows_eq_1) {
   $if ISA_CHECK:
     ${ISA_CHECK};
   FilterbankAccumulateMicrokernelTester()
-    .batch(${BATCH_TILE})
+    .rows(1)
     .Test(${", ".join(TEST_ARGS)});
 }
 
-$if BATCH_TILE > 1:
-  TEST(${TEST_NAME}, batch_div_${BATCH_TILE}) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    for (size_t batch = ${BATCH_TILE*2}; batch < ${BATCH_TILE*10}; batch += ${BATCH_TILE}) {
-      FilterbankAccumulateMicrokernelTester()
-        .batch(batch)
-        .Test(${", ".join(TEST_ARGS)});
-    }
-  }
-
-  TEST(${TEST_NAME}, batch_lt_${BATCH_TILE}) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    for (size_t batch = 1; batch < ${BATCH_TILE}; batch++) {
-      FilterbankAccumulateMicrokernelTester()
-        .batch(batch)
-        .Test(${", ".join(TEST_ARGS)});
-    }
-  }
-
-TEST(${TEST_NAME}, batch_gt_${BATCH_TILE}) {
+TEST(${TEST_NAME}, rows_gt_1) {
   $if ISA_CHECK:
     ${ISA_CHECK};
-  for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) {
+  for (size_t rows = 2; rows <= 10; rows++) {
     FilterbankAccumulateMicrokernelTester()
-      .batch(batch)
+      .rows(rows)
       .Test(${", ".join(TEST_ARGS)});
   }
 }
 
-TEST(${TEST_NAME}, rows_eq_2) {
-  $if ISA_CHECK:
-    ${ISA_CHECK};
-  FilterbankAccumulateMicrokernelTester()
-    .rows(2)
-    .batch(${BATCH_TILE})
-    .Test(${", ".join(TEST_ARGS)});
-}
-
 
 """
author	Marat Dukhan <maratek@google.com>	2022-08-22 23:46:05 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-08-22 23:47:00 -0700
commit	0f51d3524b7d5ea4d40f24ab3aa9a5c0215bea50 (patch)
tree	5ccb81ccd296a1ccd19278be7bc3ecb4dbd1e8ce
parent	4bd21df6ac8c62bae76a6cf9ddd5e2bd9513f373 (diff)
download	XNNPACK-0f51d3524b7d5ea4d40f24ab3aa9a5c0215bea50.tar.gz