FILTERBANK-ACCUMULATE Neon microkernels with unweights accumulator set to 0

PiperOrigin-RevId: 469816851
author: Frank Barchard <fbarchard@google.com> 2022-08-24 13:57:31 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-24 13:58:08 -0700
commit: b1aad88bb522c36dce8a9ffe1f7f0d2b2743088b (patch)
tree: 519dfd7013e4d0e19e5e9489d36e14629fab6802
parent: 84e5af66f53cd9b2b57dff1ef3aacf2b988824ff (diff)
download: XNNPACK-b1aad88bb522c36dce8a9ffe1f7f0d2b2743088b.tar.gz
10 files changed, 334 insertions, 1 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index ec2005453..4c45afc0f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4214,6 +4214,8 @@ ALL_NEON_MICROKERNEL_SRCS = [
     "src/u8-maxpool/9p8x-minmax-neon-c16.c",
     "src/u8-rmax/neon.c",
     "src/u8-vclamp/neon-x64.c",
+    "src/u32-filterbank-accumulate/gen/neon-x1.c",
+    "src/u32-filterbank-accumulate/gen/neon-x2.c",
     "src/xx-fill/neon-x64.c",
     "src/xx-pad/neon.c",
     "src/x8-transposec/gen/8x8-multi-dec-zip-neon.c",
@@ -8307,6 +8309,8 @@ AARCH32_ASM_MICROKERNEL_SRCS = [
     "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
     "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
     "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "src/u32-filterbank-accumulate/aarch32-neon-x1.S",
+    "src/u32-filterbank-accumulate/aarch32-neon-x2.S",
 ]
 
 AARCH64_ASM_MICROKERNEL_SRCS = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ecc493cb..bb154ca33 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2708,6 +2708,8 @@ SET(ALL_NEON_MICROKERNEL_SRCS
   src/u8-maxpool/9p8x-minmax-neon-c16.c
   src/u8-rmax/neon.c
   src/u8-vclamp/neon-x64.c
+  src/u32-filterbank-accumulate/gen/neon-x1.c
+  src/u32-filterbank-accumulate/gen/neon-x2.c
   src/xx-fill/neon-x64.c
   src/xx-pad/neon.c
   src/x8-transposec/gen/8x8-multi-dec-zip-neon.c
@@ -6748,7 +6750,9 @@ SET(AARCH32_ASM_MICROKERNEL_SRCS
   src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
   src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
   src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
-  src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S)
+  src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
+  src/u32-filterbank-accumulate/aarch32-neon-x1.S
+  src/u32-filterbank-accumulate/aarch32-neon-x2.S)
 
 SET(AARCH64_ASM_MICROKERNEL_SRCS
   src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc
index 948998339..ba064f945 100644
--- a/bench/u32-filterbank-accumulate.cc
+++ b/bench/u32-filterbank-accumulate.cc
@@ -67,6 +67,16 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
   b->Args({1, 13});
 }
 
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1,  xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1,  benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2,  xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2,  benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x1,  xnn_u32_filterbank_accumulate_ukernel__neon_x1,  benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x2,  xnn_u32_filterbank_accumulate_ukernel__neon_x2,  benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
 BENCHMARK_CAPTURE(filterbank_accumulate, u32_scalar_x1, xnn_u32_filterbank_accumulate_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
 
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
diff --git a/scripts/generate-u32-filterbank-accumulate.sh b/scripts/generate-u32-filterbank-accumulate.sh
index f2dee01d0..36f44f266 100755
--- a/scripts/generate-u32-filterbank-accumulate.sh
+++ b/scripts/generate-u32-filterbank-accumulate.sh
@@ -7,6 +7,10 @@
 ################################### SCALAR ###################################
 tools/xngen src/u32-filterbank-accumulate/scalar.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/scalar-x1.c &
 
+################################### NEON ###################################
+tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/neon-x1.c &
+tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=2 -o src/u32-filterbank-accumulate/gen/neon-x2.c &
+
 ################################## Unit tests #################################
 tools/generate-filterbank-accumulate-test.py --spec test/u32-filterbank-accumulate.yaml --output test/u32-filterbank-accumulate.cc &
 
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
new file mode 100644
index 000000000..55011c80d
--- /dev/null
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
@@ -0,0 +1,57 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1(
+//     size_t rows,                          r0
+//     const uint32_t* input,                r1
+//     const uint8_t* weight_widths,         r2
+//     const uint16_t* weights,              r3
+//     uint64_t* output)                     sp -> r12
+
+// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
+
+// Register usage
+// input   r1  d2
+// weights r3  d3 d4 d5
+// output  r12 d0 d1
+
+// weight_widths r2 r4
+
+BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+        .arm
+#ifndef __APPLE__
+        .arch   armv7-a
+        .fpu    neon
+#endif
+        LDR     r12, [sp]               // output
+        PUSH    {r4,lr}                 // push 8 bytes
+        VMOV.U8 d0, #0                  // weight_accumulator
+0:
+        LDRB    r4, [r2], #1            // weight_widths
+        VMOV.U8 d1, #0                  // unweight_accumulator
+1:
+        VLD1.32 {d3[]}, [r3]!           // weights
+        VLD1.32 {d2[]}, [r1]!           // input
+        SUBS    r4, r4, #1
+        VMOVL.U16 q2, d3
+        VMLAL.U32 q0, d4, d2
+        BHI     1b
+
+        VST1.64 {d0}, [r12]!
+        SUBS    r0, r0, #1
+        VMOV    d0, d1
+        BNE     0b
+
+        POP     {r4,pc}
+
+END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
new file mode 100644
index 000000000..840d7c9f3
--- /dev/null
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
@@ -0,0 +1,69 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2(
+//     size_t rows,                          r0
+//     const uint32_t* input,                r1
+//     const uint8_t* weight_widths,         r2
+//     const uint16_t* weights,              r3
+//     uint64_t* output)                     sp -> r12
+
+// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
+
+// Register usage
+// input   r1  d2
+// weights r3  d3 d4 d5
+// output  r12 d0 d1
+
+// weight_widths r2 r4
+
+BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+        .arm
+#ifndef __APPLE__
+        .arch   armv7-a
+        .fpu    neon
+#endif
+        LDR     r12, [sp]               // output
+        PUSH    {r4,lr}                 // push 8 bytes
+        VMOV.U8 d0, #0                  // weight_accumulator
+0:
+        LDRB    r4, [r2], #1            // weight_widths
+        SUBS    r4, r4, #1
+        VMOV.U8 d1, #0                  // unweight_accumulator
+        BLS     2f                      // less than 2 weights?
+
+1:
+        VLD1.16 {d3}, [r3]!             // weights
+        VLD1.32 {d2}, [r1]!             // input
+        SUBS    r4, r4, #2
+        VMOVL.U16 q2, d3
+        VMLAL.U32 q0, d4, d2[0]
+        VMLAL.U32 q0, d5, d2[1]
+        BHI     1b
+
+        BLO     3f                      // is there a remainder?
+2:
+        VLD1.32 {d3[]}, [r3]!           // weights
+        VLD1.32 {d2[]}, [r1]!           // input
+        VMOVL.U16 q2, d3
+        VMLAL.U32 q0, d4, d2
+
+3:
+        VST1.64 {d0}, [r12]!
+        SUBS    r0, r0, #1
+        VMOV    d0, d1
+        BNE     0b
+
+        POP     {r4,pc}
+
+END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
new file mode 100644
index 000000000..0a59096c4
--- /dev/null
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -0,0 +1,51 @@
+// Auto-generated file. Do not edit!
+//   Template: src/u32-filterbank-accumulate/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/filterbank.h>
+
+
+void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
+    size_t rows,
+    const uint32_t* input,
+    const uint8_t* weight_widths,
+    const uint16_t* weights,
+    uint64_t* output) {
+
+  assert(rows != 0);
+  assert(input != NULL);
+  assert(weight_widths != NULL);
+  assert(weights != NULL);
+  assert(output != NULL);
+
+  uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+  do {
+    size_t n = (size_t) *weight_widths++;
+    assert(n != 0);
+
+    do {
+      const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+      const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+      const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+      weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+    } while (--n != 0);
+
+    vst1_u64(output, vget_low_u64(weight_accumulator));  output += 1;
+    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
+  } while (--rows != 0);
+}
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
new file mode 100644
index 000000000..743244a83
--- /dev/null
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -0,0 +1,60 @@
+// Auto-generated file. Do not edit!
+//   Template: src/u32-filterbank-accumulate/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/filterbank.h>
+
+
+void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
+    size_t rows,
+    const uint32_t* input,
+    const uint8_t* weight_widths,
+    const uint16_t* weights,
+    uint64_t* output) {
+
+  assert(rows != 0);
+  assert(input != NULL);
+  assert(weight_widths != NULL);
+  assert(weights != NULL);
+  assert(output != NULL);
+
+  uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+  do {
+    size_t n = (size_t) *weight_widths++;
+    assert(n != 0);
+
+    for (;n >= 2; n -= 2) {
+      const uint32x2_t vi = vld1_u32(input); input += 2;
+      const uint16x4_t vw = vld1_u16(weights); weights += 4;
+      const uint32x4_t vw32 = vmovl_u16(vw);
+
+      weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+      weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
+    }
+
+    if XNN_UNPREDICTABLE(n != 0) {
+      const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+      const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+      const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+      weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+    }
+
+    vst1_u64(output, vget_low_u64(weight_accumulator));  output += 1;
+    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
+  } while (--rows != 0);
+}
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
new file mode 100644
index 000000000..77d88bea6
--- /dev/null
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -0,0 +1,65 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/filterbank.h>
+
+
+void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
+    size_t rows,
+    const uint32_t* input,
+    const uint8_t* weight_widths,
+    const uint16_t* weights,
+    uint64_t* output) {
+
+  assert(rows != 0);
+  assert(input != NULL);
+  assert(weight_widths != NULL);
+  assert(weights != NULL);
+  assert(output != NULL);
+
+  uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+  do {
+    size_t n = (size_t) *weight_widths++;
+    assert(n != 0);
+
+    $if BATCH_TILE > 1:
+      for (;n >= 2; n -= 2) {
+        const uint32x2_t vi = vld1_u32(input); input += 2;
+        const uint16x4_t vw = vld1_u16(weights); weights += 4;
+        const uint32x4_t vw32 = vmovl_u16(vw);
+
+        weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+        weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
+      }
+
+      if XNN_UNPREDICTABLE(n != 0) {
+        const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+        const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+        const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+        weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+      }
+    $else:
+      do {
+        const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+        const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+        const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+        weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+      } while (--n != 0);
+
+    vst1_u64(output, vget_low_u64(weight_accumulator));  output += 1;
+    weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
+  } while (--rows != 0);
+}
diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml
index a44f9d45c..54798504c 100644
--- a/test/u32-filterbank-accumulate.yaml
+++ b/test/u32-filterbank-accumulate.yaml
@@ -3,5 +3,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+
+# AArch32 assembly
+- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+
+# ARM NEON
+- name: xnn_u32_filterbank_accumulate_ukernel__neon_x1
+- name: xnn_u32_filterbank_accumulate_ukernel__neon_x2
+
 # Scalar
 - name: xnn_u32_filterbank_accumulate_ukernel__scalar_x1
author	Frank Barchard <fbarchard@google.com>	2022-08-24 13:57:31 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-08-24 13:58:08 -0700
commit	b1aad88bb522c36dce8a9ffe1f7f0d2b2743088b (patch)
tree	519dfd7013e4d0e19e5e9489d36e14629fab6802
parent	84e5af66f53cd9b2b57dff1ef3aacf2b988824ff (diff)
download	XNNPACK-b1aad88bb522c36dce8a9ffe1f7f0d2b2743088b.tar.gz