aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2022-08-24 13:57:31 -0700
committerXNNPACK Team <xnnpack-github-robot@google.com>2022-08-24 13:58:08 -0700
commitb1aad88bb522c36dce8a9ffe1f7f0d2b2743088b (patch)
tree519dfd7013e4d0e19e5e9489d36e14629fab6802
parent84e5af66f53cd9b2b57dff1ef3aacf2b988824ff (diff)
downloadXNNPACK-b1aad88bb522c36dce8a9ffe1f7f0d2b2743088b.tar.gz
FILTERBANK-ACCUMULATE Neon microkernels with unweights accumulator set to 0
PiperOrigin-RevId: 469816851
-rw-r--r--BUILD.bazel4
-rwxr-xr-xCMakeLists.txt6
-rw-r--r--bench/u32-filterbank-accumulate.cc10
-rwxr-xr-xscripts/generate-u32-filterbank-accumulate.sh4
-rw-r--r--src/u32-filterbank-accumulate/aarch32-neon-x1.S57
-rw-r--r--src/u32-filterbank-accumulate/aarch32-neon-x2.S69
-rw-r--r--src/u32-filterbank-accumulate/gen/neon-x1.c51
-rw-r--r--src/u32-filterbank-accumulate/gen/neon-x2.c60
-rw-r--r--src/u32-filterbank-accumulate/neon.c.in65
-rw-r--r--test/u32-filterbank-accumulate.yaml9
10 files changed, 334 insertions, 1 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index ec2005453..4c45afc0f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4214,6 +4214,8 @@ ALL_NEON_MICROKERNEL_SRCS = [
"src/u8-maxpool/9p8x-minmax-neon-c16.c",
"src/u8-rmax/neon.c",
"src/u8-vclamp/neon-x64.c",
+ "src/u32-filterbank-accumulate/gen/neon-x1.c",
+ "src/u32-filterbank-accumulate/gen/neon-x2.c",
"src/xx-fill/neon-x64.c",
"src/xx-pad/neon.c",
"src/x8-transposec/gen/8x8-multi-dec-zip-neon.c",
@@ -8307,6 +8309,8 @@ AARCH32_ASM_MICROKERNEL_SRCS = [
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
+ "src/u32-filterbank-accumulate/aarch32-neon-x1.S",
+ "src/u32-filterbank-accumulate/aarch32-neon-x2.S",
]
AARCH64_ASM_MICROKERNEL_SRCS = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ecc493cb..bb154ca33 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2708,6 +2708,8 @@ SET(ALL_NEON_MICROKERNEL_SRCS
src/u8-maxpool/9p8x-minmax-neon-c16.c
src/u8-rmax/neon.c
src/u8-vclamp/neon-x64.c
+ src/u32-filterbank-accumulate/gen/neon-x1.c
+ src/u32-filterbank-accumulate/gen/neon-x2.c
src/xx-fill/neon-x64.c
src/xx-pad/neon.c
src/x8-transposec/gen/8x8-multi-dec-zip-neon.c
@@ -6748,7 +6750,9 @@ SET(AARCH32_ASM_MICROKERNEL_SRCS
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
- src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S)
+ src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
+ src/u32-filterbank-accumulate/aarch32-neon-x1.S
+ src/u32-filterbank-accumulate/aarch32-neon-x2.S)
SET(AARCH64_ASM_MICROKERNEL_SRCS
src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc
index 948998339..ba064f945 100644
--- a/bench/u32-filterbank-accumulate.cc
+++ b/bench/u32-filterbank-accumulate.cc
@@ -67,6 +67,16 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
b->Args({1, 13});
}
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
BENCHMARK_CAPTURE(filterbank_accumulate, u32_scalar_x1, xnn_u32_filterbank_accumulate_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
#ifndef XNNPACK_BENCHMARK_NO_MAIN
diff --git a/scripts/generate-u32-filterbank-accumulate.sh b/scripts/generate-u32-filterbank-accumulate.sh
index f2dee01d0..36f44f266 100755
--- a/scripts/generate-u32-filterbank-accumulate.sh
+++ b/scripts/generate-u32-filterbank-accumulate.sh
@@ -7,6 +7,10 @@
################################### SCALAR ###################################
tools/xngen src/u32-filterbank-accumulate/scalar.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/scalar-x1.c &
+################################### NEON ###################################
+tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/neon-x1.c &
+tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=2 -o src/u32-filterbank-accumulate/gen/neon-x2.c &
+
################################## Unit tests #################################
tools/generate-filterbank-accumulate-test.py --spec test/u32-filterbank-accumulate.yaml --output test/u32-filterbank-accumulate.cc &
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
new file mode 100644
index 000000000..55011c80d
--- /dev/null
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
@@ -0,0 +1,57 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1(
+// size_t rows, r0
+// const uint32_t* input, r1
+// const uint8_t* weight_widths, r2
+// const uint16_t* weights, r3
+// uint64_t* output) sp -> r12
+
+// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
+
+// Register usage
+// input r1 d2
+// weights r3 d3 d4 d5
+// output r12 d0 d1
+
+// weight_widths r2 r4
+
+BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ LDR r12, [sp] // output
+ PUSH {r4,lr} // push 8 bytes
+ VMOV.U8 d0, #0 // weight_accumulator
+0:
+ LDRB r4, [r2], #1 // weight_widths
+ VMOV.U8 d1, #0 // unweight_accumulator
+1:
+ VLD1.32 {d3[]}, [r3]! // weights
+ VLD1.32 {d2[]}, [r1]! // input
+ SUBS r4, r4, #1
+ VMOVL.U16 q2, d3
+ VMLAL.U32 q0, d4, d2
+ BHI 1b
+
+ VST1.64 {d0}, [r12]!
+ SUBS r0, r0, #1
+ VMOV d0, d1
+ BNE 0b
+
+ POP {r4,pc}
+
+END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
new file mode 100644
index 000000000..840d7c9f3
--- /dev/null
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
@@ -0,0 +1,69 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2(
+// size_t rows, r0
+// const uint32_t* input, r1
+// const uint8_t* weight_widths, r2
+// const uint16_t* weights, r3
+// uint64_t* output) sp -> r12
+
+// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
+
+// Register usage
+// input r1 d2
+// weights r3 d3 d4 d5
+// output r12 d0 d1
+
+// weight_widths r2 r4
+
+BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ LDR r12, [sp] // output
+ PUSH {r4,lr} // push 8 bytes
+ VMOV.U8 d0, #0 // weight_accumulator
+0:
+ LDRB r4, [r2], #1 // weight_widths
+ SUBS r4, r4, #1
+ VMOV.U8 d1, #0 // unweight_accumulator
+ BLS 2f // less than 2 weights?
+
+1:
+ VLD1.16 {d3}, [r3]! // weights
+ VLD1.32 {d2}, [r1]! // input
+ SUBS r4, r4, #2
+ VMOVL.U16 q2, d3
+ VMLAL.U32 q0, d4, d2[0]
+ VMLAL.U32 q0, d5, d2[1]
+ BHI 1b
+
+ BLO 3f // is there a remainder?
+2:
+ VLD1.32 {d3[]}, [r3]! // weights
+ VLD1.32 {d2[]}, [r1]! // input
+ VMOVL.U16 q2, d3
+ VMLAL.U32 q0, d4, d2
+
+3:
+ VST1.64 {d0}, [r12]!
+ SUBS r0, r0, #1
+ VMOV d0, d1
+ BNE 0b
+
+ POP {r4,pc}
+
+END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
new file mode 100644
index 000000000..0a59096c4
--- /dev/null
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -0,0 +1,51 @@
+// Auto-generated file. Do not edit!
+// Template: src/u32-filterbank-accumulate/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/filterbank.h>
+
+
+void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
+ size_t rows,
+ const uint32_t* input,
+ const uint8_t* weight_widths,
+ const uint16_t* weights,
+ uint64_t* output) {
+
+ assert(rows != 0);
+ assert(input != NULL);
+ assert(weight_widths != NULL);
+ assert(weights != NULL);
+ assert(output != NULL);
+
+ uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+ do {
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+
+ do {
+ const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+ const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+ const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+ weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+ } while (--n != 0);
+
+ vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1;
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
+ } while (--rows != 0);
+}
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
new file mode 100644
index 000000000..743244a83
--- /dev/null
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -0,0 +1,60 @@
+// Auto-generated file. Do not edit!
+// Template: src/u32-filterbank-accumulate/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/filterbank.h>
+
+
+void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
+ size_t rows,
+ const uint32_t* input,
+ const uint8_t* weight_widths,
+ const uint16_t* weights,
+ uint64_t* output) {
+
+ assert(rows != 0);
+ assert(input != NULL);
+ assert(weight_widths != NULL);
+ assert(weights != NULL);
+ assert(output != NULL);
+
+ uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+ do {
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+
+ for (;n >= 2; n -= 2) {
+ const uint32x2_t vi = vld1_u32(input); input += 2;
+ const uint16x4_t vw = vld1_u16(weights); weights += 4;
+ const uint32x4_t vw32 = vmovl_u16(vw);
+
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
+ }
+
+ if XNN_UNPREDICTABLE(n != 0) {
+ const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+ const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+ const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+ weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+ }
+
+ vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1;
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
+ } while (--rows != 0);
+}
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
new file mode 100644
index 000000000..77d88bea6
--- /dev/null
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -0,0 +1,65 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/filterbank.h>
+
+
+void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
+ size_t rows,
+ const uint32_t* input,
+ const uint8_t* weight_widths,
+ const uint16_t* weights,
+ uint64_t* output) {
+
+ assert(rows != 0);
+ assert(input != NULL);
+ assert(weight_widths != NULL);
+ assert(weights != NULL);
+ assert(output != NULL);
+
+ uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+ do {
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+
+ $if BATCH_TILE > 1:
+ for (;n >= 2; n -= 2) {
+ const uint32x2_t vi = vld1_u32(input); input += 2;
+ const uint16x4_t vw = vld1_u16(weights); weights += 4;
+ const uint32x4_t vw32 = vmovl_u16(vw);
+
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0);
+ weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1);
+ }
+
+ if XNN_UNPREDICTABLE(n != 0) {
+ const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+ const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+ const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+ weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+ }
+ $else:
+ do {
+ const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+ const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+ const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+ weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+ } while (--n != 0);
+
+ vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1;
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
+ } while (--rows != 0);
+}
diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml
index a44f9d45c..54798504c 100644
--- a/test/u32-filterbank-accumulate.yaml
+++ b/test/u32-filterbank-accumulate.yaml
@@ -3,5 +3,14 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+
+# AArch32 assembly
+- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+
+# ARM NEON
+- name: xnn_u32_filterbank_accumulate_ukernel__neon_x1
+- name: xnn_u32_filterbank_accumulate_ukernel__neon_x2
+
# Scalar
- name: xnn_u32_filterbank_accumulate_ukernel__scalar_x1