aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Frank Barchard <fbarchard@google.com> 2022-08-29 21:12:19 -0700
committer XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-29 21:13:15 -0700
commit daa7762ba75b09cb0b89ed19f66114c13e2cb832 (patch)
tree 56e5d30ef43e5aede8e670bb1d15f489bbd07ef8
parent bb836fb415197caf8feed54bc7bb81d003608d2c (diff)
download XNNPACK-daa7762ba75b09cb0b89ed19f66114c13e2cb832.tar.gz
Filterbank Accumulate in ARM assembly
PiperOrigin-RevId: 470890302
-rw-r--r-- BUILD.bazel 1
-rwxr-xr-x CMakeLists.txt 1
-rw-r--r-- bench/u32-filterbank-accumulate.cc 5
-rw-r--r-- src/u32-filterbank-accumulate/aarch32-arm-x1.S 63
-rw-r--r-- src/xnnpack/filterbank.h 1
-rw-r--r-- test/u32-filterbank-accumulate.cc 17
-rw-r--r-- test/u32-filterbank-accumulate.yaml 1
7 files changed, 87 insertions, 2 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index d4ab826d9..dacf9d300 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -8310,6 +8310,7 @@ AARCH32_ASM_MICROKERNEL_SRCS = [
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
+ "src/u32-filterbank-accumulate/aarch32-arm-x1.S",
"src/u32-filterbank-accumulate/aarch32-neon-x1.S",
"src/u32-filterbank-accumulate/aarch32-neon-x2.S",
]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f11ac9480..9ceec5cbc 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6752,6 +6752,7 @@ SET(AARCH32_ASM_MICROKERNEL_SRCS
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
+ src/u32-filterbank-accumulate/aarch32-arm-x1.S
src/u32-filterbank-accumulate/aarch32-neon-x1.S
src/u32-filterbank-accumulate/aarch32-neon-x2.S)
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc
index ba064f945..aa1133dd1 100644
--- a/bench/u32-filterbank-accumulate.cc
+++ b/bench/u32-filterbank-accumulate.cc
@@ -68,8 +68,9 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
}
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
-BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
-BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_arm_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/src/u32-filterbank-accumulate/aarch32-arm-x1.S b/src/u32-filterbank-accumulate/aarch32-arm-x1.S
new file mode 100644
index 000000000..ba349bca0
--- /dev/null
+++ b/src/u32-filterbank-accumulate/aarch32-arm-x1.S
@@ -0,0 +1,63 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1(
+// size_t rows, r0
+// const uint32_t* input, r1
+// const uint8_t* weight_widths, r2
+// const uint16_t* weights, r3
+// uint64_t* output) sp -> r12
+
+// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
+
+// Register usage (pointer register first, then working registers)
+// input r1 r6
+// weights r3 r5 r7
+// weight_accumulator r12 (output pointer) r8 r9
+// unweight_accumulator r10 r11
+// weight_widths r2 r4
+
+BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1
+ .arm // assemble as A32 (ARM) instructions; scalar integer kernel, one weight pair per inner iteration
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon // NOTE(review): no NEON/VFP instructions are used below; directive looks unnecessary - confirm
+#endif
+ LDR r12, [sp] // r12 = output (5th argument, first stack word); read before PUSH moves sp
+ PUSH {r4,r5,r6,r7,r8,r9,r10,r11} // save callee-saved r4-r11 (8 registers = 32 bytes)
+ MOV r8, 0 // weight_accumulator low word = 0
+ MOV r9, 0 // weight_accumulator high word = 0
+0: // outer loop: one iteration per output row
+ LDRB r4, [r2], #1 // r4 = *weight_widths++, used as the inner-loop count for this row
+ MOV r10, 0 // unweight_accumulator low word = 0
+ MOV r11, 0 // unweight_accumulator high word = 0
+1: // inner loop: one input and one weight/unweight pair per iteration
+ LDR r5, [r3], #4 // r5 = 32-bit word holding weight (low 16 bits) and unweight (high 16 bits); weights += 4 bytes
+ LDR r6, [r1], #4 // r6 = *input++
+ SUBS r4, r4, #1 // decrement remaining count; flags are consumed by BHI (UXTH/UMLAL leave them untouched)
+ UXTH r7, r5 // r7 = weight = low halfword of r5, zero-extended
+ UXTH r5, r5, ror #16 // r5 = unweight = high halfword of r5, zero-extended
+ UMLAL r8, r9, r6, r7 // r9:r8 += input * weight (unsigned 32x32 -> 64-bit accumulate)
+ UMLAL r10, r11, r6, r5 // r11:r10 += input * unweight
+ BHI 1b // repeat while the count was > 1 before the decrement; a count of 0 still runs the body once
+
+ STMIA r12!, {r8, r9} // store 64-bit weight_accumulator (r8 at lower address) and advance output by 8 bytes
+ SUBS r0, r0, #1 // rows--; flags are consumed by BNE (the MOVs below do not set flags)
+ MOV r8, r10 // next row's weight_accumulator starts from this row's unweight_accumulator (low word)
+ MOV r9, r11 // ... high word
+ BNE 0b // loop until rows reaches 0; presumably callers guarantee rows != 0 (0 would wrap) - confirm
+
+ POP {r4,r5,r6,r7,r8,r9,r10,r11} // restore callee-saved registers
+ BX lr // return to caller
+
+END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/xnnpack/filterbank.h b/src/xnnpack/filterbank.h
index f6d7a59ef..dae11b92a 100644
--- a/src/xnnpack/filterbank.h
+++ b/src/xnnpack/filterbank.h
@@ -23,6 +23,7 @@ extern "C" {
const uint16_t* weights, \
uint64_t* output);
+DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1)
DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1)
DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2)
DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x1)
diff --git a/test/u32-filterbank-accumulate.cc b/test/u32-filterbank-accumulate.cc
index 0669af136..50bfca124 100644
--- a/test/u32-filterbank-accumulate.cc
+++ b/test/u32-filterbank-accumulate.cc
@@ -18,6 +18,23 @@
#if XNN_ARCH_ARM
+ TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_ARM_X1, rows_eq_1) {  // single-row case; no ISA requirement macro is used here
+ FilterbankAccumulateMicrokernelTester()
+ .rows(1)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1);
+ }
+
+ TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_ARM_X1, rows_gt_1) {  // multi-row cases
+ for (size_t rows = 2; rows < 10; rows++) {  // rows = 2..9 inclusive
+ FilterbankAccumulateMicrokernelTester()
+ .rows(rows)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1);
+ }
+ }
+#endif // XNN_ARCH_ARM
+
+
+#if XNN_ARCH_ARM
TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X1, rows_eq_1) {
TEST_REQUIRES_ARM_NEON;
FilterbankAccumulateMicrokernelTester()
diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml
index 54798504c..332cbd350 100644
--- a/test/u32-filterbank-accumulate.yaml
+++ b/test/u32-filterbank-accumulate.yaml
@@ -5,6 +5,7 @@
# AArch32 assembly
+- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1
- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2