diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-29 21:12:19 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-29 21:13:15 -0700 |
commit | daa7762ba75b09cb0b89ed19f66114c13e2cb832 (patch) | |
tree | 56e5d30ef43e5aede8e670bb1d15f489bbd07ef8 | |
parent | bb836fb415197caf8feed54bc7bb81d003608d2c (diff) | |
download | XNNPACK-daa7762ba75b09cb0b89ed19f66114c13e2cb832.tar.gz |
Filterbank Accumulate in ARM assembly
PiperOrigin-RevId: 470890302
-rw-r--r-- | BUILD.bazel | 1 | ||||
-rwxr-xr-x | CMakeLists.txt | 1 | ||||
-rw-r--r-- | bench/u32-filterbank-accumulate.cc | 5 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/aarch32-arm-x1.S | 63 | ||||
-rw-r--r-- | src/xnnpack/filterbank.h | 1 | ||||
-rw-r--r-- | test/u32-filterbank-accumulate.cc | 17 | ||||
-rw-r--r-- | test/u32-filterbank-accumulate.yaml | 1 |
7 files changed, 87 insertions, 2 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index d4ab826d9..dacf9d300 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -8310,6 +8310,7 @@ AARCH32_ASM_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", + "src/u32-filterbank-accumulate/aarch32-arm-x1.S", "src/u32-filterbank-accumulate/aarch32-neon-x1.S", "src/u32-filterbank-accumulate/aarch32-neon-x2.S", ] diff --git a/CMakeLists.txt b/CMakeLists.txt index f11ac9480..9ceec5cbc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6752,6 +6752,7 @@ SET(AARCH32_ASM_MICROKERNEL_SRCS src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S + src/u32-filterbank-accumulate/aarch32-arm-x1.S src/u32-filterbank-accumulate/aarch32-neon-x1.S src/u32-filterbank-accumulate/aarch32-neon-x2.S) diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc index ba064f945..aa1133dd1 100644 --- a/bench/u32-filterbank-accumulate.cc +++ b/bench/u32-filterbank-accumulate.cc @@ -68,8 +68,9 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) } #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY -BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); -BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_arm_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); 
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY #if XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/src/u32-filterbank-accumulate/aarch32-arm-x1.S b/src/u32-filterbank-accumulate/aarch32-arm-x1.S new file mode 100644 index 000000000..ba349bca0 --- /dev/null +++ b/src/u32-filterbank-accumulate/aarch32-arm-x1.S @@ -0,0 +1,63 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <xnnpack/assembly.h> + +.syntax unified + +// void xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1( +// size_t rows, r0 +// const uint32_t* input, r1 +// const uint8_t* weight_widths, r2 +// const uint16_t* weights, r3 +// uint64_t* output) sp -> r12 + +// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. 
+ +// Register usage +// input r1 r6 +// weights r3 r5 r7 +// weight_accumulator r12 r8 r9 +// unweight_accumulator r10 r11 +// weight_widths r2 r4 + +BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1 + .arm +#ifndef __APPLE__ + .arch armv7-a + .fpu neon +#endif + LDR r12, [sp] // output + PUSH {r4,r5,r6,r7,r8,r9,r10,r11} // push 32 bytes + MOV r8, 0 // weight_accumulator + MOV r9, 0 +0: + LDRB r4, [r2], #1 // weight_widths + MOV r10, 0 // unweight_accumulator + MOV r11, 0 +1: + LDR r5, [r3], #4 // weight+unweight + LDR r6, [r1], #4 // input + SUBS r4, r4, #1 + UXTH r7, r5 // weight + UXTH r5, r5, ror #16 // unweight + UMLAL r8, r9, r6, r7 // weight_accumulator + UMLAL r10, r11, r6, r5 // unweight_accumulator + BHI 1b + + STMIA r12!, {r8, r9} + SUBS r0, r0, #1 + MOV r8, r10 // weight_accumulator = unweight_accumulator + MOV r9, r11 + BNE 0b + + POP {r4,r5,r6,r7,r8,r9,r10,r11} + BX lr + +END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1 + +#ifdef __ELF__ +.section ".note.GNU-stack","",%progbits +#endif diff --git a/src/xnnpack/filterbank.h b/src/xnnpack/filterbank.h index f6d7a59ef..dae11b92a 100644 --- a/src/xnnpack/filterbank.h +++ b/src/xnnpack/filterbank.h @@ -23,6 +23,7 @@ extern "C" { const uint16_t* weights, \ uint64_t* output); +DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1) DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1) DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2) DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x1) diff --git a/test/u32-filterbank-accumulate.cc b/test/u32-filterbank-accumulate.cc index 0669af136..50bfca124 100644 --- a/test/u32-filterbank-accumulate.cc +++ b/test/u32-filterbank-accumulate.cc @@ -18,6 +18,23 @@ #if XNN_ARCH_ARM + TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_ARM_X1, 
rows_eq_1) { + FilterbankAccumulateMicrokernelTester() + .rows(1) + .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1); + } + + TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_ARM_X1, rows_gt_1) { + for (size_t rows = 2; rows < 10; rows++) { + FilterbankAccumulateMicrokernelTester() + .rows(rows) + .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1); + } + } +#endif // XNN_ARCH_ARM + + +#if XNN_ARCH_ARM TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X1, rows_eq_1) { TEST_REQUIRES_ARM_NEON; FilterbankAccumulateMicrokernelTester() diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml index 54798504c..332cbd350 100644 --- a/test/u32-filterbank-accumulate.yaml +++ b/test/u32-filterbank-accumulate.yaml @@ -5,6 +5,7 @@ # AArch32 assembly +- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1 - name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 - name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 |