aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2022-08-23 11:08:41 -0700
committerXNNPACK Team <xnnpack-github-robot@google.com>2022-08-23 11:09:45 -0700
commit762038c5e1cd79b6d9529bae5a63d30640602df7 (patch)
tree3a92d75eed587d490d5a680be3d00431f01e7f10
parentef4d790c25cc054349c98407a48ffe38f0843a4f (diff)
downloadXNNPACK-762038c5e1cd79b6d9529bae5a63d30640602df7.tar.gz
Aarch32 filterbank-accumulate assembly
PiperOrigin-RevId: 469506037
-rw-r--r--BUILD.bazel6
-rwxr-xr-xCMakeLists.txt8
-rw-r--r--bench/u32-filterbank-accumulate.cc6
-rw-r--r--src/u32-filterbank-accumulate/aarch32-neon-x1.S58
-rw-r--r--src/u32-filterbank-accumulate/aarch32-neon-x2.S69
-rw-r--r--src/xnnpack/filterbank.h2
-rw-r--r--test/u32-filterbank-accumulate.cc64
-rw-r--r--test/u32-filterbank-accumulate.yaml4
-rwxr-xr-xtools/generate-filterbank-accumulate-test.py16
9 files changed, 201 insertions, 32 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 63feeb4ec..e0efc26b1 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -618,11 +618,11 @@ PROD_SCALAR_RISCV_MICROKERNEL_SRCS = [
]
ALL_SCALAR_MICROKERNEL_SRCS = [
- "src/cs16-bfly4/scalar-m1.c",
"src/cs16-bfly4/gen/scalar-x1.c",
"src/cs16-bfly4/gen/scalar-x2.c",
"src/cs16-bfly4/gen/scalar-x3.c",
"src/cs16-bfly4/gen/scalar-x4.c",
+ "src/cs16-bfly4/scalar-m1.c",
"src/cs16-fftr/gen/scalar-x1.c",
"src/cs16-fftr/gen/scalar-x2.c",
"src/cs16-fftr/gen/scalar-x3.c",
@@ -1050,8 +1050,8 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/math/sqrt-u32-scalar-clz-newton.c",
"src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c",
"src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c",
- "src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c",
"src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c",
+ "src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c",
"src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c",
"src/math/sqrt-u32-scalar-hashemian.c",
"src/math/sqrt-u32-scalar-tflm.c",
@@ -8309,6 +8309,8 @@ AARCH32_ASM_MICROKERNEL_SRCS = [
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
+ "src/u32-filterbank-accumulate/aarch32-neon-x1.S",
+ "src/u32-filterbank-accumulate/aarch32-neon-x2.S",
]
AARCH64_ASM_MICROKERNEL_SRCS = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 664c0e95e..2ff87f4ed 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -606,11 +606,11 @@ SET(PROD_SCALAR_RISCV_MICROKERNEL_SRCS
src/x32-zip/x4-scalar.c)
SET(ALL_SCALAR_MICROKERNEL_SRCS
- src/cs16-bfly4/scalar-m1.c
src/cs16-bfly4/gen/scalar-x1.c
src/cs16-bfly4/gen/scalar-x2.c
src/cs16-bfly4/gen/scalar-x3.c
src/cs16-bfly4/gen/scalar-x4.c
+ src/cs16-bfly4/scalar-m1.c
src/cs16-fftr/gen/scalar-x1.c
src/cs16-fftr/gen/scalar-x2.c
src/cs16-fftr/gen/scalar-x3.c
@@ -1038,8 +1038,8 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/math/sqrt-u32-scalar-clz-newton.c
src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c
src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c
- src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c
src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c
+ src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c
src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c
src/math/sqrt-u32-scalar-hashemian.c
src/math/sqrt-u32-scalar-tflm.c
@@ -6750,7 +6750,9 @@ SET(AARCH32_ASM_MICROKERNEL_SRCS
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
- src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S)
+ src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
+ src/u32-filterbank-accumulate/aarch32-neon-x1.S
+ src/u32-filterbank-accumulate/aarch32-neon-x2.S)
SET(AARCH64_ASM_MICROKERNEL_SRCS
src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc
index a5c0a9a52..ba064f945 100644
--- a/bench/u32-filterbank-accumulate.cc
+++ b/bench/u32-filterbank-accumulate.cc
@@ -53,7 +53,6 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
{
b->ArgNames({"rows", "batch"});
b->Args({1, 237});
-
b->Args({5, 1});
b->Args({10, 2});
b->Args({7, 3});
@@ -68,6 +67,11 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
b->Args({1, 13});
}
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
+#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
new file mode 100644
index 000000000..bd5ad932a
--- /dev/null
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
@@ -0,0 +1,58 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1(
+// size_t rows, r0
+// const uint32_t* input, r1
+// const uint8_t* weight_widths, r2
+// const uint16_t* weights, r3
+// uint64_t* output) sp -> r12
+
+// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
+
+// Register usage
+// input r1 d2
+// weights r3 d3 d4 d5
+// output r12 d0 d1
+
+// weight_widths r2 r4
+
+BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ LDR r12, [sp] // output (stack argument; read before PUSH moves sp)
+ PUSH {r4} // push 4 bytes
+ VMOV.U8 q0, #0 // weight_accumulator: d0 = lane 0 sum, d1 = lane 1 sum
+0:
+ LDRB r4, [r2], #1 // weight_widths: element count of this row (a count of 0 still runs one pass)
+
+1:
+ VLD1.32 {d3[]}, [r3]! // weights: 32 bits = two u16 values, duplicated across d3
+ VLD1.32 {d2[]}, [r1]! // input: one u32, duplicated to both lanes
+ SUBS r4, r4, #1 // flags consumed by BHI; the NEON ops in between do not touch them
+ VMOVL.U16 q2, d3 // widen u16 -> u32; d4 = {weights[0], weights[1]}
+ VMLAL.U32 q0, d4, d2 // d0 += weights[0] * input, d1 += weights[1] * input
+ BHI 1b // loop while elements remain in this row
+
+ VST1.64 {d0}, [r12]! // store the row's lane 0 sum to output
+ SUBS r0, r0, #1 // rows remaining; flags consumed by BNE
+ VMOV d0, d1 // the next row starts from this row's lane 1 sum
+ VMOV.U8 d1, #0 // restart lane 1 so the carry covers one row only, not all earlier rows
+ BNE 0b
+
+ POP {r4}
+ BX lr
+
+END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
new file mode 100644
index 000000000..5c8cc3a00
--- /dev/null
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
@@ -0,0 +1,69 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2(
+// size_t rows, r0
+// const uint32_t* input, r1
+// const uint8_t* weight_widths, r2
+// const uint16_t* weights, r3
+// uint64_t* output) sp -> r12
+
+// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
+
+// Register usage
+// input r1 d2
+// weights r3 d3 d4 d5
+// output r12 d0 d1
+
+// weight_widths r2 r4
+
+BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ LDR r12, [sp] // output (stack argument; read before PUSH moves sp)
+ PUSH {r4} // push 4 bytes
+ VMOV.U8 q0, #0 // weight_accumulator: d0 = lane 0 sum, d1 = lane 1 sum
+0:
+ LDRB r4, [r2], #1 // weight_widths: element count of this row
+ SUBS r4, r4, #1 // r4 = count - 1
+ BLS 2f // less than 2 weights?
+
+1:
+ VLD1.16 {d3}, [r3]! // weights: four u16 values covering two input elements
+ VLD1.32 {d2}, [r1]! // input: two u32 values
+ SUBS r4, r4, #2 // flags consumed by BHI/BLO; the NEON ops in between do not touch them
+ VMOVL.U16 q2, d3 // widen u16 -> u32; d4 = first weight pair, d5 = second weight pair
+ VMLAL.U32 q0, d4, d2[0] // accumulate first pair * input[0]
+ VMLAL.U32 q0, d5, d2[1] // accumulate second pair * input[1]
+ BHI 1b // at least 2 elements remain
+
+ BLO 3f // is there a remainder?
+2:
+ VLD1.32 {d3[]}, [r3]! // weights: last two u16 values, duplicated across d3
+ VLD1.32 {d2[]}, [r1]! // input: last u32, duplicated to both lanes
+ VMOVL.U16 q2, d3 // widen u16 -> u32; d4 = {weights[0], weights[1]}
+ VMLAL.U32 q0, d4, d2 // d0 += weights[0] * input, d1 += weights[1] * input
+
+3:
+ VST1.64 {d0}, [r12]! // store the row's lane 0 sum to output
+ SUBS r0, r0, #1 // rows remaining; flags consumed by BNE
+ VMOV d0, d1 // the next row starts from this row's lane 1 sum
+ VMOV.U8 d1, #0 // restart lane 1 so the carry covers one row only, not all earlier rows
+ BNE 0b
+
+ POP {r4}
+ BX lr
+
+END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/xnnpack/filterbank.h b/src/xnnpack/filterbank.h
index 90ce94051..f6d7a59ef 100644
--- a/src/xnnpack/filterbank.h
+++ b/src/xnnpack/filterbank.h
@@ -23,6 +23,8 @@ extern "C" {
const uint16_t* weights, \
uint64_t* output);
+DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1)
+DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2)
DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x1)
DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x2)
DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__scalar_x1)
diff --git a/test/u32-filterbank-accumulate.cc b/test/u32-filterbank-accumulate.cc
index a1c0fd5d8..4929af500 100644
--- a/test/u32-filterbank-accumulate.cc
+++ b/test/u32-filterbank-accumulate.cc
@@ -17,6 +17,40 @@
#include "filterbank-accumulate-microkernel-tester.h"
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY  // assembly kernel: the symbol exists only when assembly sources are built
+ TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X1, rows_eq_1) {
+ TEST_REQUIRES_ARM_NEON;
+ FilterbankAccumulateMicrokernelTester()
+ .rows(1)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1);
+ }
+
+ TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X1, rows_eq_2) {
+ TEST_REQUIRES_ARM_NEON;
+ FilterbankAccumulateMicrokernelTester()
+ .rows(2)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1);
+ }
+#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY  // assembly kernel: the symbol exists only when assembly sources are built
+ TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X2, rows_eq_1) {
+ TEST_REQUIRES_ARM_NEON;
+ FilterbankAccumulateMicrokernelTester()
+ .rows(1)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2);
+ }
+
+ TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X2, rows_eq_2) {
+ TEST_REQUIRES_ARM_NEON;
+ FilterbankAccumulateMicrokernelTester()
+ .rows(2)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2);
+ }
+#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_eq_1) {
TEST_REQUIRES_ARM_NEON;
@@ -25,13 +59,11 @@
.Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1);
}
- TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_gt_1) {
+ TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_eq_2) {
TEST_REQUIRES_ARM_NEON;
- for (size_t rows = 2; rows <= 10; rows++) {
- FilterbankAccumulateMicrokernelTester()
- .rows(2)
- .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1);
- }
+ FilterbankAccumulateMicrokernelTester()
+ .rows(2)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1);
}
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
@@ -44,13 +76,11 @@
.Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
}
- TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_gt_1) {
+ TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_eq_2) {
TEST_REQUIRES_ARM_NEON;
- for (size_t rows = 2; rows <= 10; rows++) {
- FilterbankAccumulateMicrokernelTester()
- .rows(2)
- .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
- }
+ FilterbankAccumulateMicrokernelTester()
+ .rows(2)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2);
}
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
@@ -61,10 +91,8 @@ TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_eq_1) {
.Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1);
}
-TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_gt_1) {
- for (size_t rows = 2; rows <= 10; rows++) {
- FilterbankAccumulateMicrokernelTester()
- .rows(2)
- .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1);
- }
+TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_eq_2) {
+ FilterbankAccumulateMicrokernelTester()
+ .rows(2)
+ .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1);
}
diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml
index c5cd045be..54798504c 100644
--- a/test/u32-filterbank-accumulate.yaml
+++ b/test/u32-filterbank-accumulate.yaml
@@ -4,6 +4,10 @@
# LICENSE file in the root directory of this source tree.
+# AArch32 assembly
+- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
+- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
+
# ARM NEON
- name: xnn_u32_filterbank_accumulate_ukernel__neon_x1
- name: xnn_u32_filterbank_accumulate_ukernel__neon_x2
diff --git a/tools/generate-filterbank-accumulate-test.py b/tools/generate-filterbank-accumulate-test.py
index 2a06a9396..2139a897e 100755
--- a/tools/generate-filterbank-accumulate-test.py
+++ b/tools/generate-filterbank-accumulate-test.py
@@ -27,10 +27,12 @@ parser.set_defaults(defines=list())
def split_ukernel_name(name):
- match = re.fullmatch(r"xnn_u32_filterbank_accumulate_ukernel__(.+)_x(\d+)", name)
+ match = re.fullmatch(r"xnn_u32_filterbank_accumulate_ukernel__(.+?)(_x(\d+))?", name)  # non-greedy target: a greedy (.+) swallows the optional _xN suffix, leaving group(3) always None
assert match is not None
row_tile = 1
- batch_tile = int(match.group(2))
+ batch_tile = 1
+ if match.group(3):
+ batch_tile = int(match.group(3))
arch, isa = xnncommon.parse_target_name(target_name=match.group(1))
return row_tile, batch_tile, arch, isa
@@ -45,14 +47,12 @@ TEST(${TEST_NAME}, rows_eq_1) {
.Test(${", ".join(TEST_ARGS)});
}
-TEST(${TEST_NAME}, rows_gt_1) {
+TEST(${TEST_NAME}, rows_eq_2) {
$if ISA_CHECK:
${ISA_CHECK};
- for (size_t rows = 2; rows <= 10; rows++) {
- FilterbankAccumulateMicrokernelTester()
- .rows(2)
- .Test(${", ".join(TEST_ARGS)});
- }
+ FilterbankAccumulateMicrokernelTester()
+ .rows(2)
+ .Test(${", ".join(TEST_ARGS)});
}