diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-24 13:57:31 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-24 13:58:08 -0700 |
commit | b1aad88bb522c36dce8a9ffe1f7f0d2b2743088b (patch) | |
tree | 519dfd7013e4d0e19e5e9489d36e14629fab6802 | |
parent | 84e5af66f53cd9b2b57dff1ef3aacf2b988824ff (diff) | |
download | XNNPACK-b1aad88bb522c36dce8a9ffe1f7f0d2b2743088b.tar.gz |
FILTERBANK-ACCUMULATE Neon microkernels with unweights accumulator set to 0
PiperOrigin-RevId: 469816851
-rw-r--r-- | BUILD.bazel | 4 | ||||
-rwxr-xr-x | CMakeLists.txt | 6 | ||||
-rw-r--r-- | bench/u32-filterbank-accumulate.cc | 10 | ||||
-rwxr-xr-x | scripts/generate-u32-filterbank-accumulate.sh | 4 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/aarch32-neon-x1.S | 57 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/aarch32-neon-x2.S | 69 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/gen/neon-x1.c | 51 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/gen/neon-x2.c | 60 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/neon.c.in | 65 | ||||
-rw-r--r-- | test/u32-filterbank-accumulate.yaml | 9 |
10 files changed, 334 insertions, 1 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index ec2005453..4c45afc0f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -4214,6 +4214,8 @@ ALL_NEON_MICROKERNEL_SRCS = [ "src/u8-maxpool/9p8x-minmax-neon-c16.c", "src/u8-rmax/neon.c", "src/u8-vclamp/neon-x64.c", + "src/u32-filterbank-accumulate/gen/neon-x1.c", + "src/u32-filterbank-accumulate/gen/neon-x2.c", "src/xx-fill/neon-x64.c", "src/xx-pad/neon.c", "src/x8-transposec/gen/8x8-multi-dec-zip-neon.c", @@ -8307,6 +8309,8 @@ AARCH32_ASM_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", + "src/u32-filterbank-accumulate/aarch32-neon-x1.S", + "src/u32-filterbank-accumulate/aarch32-neon-x2.S", ] AARCH64_ASM_MICROKERNEL_SRCS = [ diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ecc493cb..bb154ca33 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2708,6 +2708,8 @@ SET(ALL_NEON_MICROKERNEL_SRCS src/u8-maxpool/9p8x-minmax-neon-c16.c src/u8-rmax/neon.c src/u8-vclamp/neon-x64.c + src/u32-filterbank-accumulate/gen/neon-x1.c + src/u32-filterbank-accumulate/gen/neon-x2.c src/xx-fill/neon-x64.c src/xx-pad/neon.c src/x8-transposec/gen/8x8-multi-dec-zip-neon.c @@ -6748,7 +6750,9 @@ SET(AARCH32_ASM_MICROKERNEL_SRCS src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S - src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S) + src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S + src/u32-filterbank-accumulate/aarch32-neon-x1.S + src/u32-filterbank-accumulate/aarch32-neon-x2.S) SET(AARCH64_ASM_MICROKERNEL_SRCS src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc index 948998339..ba064f945 100644 --- a/bench/u32-filterbank-accumulate.cc +++ b/bench/u32-filterbank-accumulate.cc @@ -67,6 +67,16 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) b->Args({1, 13}); } +#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY +BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + BENCHMARK_CAPTURE(filterbank_accumulate, u32_scalar_x1, xnn_u32_filterbank_accumulate_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/scripts/generate-u32-filterbank-accumulate.sh b/scripts/generate-u32-filterbank-accumulate.sh index f2dee01d0..36f44f266 100755 --- a/scripts/generate-u32-filterbank-accumulate.sh +++ b/scripts/generate-u32-filterbank-accumulate.sh @@ -7,6 +7,10 @@ ################################### SCALAR ################################### tools/xngen src/u32-filterbank-accumulate/scalar.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/scalar-x1.c & +################################### NEON ################################### +tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=1 -o src/u32-filterbank-accumulate/gen/neon-x1.c & +tools/xngen src/u32-filterbank-accumulate/neon.c.in -D BATCH_TILE=2 -o src/u32-filterbank-accumulate/gen/neon-x2.c & + ################################## Unit tests ################################# tools/generate-filterbank-accumulate-test.py --spec test/u32-filterbank-accumulate.yaml --output test/u32-filterbank-accumulate.cc & diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S new file mode 100644 index 000000000..55011c80d --- /dev/null +++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S @@ -0,0 +1,57 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <xnnpack/assembly.h> + +.syntax unified + +// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1( +// size_t rows, r0 +// const uint32_t* input, r1 +// const uint8_t* weight_widths, r2 +// const uint16_t* weights, r3 +// uint64_t* output) sp -> r12 + +// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. + +// Register usage +// input r1 d2 +// weights r3 d3 d4 d5 +// output r12 d0 d1 + +// weight_widths r2 r4 + +BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 + .arm +#ifndef __APPLE__ + .arch armv7-a + .fpu neon +#endif + LDR r12, [sp] // output + PUSH {r4,lr} // push 8 bytes + VMOV.U8 d0, #0 // weight_accumulator +0: + LDRB r4, [r2], #1 // weight_widths + VMOV.U8 d1, #0 // unweight_accumulator +1: + VLD1.32 {d3[]}, [r3]! // weights + VLD1.32 {d2[]}, [r1]! // input + SUBS r4, r4, #1 + VMOVL.U16 q2, d3 + VMLAL.U32 q0, d4, d2 + BHI 1b + + VST1.64 {d0}, [r12]! + SUBS r0, r0, #1 + VMOV d0, d1 + BNE 0b + + POP {r4,pc} + +END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 + +#ifdef __ELF__ +.section ".note.GNU-stack","",%progbits +#endif diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S new file mode 100644 index 000000000..840d7c9f3 --- /dev/null +++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S @@ -0,0 +1,69 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <xnnpack/assembly.h> + +.syntax unified + +// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2( +// size_t rows, r0 +// const uint32_t* input, r1 +// const uint8_t* weight_widths, r2 +// const uint16_t* weights, r3 +// uint64_t* output) sp -> r12 + +// d8-d15, r12-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. + +// Register usage +// input r1 d2 +// weights r3 d3 d4 d5 +// output r12 d0 d1 + +// weight_widths r2 r4 + +BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 + .arm +#ifndef __APPLE__ + .arch armv7-a + .fpu neon +#endif + LDR r12, [sp] // output + PUSH {r4,lr} // push 8 bytes + VMOV.U8 d0, #0 // weight_accumulator +0: + LDRB r4, [r2], #1 // weight_widths + SUBS r4, r4, #1 + VMOV.U8 d1, #0 // unweight_accumulator + BLS 2f // less than 2 weights? + +1: + VLD1.16 {d3}, [r3]! // weights + VLD1.32 {d2}, [r1]! // input + SUBS r4, r4, #2 + VMOVL.U16 q2, d3 + VMLAL.U32 q0, d4, d2[0] + VMLAL.U32 q0, d5, d2[1] + BHI 1b + + BLO 3f // is there a remainder? +2: + VLD1.32 {d3[]}, [r3]! // weights + VLD1.32 {d2[]}, [r1]! // input + VMOVL.U16 q2, d3 + VMLAL.U32 q0, d4, d2 + +3: + VST1.64 {d0}, [r12]! + SUBS r0, r0, #1 + VMOV d0, d1 + BNE 0b + + POP {r4,pc} + +END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 + +#ifdef __ELF__ +.section ".note.GNU-stack","",%progbits +#endif diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c new file mode 100644 index 000000000..0a59096c4 --- /dev/null +++ b/src/u32-filterbank-accumulate/gen/neon-x1.c @@ -0,0 +1,51 @@ +// Auto-generated file. Do not edit! +// Template: src/u32-filterbank-accumulate/neon.c.in +// Generator: tools/xngen +// +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <arm_neon.h> + +#include <xnnpack/math.h> +#include <xnnpack/filterbank.h> + + +void xnn_u32_filterbank_accumulate_ukernel__neon_x1( + size_t rows, + const uint32_t* input, + const uint8_t* weight_widths, + const uint16_t* weights, + uint64_t* output) { + + assert(rows != 0); + assert(input != NULL); + assert(weight_widths != NULL); + assert(weights != NULL); + assert(output != NULL); + + uint64x2_t weight_accumulator = vdupq_n_u64(0); + + do { + size_t n = (size_t) *weight_widths++; + assert(n != 0); + + do { + const uint32x2_t vi = vld1_dup_u32(input); input += 1; + const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; + const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); + + weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); + } while (--n != 0); + + vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); + + } while (--rows != 0); +} diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c new file mode 100644 index 000000000..743244a83 --- /dev/null +++ b/src/u32-filterbank-accumulate/gen/neon-x2.c @@ -0,0 +1,60 @@ +// Auto-generated file. Do not edit! +// Template: src/u32-filterbank-accumulate/neon.c.in +// Generator: tools/xngen +// +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <arm_neon.h> + +#include <xnnpack/math.h> +#include <xnnpack/filterbank.h> + + +void xnn_u32_filterbank_accumulate_ukernel__neon_x2( + size_t rows, + const uint32_t* input, + const uint8_t* weight_widths, + const uint16_t* weights, + uint64_t* output) { + + assert(rows != 0); + assert(input != NULL); + assert(weight_widths != NULL); + assert(weights != NULL); + assert(output != NULL); + + uint64x2_t weight_accumulator = vdupq_n_u64(0); + + do { + size_t n = (size_t) *weight_widths++; + assert(n != 0); + + for (;n >= 2; n -= 2) { + const uint32x2_t vi = vld1_u32(input); input += 2; + const uint16x4_t vw = vld1_u16(weights); weights += 4; + const uint32x4_t vw32 = vmovl_u16(vw); + + weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0); + weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1); + } + + if XNN_UNPREDICTABLE(n != 0) { + const uint32x2_t vi = vld1_dup_u32(input); input += 1; + const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; + const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); + + weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); + } + + vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); + + } while (--rows != 0); +} diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in new file mode 100644 index 000000000..77d88bea6 --- /dev/null +++ b/src/u32-filterbank-accumulate/neon.c.in @@ -0,0 +1,65 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <arm_neon.h> + +#include <xnnpack/math.h> +#include <xnnpack/filterbank.h> + + +void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}( + size_t rows, + const uint32_t* input, + const uint8_t* weight_widths, + const uint16_t* weights, + uint64_t* output) { + + assert(rows != 0); + assert(input != NULL); + assert(weight_widths != NULL); + assert(weights != NULL); + assert(output != NULL); + + uint64x2_t weight_accumulator = vdupq_n_u64(0); + + do { + size_t n = (size_t) *weight_widths++; + assert(n != 0); + + $if BATCH_TILE > 1: + for (;n >= 2; n -= 2) { + const uint32x2_t vi = vld1_u32(input); input += 2; + const uint16x4_t vw = vld1_u16(weights); weights += 4; + const uint32x4_t vw32 = vmovl_u16(vw); + + weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_low_u32(vw32), vi, 0); + weight_accumulator = vmlal_lane_u32(weight_accumulator, vget_high_u32(vw32), vi, 1); + } + + if XNN_UNPREDICTABLE(n != 0) { + const uint32x2_t vi = vld1_dup_u32(input); input += 1; + const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; + const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); + + weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); + } + $else: + do { + const uint32x2_t vi = vld1_dup_u32(input); input += 1; + const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; + const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); + + weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); + } while (--n != 0); + + vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1; + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); + + } while (--rows != 0); +} diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml index a44f9d45c..54798504c 100644 --- a/test/u32-filterbank-accumulate.yaml +++ b/test/u32-filterbank-accumulate.yaml @@ -3,5 +3,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# AArch32 assembly +- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 +- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 + +# ARM NEON +- name: xnn_u32_filterbank_accumulate_ukernel__neon_x1 +- name: xnn_u32_filterbank_accumulate_ukernel__neon_x2 + # Scalar - name: xnn_u32_filterbank_accumulate_ukernel__scalar_x1 |