diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-26 14:15:30 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-26 14:16:58 -0700 |
commit | 54b6a268385061cca8633b79b76b8b97a6751244 (patch) | |
tree | 2d273ebf7c95646bafff8a1fd5b7773a52b4edb4 | |
parent | 360bdd17dce31968bea7417b0d75837e97cd30c3 (diff) | |
download | XNNPACK-54b6a268385061cca8633b79b76b8b97a6751244.tar.gz |
bfly4m1 NEON microkernel
PiperOrigin-RevId: 470331404
-rw-r--r-- | BUILD.bazel | 1 | ||||
-rwxr-xr-x | CMakeLists.txt | 1 | ||||
-rw-r--r-- | bench/cs16-bfly4.cc | 4 | ||||
-rw-r--r-- | src/cs16-bfly4/neon-m1.c | 50 | ||||
-rw-r--r-- | src/xnnpack/fft.h | 1 | ||||
-rw-r--r-- | test/cs16-bfly4.cc | 11 | ||||
-rw-r--r-- | test/cs16-bfly4.yaml | 3 |
7 files changed, 71 insertions, 0 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index 4c45afc0f..d4ab826d9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -3153,6 +3153,7 @@ PROD_NEON_MICROKERNEL_SRCS = [ ] ALL_NEON_MICROKERNEL_SRCS = [ + "src/cs16-bfly4/neon-m1.c", "src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c", "src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c", "src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index bb154ca33..f11ac9480 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1647,6 +1647,7 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/x32-zip/x4-neon.c) SET(ALL_NEON_MICROKERNEL_SRCS + src/cs16-bfly4/neon-m1.c src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c diff --git a/bench/cs16-bfly4.cc b/bench/cs16-bfly4.cc index e4c46d2d9..724c85da9 100644 --- a/bench/cs16-bfly4.cc +++ b/bench/cs16-bfly4.cc @@ -74,6 +74,10 @@ static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b) b->Args({1024, 1, 256}); } +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +BENCHMARK_CAPTURE(cs16_bfly4, cs16_neon_m1, xnn_cs16_bfly4m1_ukernel__neon)->Apply(BenchmarkM1KernelSize)->UseRealTime(); +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4m1_ukernel__scalar)->Apply(BenchmarkM1KernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime(); diff --git a/src/cs16-bfly4/neon-m1.c b/src/cs16-bfly4/neon-m1.c new file mode 100644 index 000000000..cb9d60b40 --- /dev/null +++ b/src/cs16-bfly4/neon-m1.c @@ -0,0 +1,50 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <xnnpack/math.h> +#include <xnnpack/fft.h> + +#include <arm_neon.h> + + +void xnn_cs16_bfly4m1_ukernel__neon( + size_t samples, + int16_t* data, + const size_t stride, + const int16_t* twiddle) { + + assert(samples == 1); + assert(data != NULL); + assert(stride != 0); + assert(twiddle != NULL); + + const int16x8_t vi = vld1q_s16(data); + const int16x8_t vdiv4 = vdupq_n_s16(8191); + const int16x8_t vout = vqrdmulhq_s16(vi, vdiv4); + + const int16x4_t vtmp5 = vsub_s16(vget_low_s16(vout), vget_high_s16(vout)); + int16x4_t vout0 = vadd_s16(vget_low_s16(vout), vget_high_s16(vout)); + + const int16x4_t vtmp3 = vadd_s16(vget_low_s16(vout), vget_high_s16(vout)); + const int16x4_t vtmp4 = vsub_s16(vget_low_s16(vout), vget_high_s16(vout)); + + const int16x4_t vtmp3hi = vext_s16(vtmp3, vtmp3, 2); + const int16x4_t vout2 = vsub_s16(vout0, vtmp3hi); + vout0 = vadd_s16(vout0, vtmp3hi); + const int16x4_t vtmp4rev = vrev64_s16(vtmp4); + const int16x4_t vout1r3i = vadd_s16(vtmp5, vtmp4rev); + const int16x4_t vout3r1i = vsub_s16(vtmp5, vtmp4rev); + + vst1_lane_u32((void*) data, vreinterpret_u32_s16(vout0), 0); data += 2; + vst1_lane_s16(data, vout1r3i, 0); data += 1; + vst1_lane_s16(data, vout3r1i, 1); data += 1; + vst1_lane_u32((void*) data, vreinterpret_u32_s16(vout2), 0); data += 2; + vst1_lane_s16(data, vout3r1i, 0); data += 1; + vst1_lane_s16(data, vout1r3i, 1); +} diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h index 374241136..90d9bacc4 100644 --- a/src/xnnpack/fft.h +++ b/src/xnnpack/fft.h @@ -27,6 +27,7 @@ DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x2) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x3) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x4) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar) +DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__neon) #define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/test/cs16-bfly4.cc b/test/cs16-bfly4.cc index a3217ec78..67e8cb7ed 100644 --- a/test/cs16-bfly4.cc +++ b/test/cs16-bfly4.cc @@ -17,6 +17,17 @@ #include "bfly4-microkernel-tester.h" +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + TEST(CS16_BFLY4M1__NEON, samples_eq_1) { + TEST_REQUIRES_ARM_NEON; + BFly4MicrokernelTester() + .samples(1) + .stride(64) + .Test(xnn_cs16_bfly4m1_ukernel__neon); + } +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + + TEST(CS16_BFLY4__SCALAR_X1, samples_eq_1) { BFly4MicrokernelTester() .samples(1) diff --git a/test/cs16-bfly4.yaml b/test/cs16-bfly4.yaml index a9ddc8e58..5f24af79a 100644 --- a/test/cs16-bfly4.yaml +++ b/test/cs16-bfly4.yaml @@ -4,6 +4,9 @@ # LICENSE file in the root directory of this source tree. +# NEON +- name: xnn_cs16_bfly4m1_ukernel__neon + # Scalar - name: xnn_cs16_bfly4_ukernel__scalar_x1 - name: xnn_cs16_bfly4_ukernel__scalar_x2 |