aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2022-08-26 14:15:30 -0700
committerXNNPACK Team <xnnpack-github-robot@google.com>2022-08-26 14:16:58 -0700
commit54b6a268385061cca8633b79b76b8b97a6751244 (patch)
tree2d273ebf7c95646bafff8a1fd5b7773a52b4edb4
parent360bdd17dce31968bea7417b0d75837e97cd30c3 (diff)
downloadXNNPACK-54b6a268385061cca8633b79b76b8b97a6751244.tar.gz
bfly4m1 NEON microkernel
PiperOrigin-RevId: 470331404
-rw-r--r--BUILD.bazel1
-rwxr-xr-xCMakeLists.txt1
-rw-r--r--bench/cs16-bfly4.cc4
-rw-r--r--src/cs16-bfly4/neon-m1.c50
-rw-r--r--src/xnnpack/fft.h1
-rw-r--r--test/cs16-bfly4.cc11
-rw-r--r--test/cs16-bfly4.yaml3
7 files changed, 71 insertions, 0 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 4c45afc0f..d4ab826d9 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3153,6 +3153,7 @@ PROD_NEON_MICROKERNEL_SRCS = [
]
ALL_NEON_MICROKERNEL_SRCS = [
+ "src/cs16-bfly4/neon-m1.c",
"src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c",
"src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c",
"src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bb154ca33..f11ac9480 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1647,6 +1647,7 @@ SET(PROD_NEON_MICROKERNEL_SRCS
src/x32-zip/x4-neon.c)
SET(ALL_NEON_MICROKERNEL_SRCS
+ src/cs16-bfly4/neon-m1.c
src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c
src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c
src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c
diff --git a/bench/cs16-bfly4.cc b/bench/cs16-bfly4.cc
index e4c46d2d9..724c85da9 100644
--- a/bench/cs16-bfly4.cc
+++ b/bench/cs16-bfly4.cc
@@ -74,6 +74,10 @@ static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b)
b->Args({1024, 1, 256});
}
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+BENCHMARK_CAPTURE(cs16_bfly4, cs16_neon_m1, xnn_cs16_bfly4m1_ukernel__neon)->Apply(BenchmarkM1KernelSize)->UseRealTime();
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4m1_ukernel__scalar)->Apply(BenchmarkM1KernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/src/cs16-bfly4/neon-m1.c b/src/cs16-bfly4/neon-m1.c
new file mode 100644
index 000000000..cb9d60b40
--- /dev/null
+++ b/src/cs16-bfly4/neon-m1.c
@@ -0,0 +1,50 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/fft.h>
+
+#include <arm_neon.h>
+
+
+void xnn_cs16_bfly4m1_ukernel__neon(
+ size_t samples,
+ int16_t* data,
+ const size_t stride,
+ const int16_t* twiddle) {
+
+ assert(samples == 1);
+ assert(data != NULL);
+ assert(stride != 0);
+ assert(twiddle != NULL);
+
+ const int16x8_t vi = vld1q_s16(data);
+ const int16x8_t vdiv4 = vdupq_n_s16(8191);
+ const int16x8_t vout = vqrdmulhq_s16(vi, vdiv4);
+
+ const int16x4_t vtmp5 = vsub_s16(vget_low_s16(vout), vget_high_s16(vout));
+ int16x4_t vout0 = vadd_s16(vget_low_s16(vout), vget_high_s16(vout));
+
+ const int16x4_t vtmp3 = vadd_s16(vget_low_s16(vout), vget_high_s16(vout));
+ const int16x4_t vtmp4 = vsub_s16(vget_low_s16(vout), vget_high_s16(vout));
+
+ const int16x4_t vtmp3hi = vext_s16(vtmp3, vtmp3, 2);
+ const int16x4_t vout2 = vsub_s16(vout0, vtmp3hi);
+ vout0 = vadd_s16(vout0, vtmp3hi);
+ const int16x4_t vtmp4rev = vrev64_s16(vtmp4);
+ const int16x4_t vout1r3i = vadd_s16(vtmp5, vtmp4rev);
+ const int16x4_t vout3r1i = vsub_s16(vtmp5, vtmp4rev);
+
+ vst1_lane_u32((void*) data, vreinterpret_u32_s16(vout0), 0); data += 2;
+ vst1_lane_s16(data, vout1r3i, 0); data += 1;
+ vst1_lane_s16(data, vout3r1i, 1); data += 1;
+ vst1_lane_u32((void*) data, vreinterpret_u32_s16(vout2), 0); data += 2;
+ vst1_lane_s16(data, vout3r1i, 0); data += 1;
+ vst1_lane_s16(data, vout1r3i, 1);
+}
diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h
index 374241136..90d9bacc4 100644
--- a/src/xnnpack/fft.h
+++ b/src/xnnpack/fft.h
@@ -27,6 +27,7 @@ DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x2)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x3)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x4)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar)
+DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__neon)
#define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
diff --git a/test/cs16-bfly4.cc b/test/cs16-bfly4.cc
index a3217ec78..67e8cb7ed 100644
--- a/test/cs16-bfly4.cc
+++ b/test/cs16-bfly4.cc
@@ -17,6 +17,17 @@
#include "bfly4-microkernel-tester.h"
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(CS16_BFLY4M1__NEON, samples_eq_1) {
+ TEST_REQUIRES_ARM_NEON;
+ BFly4MicrokernelTester()
+ .samples(1)
+ .stride(64)
+ .Test(xnn_cs16_bfly4m1_ukernel__neon);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
TEST(CS16_BFLY4__SCALAR_X1, samples_eq_1) {
BFly4MicrokernelTester()
.samples(1)
diff --git a/test/cs16-bfly4.yaml b/test/cs16-bfly4.yaml
index a9ddc8e58..5f24af79a 100644
--- a/test/cs16-bfly4.yaml
+++ b/test/cs16-bfly4.yaml
@@ -4,6 +4,9 @@
# LICENSE file in the root directory of this source tree.
+# NEON
+- name: xnn_cs16_bfly4m1_ukernel__neon
+
# Scalar
- name: xnn_cs16_bfly4_ukernel__scalar_x1
- name: xnn_cs16_bfly4_ukernel__scalar_x2