aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Marat Dukhan <maratek@google.com> 2022-09-01 10:54:21 -0700
committer XNNPACK Team <xnnpack-github-robot@google.com> 2022-09-01 10:55:16 -0700
commit 0eaea5648189017ea286d1df37c0ca34f8302e25 (patch)
tree 6c8a8bf711bb074d3606e413bf7fadf6c60ac00c
parent 63ba5160296fe81926f56ea89b9fb3ee1a7b7a42 (diff)
download XNNPACK-0eaea5648189017ea286d1df37c0ca34f8302e25.tar.gz
Harmonize naming of specialized BFLY4 microkernels
- Rename m1 to samples1
- Separate samples1 from microkernel name by underscore

PiperOrigin-RevId: 471574436
-rw-r--r-- BUILD.bazel 4
-rwxr-xr-x CMakeLists.txt 4
-rw-r--r-- bench/cs16-bfly4.cc 6
-rwxr-xr-x scripts/generate-cs16-bfly4.sh 8
-rw-r--r-- src/cs16-bfly4/gen/scalar-x1.c 56
-rw-r--r-- src/cs16-bfly4/gen/scalar-x2.c 128
-rw-r--r-- src/cs16-bfly4/gen/scalar-x3.c 160
-rw-r--r-- src/cs16-bfly4/gen/scalar-x4.c 192
-rw-r--r-- src/cs16-bfly4/neon-samples1.c (renamed from src/cs16-bfly4/neon-m1.c) 6
-rw-r--r-- src/cs16-bfly4/scalar-samples1.c (renamed from src/cs16-bfly4/scalar-m1.c) 6
-rw-r--r-- src/cs16-bfly4/scalar.c.in 172
-rw-r--r-- src/cs16-fftr/gen/scalar-x1.c 4
-rw-r--r-- src/cs16-fftr/gen/scalar-x2.c 4
-rw-r--r-- src/cs16-fftr/gen/scalar-x3.c 4
-rw-r--r-- src/cs16-fftr/gen/scalar-x4.c 4
-rw-r--r-- src/cs16-fftr/scalar.c.in 4
-rw-r--r-- src/xnnpack/fft.h 4
-rw-r--r-- test/cs16-bfly4.cc 8
-rw-r--r-- test/cs16-bfly4.yaml 4
-rwxr-xr-x tools/generate-bfly4-test.py 26
20 files changed, 385 insertions, 419 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index f69155098..1cc5b7031 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -622,7 +622,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/cs16-bfly4/gen/scalar-x2.c",
"src/cs16-bfly4/gen/scalar-x3.c",
"src/cs16-bfly4/gen/scalar-x4.c",
- "src/cs16-bfly4/scalar-m1.c",
+ "src/cs16-bfly4/scalar-samples1.c",
"src/cs16-fftr/gen/scalar-x1.c",
"src/cs16-fftr/gen/scalar-x2.c",
"src/cs16-fftr/gen/scalar-x3.c",
@@ -3153,7 +3153,7 @@ PROD_NEON_MICROKERNEL_SRCS = [
]
ALL_NEON_MICROKERNEL_SRCS = [
- "src/cs16-bfly4/neon-m1.c",
+ "src/cs16-bfly4/neon-samples1.c",
"src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c",
"src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c",
"src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9a7d390b..cb002e3f4 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -610,7 +610,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/cs16-bfly4/gen/scalar-x2.c
src/cs16-bfly4/gen/scalar-x3.c
src/cs16-bfly4/gen/scalar-x4.c
- src/cs16-bfly4/scalar-m1.c
+ src/cs16-bfly4/scalar-samples1.c
src/cs16-fftr/gen/scalar-x1.c
src/cs16-fftr/gen/scalar-x2.c
src/cs16-fftr/gen/scalar-x3.c
@@ -1647,7 +1647,7 @@ SET(PROD_NEON_MICROKERNEL_SRCS
src/x32-zip/x4-neon.c)
SET(ALL_NEON_MICROKERNEL_SRCS
- src/cs16-bfly4/neon-m1.c
+ src/cs16-bfly4/neon-samples1.c
src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c
src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c
src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c
diff --git a/bench/cs16-bfly4.cc b/bench/cs16-bfly4.cc
index 724c85da9..9cfdfa2dc 100644
--- a/bench/cs16-bfly4.cc
+++ b/bench/cs16-bfly4.cc
@@ -67,7 +67,7 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
b->Args({1024, 1, 256});
}
-static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b)
+static void BenchmarkSamples1KernelSize(benchmark::internal::Benchmark* b)
{
b->ArgNames({"fft_size", "samples", "stride"});
b->Args({256, 1, 64});
@@ -75,10 +75,10 @@ static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b)
}
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-BENCHMARK_CAPTURE(cs16_bfly4, cs16_neon_m1, xnn_cs16_bfly4m1_ukernel__neon)->Apply(BenchmarkM1KernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(cs16_bfly4, cs16_neon_m1, xnn_cs16_bfly4_samples1_ukernel__neon)->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
-BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4m1_ukernel__scalar)->Apply(BenchmarkM1KernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4_samples1_ukernel__scalar)->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime();
BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x3, xnn_cs16_bfly4_ukernel__scalar_x3)->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/scripts/generate-cs16-bfly4.sh b/scripts/generate-cs16-bfly4.sh
index 992f61162..a4f404fa8 100755
--- a/scripts/generate-cs16-bfly4.sh
+++ b/scripts/generate-cs16-bfly4.sh
@@ -5,10 +5,10 @@
# LICENSE file in the root directory of this source tree.
################################### SCALAR ###################################
-tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=1 -D M=0 -o src/cs16-bfly4/gen/scalar-x1.c &
-tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=2 -D M=0 -o src/cs16-bfly4/gen/scalar-x2.c &
-tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=3 -D M=0 -o src/cs16-bfly4/gen/scalar-x3.c &
-tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=4 -D M=0 -o src/cs16-bfly4/gen/scalar-x4.c &
+tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=1 -o src/cs16-bfly4/gen/scalar-x1.c &
+tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=2 -o src/cs16-bfly4/gen/scalar-x2.c &
+tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=3 -o src/cs16-bfly4/gen/scalar-x3.c &
+tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=4 -o src/cs16-bfly4/gen/scalar-x4.c &
################################## Unit tests #################################
tools/generate-bfly4-test.py --spec test/cs16-bfly4.yaml --output test/cs16-bfly4.cc &
diff --git a/src/cs16-bfly4/gen/scalar-x1.c b/src/cs16-bfly4/gen/scalar-x1.c
index 89dd4b8eb..e38cf64eb 100644
--- a/src/cs16-bfly4/gen/scalar-x1.c
+++ b/src/cs16-bfly4/gen/scalar-x1.c
@@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x1(
size_t samples,
int16_t* data,
const size_t stride,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
const int16_t* tw1 = twiddle;
- const int16_t* tw2 = tw1;
- const int16_t* tw3 = tw1;
- int16_t* out0 = data;
- int16_t* out1 = data + samples * 2;
- int16_t* out2 = data + samples * 4;
- int16_t* out3 = data + samples * 6;
+ const int16_t* tw2 = twiddle;
+ const int16_t* tw3 = twiddle;
+ int16_t* data0 = data;
+ int16_t* data1 = data + samples * 2;
+ int16_t* data2 = data + samples * 4;
+ int16_t* data3 = data + samples * 6;
assert(samples != 0);
assert(data != NULL);
@@ -37,14 +37,14 @@ void xnn_cs16_bfly4_ukernel__scalar_x1(
if XNN_UNLIKELY(samples != 0) {
do {
- int32_t vout0r = (int32_t) out0[0];
- int32_t vout0i = (int32_t) out0[1];
- int32_t vout1r = (int32_t) out1[0];
- int32_t vout1i = (int32_t) out1[1];
- int32_t vout2r = (int32_t) out2[0];
- int32_t vout2i = (int32_t) out2[1];
- int32_t vout3r = (int32_t) out3[0];
- int32_t vout3i = (int32_t) out3[1];
+ int32_t vout0r = (int32_t) data0[0];
+ int32_t vout0i = (int32_t) data0[1];
+ int32_t vout1r = (int32_t) data1[0];
+ int32_t vout1i = (int32_t) data1[1];
+ int32_t vout2r = (int32_t) data2[0];
+ int32_t vout2i = (int32_t) data2[1];
+ int32_t vout3r = (int32_t) data3[0];
+ int32_t vout3i = (int32_t) data3[1];
const int32_t vtw1r = (const int32_t) tw1[0];
const int32_t vtw1i = (const int32_t) tw1[1];
@@ -92,18 +92,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x1(
vout3r = vtmp5r - vtmp4i;
vout3i = vtmp5i + vtmp4r;
- out0[0] = (int16_t) vout0r;
- out0[1] = (int16_t) vout0i;
- out1[0] = (int16_t) vout1r;
- out1[1] = (int16_t) vout1i;
- out2[0] = (int16_t) vout2r;
- out2[1] = (int16_t) vout2i;
- out3[0] = (int16_t) vout3r;
- out3[1] = (int16_t) vout3i;
- out0 += 2;
- out1 += 2;
- out2 += 2;
- out3 += 2;
+ data0[0] = (int16_t) vout0r;
+ data0[1] = (int16_t) vout0i;
+ data1[0] = (int16_t) vout1r;
+ data1[1] = (int16_t) vout1i;
+ data2[0] = (int16_t) vout2r;
+ data2[1] = (int16_t) vout2i;
+ data3[0] = (int16_t) vout3r;
+ data3[1] = (int16_t) vout3i;
+ data0 += 2;
+ data1 += 2;
+ data2 += 2;
+ data3 += 2;
} while(--samples != 0);
}
}
diff --git a/src/cs16-bfly4/gen/scalar-x2.c b/src/cs16-bfly4/gen/scalar-x2.c
index 99bd8653e..214973041 100644
--- a/src/cs16-bfly4/gen/scalar-x2.c
+++ b/src/cs16-bfly4/gen/scalar-x2.c
@@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x2(
size_t samples,
int16_t* data,
const size_t stride,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
const int16_t* tw1 = twiddle;
- const int16_t* tw2 = tw1;
- const int16_t* tw3 = tw1;
- int16_t* out0 = data;
- int16_t* out1 = data + samples * 2;
- int16_t* out2 = data + samples * 4;
- int16_t* out3 = data + samples * 6;
+ const int16_t* tw2 = twiddle;
+ const int16_t* tw3 = twiddle;
+ int16_t* data0 = data;
+ int16_t* data1 = data + samples * 2;
+ int16_t* data2 = data + samples * 4;
+ int16_t* data3 = data + samples * 6;
assert(samples != 0);
assert(data != NULL);
@@ -35,22 +35,22 @@ void xnn_cs16_bfly4_ukernel__scalar_x2(
assert(twiddle != NULL);
for (; samples >= 2; samples -= 2) {
- int32_t vout0r0 = (int32_t) out0[0];
- int32_t vout0i0 = (int32_t) out0[1];
- int32_t vout0r1 = (int32_t) out0[2];
- int32_t vout0i1 = (int32_t) out0[3];
- int32_t vout1r0 = (int32_t) out1[0];
- int32_t vout1i0 = (int32_t) out1[1];
- int32_t vout1r1 = (int32_t) out1[2];
- int32_t vout1i1 = (int32_t) out1[3];
- int32_t vout2r0 = (int32_t) out2[0];
- int32_t vout2i0 = (int32_t) out2[1];
- int32_t vout2r1 = (int32_t) out2[2];
- int32_t vout2i1 = (int32_t) out2[3];
- int32_t vout3r0 = (int32_t) out3[0];
- int32_t vout3i0 = (int32_t) out3[1];
- int32_t vout3r1 = (int32_t) out3[2];
- int32_t vout3i1 = (int32_t) out3[3];
+ int32_t vout0r0 = (int32_t) data0[0];
+ int32_t vout0i0 = (int32_t) data0[1];
+ int32_t vout0r1 = (int32_t) data0[2];
+ int32_t vout0i1 = (int32_t) data0[3];
+ int32_t vout1r0 = (int32_t) data1[0];
+ int32_t vout1i0 = (int32_t) data1[1];
+ int32_t vout1r1 = (int32_t) data1[2];
+ int32_t vout1i1 = (int32_t) data1[3];
+ int32_t vout2r0 = (int32_t) data2[0];
+ int32_t vout2i0 = (int32_t) data2[1];
+ int32_t vout2r1 = (int32_t) data2[2];
+ int32_t vout2i1 = (int32_t) data2[3];
+ int32_t vout3r0 = (int32_t) data3[0];
+ int32_t vout3i0 = (int32_t) data3[1];
+ int32_t vout3r1 = (int32_t) data3[2];
+ int32_t vout3i1 = (int32_t) data3[3];
const int32_t vtw1r0 = (const int32_t) tw1[0];
const int32_t vtw1i0 = (const int32_t) tw1[1];
@@ -135,38 +135,38 @@ void xnn_cs16_bfly4_ukernel__scalar_x2(
vout3i0 = vtmp5i0 + vtmp4r0;
vout3i1 = vtmp5i1 + vtmp4r1;
- out0[0] = (int16_t) vout0r0;
- out0[1] = (int16_t) vout0i0;
- out0[2] = (int16_t) vout0r1;
- out0[3] = (int16_t) vout0i1;
- out0 += 2 * 2;
- out1[0] = (int16_t) vout1r0;
- out1[1] = (int16_t) vout1i0;
- out1[2] = (int16_t) vout1r1;
- out1[3] = (int16_t) vout1i1;
- out1 += 2 * 2;
- out2[0] = (int16_t) vout2r0;
- out2[1] = (int16_t) vout2i0;
- out2[2] = (int16_t) vout2r1;
- out2[3] = (int16_t) vout2i1;
- out2 += 2 * 2;
- out3[0] = (int16_t) vout3r0;
- out3[1] = (int16_t) vout3i0;
- out3[2] = (int16_t) vout3r1;
- out3[3] = (int16_t) vout3i1;
- out3 += 2 * 2;
+ data0[0] = (int16_t) vout0r0;
+ data0[1] = (int16_t) vout0i0;
+ data0[2] = (int16_t) vout0r1;
+ data0[3] = (int16_t) vout0i1;
+ data0 += 2 * 2;
+ data1[0] = (int16_t) vout1r0;
+ data1[1] = (int16_t) vout1i0;
+ data1[2] = (int16_t) vout1r1;
+ data1[3] = (int16_t) vout1i1;
+ data1 += 2 * 2;
+ data2[0] = (int16_t) vout2r0;
+ data2[1] = (int16_t) vout2i0;
+ data2[2] = (int16_t) vout2r1;
+ data2[3] = (int16_t) vout2i1;
+ data2 += 2 * 2;
+ data3[0] = (int16_t) vout3r0;
+ data3[1] = (int16_t) vout3i0;
+ data3[2] = (int16_t) vout3r1;
+ data3[3] = (int16_t) vout3i1;
+ data3 += 2 * 2;
}
if XNN_UNLIKELY(samples != 0) {
do {
- int32_t vout0r = (int32_t) out0[0];
- int32_t vout0i = (int32_t) out0[1];
- int32_t vout1r = (int32_t) out1[0];
- int32_t vout1i = (int32_t) out1[1];
- int32_t vout2r = (int32_t) out2[0];
- int32_t vout2i = (int32_t) out2[1];
- int32_t vout3r = (int32_t) out3[0];
- int32_t vout3i = (int32_t) out3[1];
+ int32_t vout0r = (int32_t) data0[0];
+ int32_t vout0i = (int32_t) data0[1];
+ int32_t vout1r = (int32_t) data1[0];
+ int32_t vout1i = (int32_t) data1[1];
+ int32_t vout2r = (int32_t) data2[0];
+ int32_t vout2i = (int32_t) data2[1];
+ int32_t vout3r = (int32_t) data3[0];
+ int32_t vout3i = (int32_t) data3[1];
const int32_t vtw1r = (const int32_t) tw1[0];
const int32_t vtw1i = (const int32_t) tw1[1];
@@ -214,18 +214,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x2(
vout3r = vtmp5r - vtmp4i;
vout3i = vtmp5i + vtmp4r;
- out0[0] = (int16_t) vout0r;
- out0[1] = (int16_t) vout0i;
- out1[0] = (int16_t) vout1r;
- out1[1] = (int16_t) vout1i;
- out2[0] = (int16_t) vout2r;
- out2[1] = (int16_t) vout2i;
- out3[0] = (int16_t) vout3r;
- out3[1] = (int16_t) vout3i;
- out0 += 2;
- out1 += 2;
- out2 += 2;
- out3 += 2;
+ data0[0] = (int16_t) vout0r;
+ data0[1] = (int16_t) vout0i;
+ data1[0] = (int16_t) vout1r;
+ data1[1] = (int16_t) vout1i;
+ data2[0] = (int16_t) vout2r;
+ data2[1] = (int16_t) vout2i;
+ data3[0] = (int16_t) vout3r;
+ data3[1] = (int16_t) vout3i;
+ data0 += 2;
+ data1 += 2;
+ data2 += 2;
+ data3 += 2;
} while(--samples != 0);
}
}
diff --git a/src/cs16-bfly4/gen/scalar-x3.c b/src/cs16-bfly4/gen/scalar-x3.c
index fcb487110..68afa72bf 100644
--- a/src/cs16-bfly4/gen/scalar-x3.c
+++ b/src/cs16-bfly4/gen/scalar-x3.c
@@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x3(
size_t samples,
int16_t* data,
const size_t stride,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
const int16_t* tw1 = twiddle;
- const int16_t* tw2 = tw1;
- const int16_t* tw3 = tw1;
- int16_t* out0 = data;
- int16_t* out1 = data + samples * 2;
- int16_t* out2 = data + samples * 4;
- int16_t* out3 = data + samples * 6;
+ const int16_t* tw2 = twiddle;
+ const int16_t* tw3 = twiddle;
+ int16_t* data0 = data;
+ int16_t* data1 = data + samples * 2;
+ int16_t* data2 = data + samples * 4;
+ int16_t* data3 = data + samples * 6;
assert(samples != 0);
assert(data != NULL);
@@ -35,30 +35,30 @@ void xnn_cs16_bfly4_ukernel__scalar_x3(
assert(twiddle != NULL);
for (; samples >= 3; samples -= 3) {
- int32_t vout0r0 = (int32_t) out0[0];
- int32_t vout0i0 = (int32_t) out0[1];
- int32_t vout0r1 = (int32_t) out0[2];
- int32_t vout0i1 = (int32_t) out0[3];
- int32_t vout0r2 = (int32_t) out0[4];
- int32_t vout0i2 = (int32_t) out0[5];
- int32_t vout1r0 = (int32_t) out1[0];
- int32_t vout1i0 = (int32_t) out1[1];
- int32_t vout1r1 = (int32_t) out1[2];
- int32_t vout1i1 = (int32_t) out1[3];
- int32_t vout1r2 = (int32_t) out1[4];
- int32_t vout1i2 = (int32_t) out1[5];
- int32_t vout2r0 = (int32_t) out2[0];
- int32_t vout2i0 = (int32_t) out2[1];
- int32_t vout2r1 = (int32_t) out2[2];
- int32_t vout2i1 = (int32_t) out2[3];
- int32_t vout2r2 = (int32_t) out2[4];
- int32_t vout2i2 = (int32_t) out2[5];
- int32_t vout3r0 = (int32_t) out3[0];
- int32_t vout3i0 = (int32_t) out3[1];
- int32_t vout3r1 = (int32_t) out3[2];
- int32_t vout3i1 = (int32_t) out3[3];
- int32_t vout3r2 = (int32_t) out3[4];
- int32_t vout3i2 = (int32_t) out3[5];
+ int32_t vout0r0 = (int32_t) data0[0];
+ int32_t vout0i0 = (int32_t) data0[1];
+ int32_t vout0r1 = (int32_t) data0[2];
+ int32_t vout0i1 = (int32_t) data0[3];
+ int32_t vout0r2 = (int32_t) data0[4];
+ int32_t vout0i2 = (int32_t) data0[5];
+ int32_t vout1r0 = (int32_t) data1[0];
+ int32_t vout1i0 = (int32_t) data1[1];
+ int32_t vout1r1 = (int32_t) data1[2];
+ int32_t vout1i1 = (int32_t) data1[3];
+ int32_t vout1r2 = (int32_t) data1[4];
+ int32_t vout1i2 = (int32_t) data1[5];
+ int32_t vout2r0 = (int32_t) data2[0];
+ int32_t vout2i0 = (int32_t) data2[1];
+ int32_t vout2r1 = (int32_t) data2[2];
+ int32_t vout2i1 = (int32_t) data2[3];
+ int32_t vout2r2 = (int32_t) data2[4];
+ int32_t vout2i2 = (int32_t) data2[5];
+ int32_t vout3r0 = (int32_t) data3[0];
+ int32_t vout3i0 = (int32_t) data3[1];
+ int32_t vout3r1 = (int32_t) data3[2];
+ int32_t vout3i1 = (int32_t) data3[3];
+ int32_t vout3r2 = (int32_t) data3[4];
+ int32_t vout3i2 = (int32_t) data3[5];
const int32_t vtw1r0 = (const int32_t) tw1[0];
const int32_t vtw1i0 = (const int32_t) tw1[1];
@@ -182,46 +182,46 @@ void xnn_cs16_bfly4_ukernel__scalar_x3(
vout3i1 = vtmp5i1 + vtmp4r1;
vout3i2 = vtmp5i2 + vtmp4r2;
- out0[0] = (int16_t) vout0r0;
- out0[1] = (int16_t) vout0i0;
- out0[2] = (int16_t) vout0r1;
- out0[3] = (int16_t) vout0i1;
- out0[4] = (int16_t) vout0r2;
- out0[5] = (int16_t) vout0i2;
- out0 += 3 * 2;
- out1[0] = (int16_t) vout1r0;
- out1[1] = (int16_t) vout1i0;
- out1[2] = (int16_t) vout1r1;
- out1[3] = (int16_t) vout1i1;
- out1[4] = (int16_t) vout1r2;
- out1[5] = (int16_t) vout1i2;
- out1 += 3 * 2;
- out2[0] = (int16_t) vout2r0;
- out2[1] = (int16_t) vout2i0;
- out2[2] = (int16_t) vout2r1;
- out2[3] = (int16_t) vout2i1;
- out2[4] = (int16_t) vout2r2;
- out2[5] = (int16_t) vout2i2;
- out2 += 3 * 2;
- out3[0] = (int16_t) vout3r0;
- out3[1] = (int16_t) vout3i0;
- out3[2] = (int16_t) vout3r1;
- out3[3] = (int16_t) vout3i1;
- out3[4] = (int16_t) vout3r2;
- out3[5] = (int16_t) vout3i2;
- out3 += 3 * 2;
+ data0[0] = (int16_t) vout0r0;
+ data0[1] = (int16_t) vout0i0;
+ data0[2] = (int16_t) vout0r1;
+ data0[3] = (int16_t) vout0i1;
+ data0[4] = (int16_t) vout0r2;
+ data0[5] = (int16_t) vout0i2;
+ data0 += 3 * 2;
+ data1[0] = (int16_t) vout1r0;
+ data1[1] = (int16_t) vout1i0;
+ data1[2] = (int16_t) vout1r1;
+ data1[3] = (int16_t) vout1i1;
+ data1[4] = (int16_t) vout1r2;
+ data1[5] = (int16_t) vout1i2;
+ data1 += 3 * 2;
+ data2[0] = (int16_t) vout2r0;
+ data2[1] = (int16_t) vout2i0;
+ data2[2] = (int16_t) vout2r1;
+ data2[3] = (int16_t) vout2i1;
+ data2[4] = (int16_t) vout2r2;
+ data2[5] = (int16_t) vout2i2;
+ data2 += 3 * 2;
+ data3[0] = (int16_t) vout3r0;
+ data3[1] = (int16_t) vout3i0;
+ data3[2] = (int16_t) vout3r1;
+ data3[3] = (int16_t) vout3i1;
+ data3[4] = (int16_t) vout3r2;
+ data3[5] = (int16_t) vout3i2;
+ data3 += 3 * 2;
}
if XNN_UNLIKELY(samples != 0) {
do {
- int32_t vout0r = (int32_t) out0[0];
- int32_t vout0i = (int32_t) out0[1];
- int32_t vout1r = (int32_t) out1[0];
- int32_t vout1i = (int32_t) out1[1];
- int32_t vout2r = (int32_t) out2[0];
- int32_t vout2i = (int32_t) out2[1];
- int32_t vout3r = (int32_t) out3[0];
- int32_t vout3i = (int32_t) out3[1];
+ int32_t vout0r = (int32_t) data0[0];
+ int32_t vout0i = (int32_t) data0[1];
+ int32_t vout1r = (int32_t) data1[0];
+ int32_t vout1i = (int32_t) data1[1];
+ int32_t vout2r = (int32_t) data2[0];
+ int32_t vout2i = (int32_t) data2[1];
+ int32_t vout3r = (int32_t) data3[0];
+ int32_t vout3i = (int32_t) data3[1];
const int32_t vtw1r = (const int32_t) tw1[0];
const int32_t vtw1i = (const int32_t) tw1[1];
@@ -269,18 +269,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x3(
vout3r = vtmp5r - vtmp4i;
vout3i = vtmp5i + vtmp4r;
- out0[0] = (int16_t) vout0r;
- out0[1] = (int16_t) vout0i;
- out1[0] = (int16_t) vout1r;
- out1[1] = (int16_t) vout1i;
- out2[0] = (int16_t) vout2r;
- out2[1] = (int16_t) vout2i;
- out3[0] = (int16_t) vout3r;
- out3[1] = (int16_t) vout3i;
- out0 += 2;
- out1 += 2;
- out2 += 2;
- out3 += 2;
+ data0[0] = (int16_t) vout0r;
+ data0[1] = (int16_t) vout0i;
+ data1[0] = (int16_t) vout1r;
+ data1[1] = (int16_t) vout1i;
+ data2[0] = (int16_t) vout2r;
+ data2[1] = (int16_t) vout2i;
+ data3[0] = (int16_t) vout3r;
+ data3[1] = (int16_t) vout3i;
+ data0 += 2;
+ data1 += 2;
+ data2 += 2;
+ data3 += 2;
} while(--samples != 0);
}
}
diff --git a/src/cs16-bfly4/gen/scalar-x4.c b/src/cs16-bfly4/gen/scalar-x4.c
index ac7117c1e..37c0a8f6b 100644
--- a/src/cs16-bfly4/gen/scalar-x4.c
+++ b/src/cs16-bfly4/gen/scalar-x4.c
@@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x4(
size_t samples,
int16_t* data,
const size_t stride,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
const int16_t* tw1 = twiddle;
- const int16_t* tw2 = tw1;
- const int16_t* tw3 = tw1;
- int16_t* out0 = data;
- int16_t* out1 = data + samples * 2;
- int16_t* out2 = data + samples * 4;
- int16_t* out3 = data + samples * 6;
+ const int16_t* tw2 = twiddle;
+ const int16_t* tw3 = twiddle;
+ int16_t* data0 = data;
+ int16_t* data1 = data + samples * 2;
+ int16_t* data2 = data + samples * 4;
+ int16_t* data3 = data + samples * 6;
assert(samples != 0);
assert(data != NULL);
@@ -35,38 +35,38 @@ void xnn_cs16_bfly4_ukernel__scalar_x4(
assert(twiddle != NULL);
for (; samples >= 4; samples -= 4) {
- int32_t vout0r0 = (int32_t) out0[0];
- int32_t vout0i0 = (int32_t) out0[1];
- int32_t vout0r1 = (int32_t) out0[2];
- int32_t vout0i1 = (int32_t) out0[3];
- int32_t vout0r2 = (int32_t) out0[4];
- int32_t vout0i2 = (int32_t) out0[5];
- int32_t vout0r3 = (int32_t) out0[6];
- int32_t vout0i3 = (int32_t) out0[7];
- int32_t vout1r0 = (int32_t) out1[0];
- int32_t vout1i0 = (int32_t) out1[1];
- int32_t vout1r1 = (int32_t) out1[2];
- int32_t vout1i1 = (int32_t) out1[3];
- int32_t vout1r2 = (int32_t) out1[4];
- int32_t vout1i2 = (int32_t) out1[5];
- int32_t vout1r3 = (int32_t) out1[6];
- int32_t vout1i3 = (int32_t) out1[7];
- int32_t vout2r0 = (int32_t) out2[0];
- int32_t vout2i0 = (int32_t) out2[1];
- int32_t vout2r1 = (int32_t) out2[2];
- int32_t vout2i1 = (int32_t) out2[3];
- int32_t vout2r2 = (int32_t) out2[4];
- int32_t vout2i2 = (int32_t) out2[5];
- int32_t vout2r3 = (int32_t) out2[6];
- int32_t vout2i3 = (int32_t) out2[7];
- int32_t vout3r0 = (int32_t) out3[0];
- int32_t vout3i0 = (int32_t) out3[1];
- int32_t vout3r1 = (int32_t) out3[2];
- int32_t vout3i1 = (int32_t) out3[3];
- int32_t vout3r2 = (int32_t) out3[4];
- int32_t vout3i2 = (int32_t) out3[5];
- int32_t vout3r3 = (int32_t) out3[6];
- int32_t vout3i3 = (int32_t) out3[7];
+ int32_t vout0r0 = (int32_t) data0[0];
+ int32_t vout0i0 = (int32_t) data0[1];
+ int32_t vout0r1 = (int32_t) data0[2];
+ int32_t vout0i1 = (int32_t) data0[3];
+ int32_t vout0r2 = (int32_t) data0[4];
+ int32_t vout0i2 = (int32_t) data0[5];
+ int32_t vout0r3 = (int32_t) data0[6];
+ int32_t vout0i3 = (int32_t) data0[7];
+ int32_t vout1r0 = (int32_t) data1[0];
+ int32_t vout1i0 = (int32_t) data1[1];
+ int32_t vout1r1 = (int32_t) data1[2];
+ int32_t vout1i1 = (int32_t) data1[3];
+ int32_t vout1r2 = (int32_t) data1[4];
+ int32_t vout1i2 = (int32_t) data1[5];
+ int32_t vout1r3 = (int32_t) data1[6];
+ int32_t vout1i3 = (int32_t) data1[7];
+ int32_t vout2r0 = (int32_t) data2[0];
+ int32_t vout2i0 = (int32_t) data2[1];
+ int32_t vout2r1 = (int32_t) data2[2];
+ int32_t vout2i1 = (int32_t) data2[3];
+ int32_t vout2r2 = (int32_t) data2[4];
+ int32_t vout2i2 = (int32_t) data2[5];
+ int32_t vout2r3 = (int32_t) data2[6];
+ int32_t vout2i3 = (int32_t) data2[7];
+ int32_t vout3r0 = (int32_t) data3[0];
+ int32_t vout3i0 = (int32_t) data3[1];
+ int32_t vout3r1 = (int32_t) data3[2];
+ int32_t vout3i1 = (int32_t) data3[3];
+ int32_t vout3r2 = (int32_t) data3[4];
+ int32_t vout3i2 = (int32_t) data3[5];
+ int32_t vout3r3 = (int32_t) data3[6];
+ int32_t vout3i3 = (int32_t) data3[7];
const int32_t vtw1r0 = (const int32_t) tw1[0];
const int32_t vtw1i0 = (const int32_t) tw1[1];
@@ -229,54 +229,54 @@ void xnn_cs16_bfly4_ukernel__scalar_x4(
vout3i2 = vtmp5i2 + vtmp4r2;
vout3i3 = vtmp5i3 + vtmp4r3;
- out0[0] = (int16_t) vout0r0;
- out0[1] = (int16_t) vout0i0;
- out0[2] = (int16_t) vout0r1;
- out0[3] = (int16_t) vout0i1;
- out0[4] = (int16_t) vout0r2;
- out0[5] = (int16_t) vout0i2;
- out0[6] = (int16_t) vout0r3;
- out0[7] = (int16_t) vout0i3;
- out0 += 4 * 2;
- out1[0] = (int16_t) vout1r0;
- out1[1] = (int16_t) vout1i0;
- out1[2] = (int16_t) vout1r1;
- out1[3] = (int16_t) vout1i1;
- out1[4] = (int16_t) vout1r2;
- out1[5] = (int16_t) vout1i2;
- out1[6] = (int16_t) vout1r3;
- out1[7] = (int16_t) vout1i3;
- out1 += 4 * 2;
- out2[0] = (int16_t) vout2r0;
- out2[1] = (int16_t) vout2i0;
- out2[2] = (int16_t) vout2r1;
- out2[3] = (int16_t) vout2i1;
- out2[4] = (int16_t) vout2r2;
- out2[5] = (int16_t) vout2i2;
- out2[6] = (int16_t) vout2r3;
- out2[7] = (int16_t) vout2i3;
- out2 += 4 * 2;
- out3[0] = (int16_t) vout3r0;
- out3[1] = (int16_t) vout3i0;
- out3[2] = (int16_t) vout3r1;
- out3[3] = (int16_t) vout3i1;
- out3[4] = (int16_t) vout3r2;
- out3[5] = (int16_t) vout3i2;
- out3[6] = (int16_t) vout3r3;
- out3[7] = (int16_t) vout3i3;
- out3 += 4 * 2;
+ data0[0] = (int16_t) vout0r0;
+ data0[1] = (int16_t) vout0i0;
+ data0[2] = (int16_t) vout0r1;
+ data0[3] = (int16_t) vout0i1;
+ data0[4] = (int16_t) vout0r2;
+ data0[5] = (int16_t) vout0i2;
+ data0[6] = (int16_t) vout0r3;
+ data0[7] = (int16_t) vout0i3;
+ data0 += 4 * 2;
+ data1[0] = (int16_t) vout1r0;
+ data1[1] = (int16_t) vout1i0;
+ data1[2] = (int16_t) vout1r1;
+ data1[3] = (int16_t) vout1i1;
+ data1[4] = (int16_t) vout1r2;
+ data1[5] = (int16_t) vout1i2;
+ data1[6] = (int16_t) vout1r3;
+ data1[7] = (int16_t) vout1i3;
+ data1 += 4 * 2;
+ data2[0] = (int16_t) vout2r0;
+ data2[1] = (int16_t) vout2i0;
+ data2[2] = (int16_t) vout2r1;
+ data2[3] = (int16_t) vout2i1;
+ data2[4] = (int16_t) vout2r2;
+ data2[5] = (int16_t) vout2i2;
+ data2[6] = (int16_t) vout2r3;
+ data2[7] = (int16_t) vout2i3;
+ data2 += 4 * 2;
+ data3[0] = (int16_t) vout3r0;
+ data3[1] = (int16_t) vout3i0;
+ data3[2] = (int16_t) vout3r1;
+ data3[3] = (int16_t) vout3i1;
+ data3[4] = (int16_t) vout3r2;
+ data3[5] = (int16_t) vout3i2;
+ data3[6] = (int16_t) vout3r3;
+ data3[7] = (int16_t) vout3i3;
+ data3 += 4 * 2;
}
if XNN_UNLIKELY(samples != 0) {
do {
- int32_t vout0r = (int32_t) out0[0];
- int32_t vout0i = (int32_t) out0[1];
- int32_t vout1r = (int32_t) out1[0];
- int32_t vout1i = (int32_t) out1[1];
- int32_t vout2r = (int32_t) out2[0];
- int32_t vout2i = (int32_t) out2[1];
- int32_t vout3r = (int32_t) out3[0];
- int32_t vout3i = (int32_t) out3[1];
+ int32_t vout0r = (int32_t) data0[0];
+ int32_t vout0i = (int32_t) data0[1];
+ int32_t vout1r = (int32_t) data1[0];
+ int32_t vout1i = (int32_t) data1[1];
+ int32_t vout2r = (int32_t) data2[0];
+ int32_t vout2i = (int32_t) data2[1];
+ int32_t vout3r = (int32_t) data3[0];
+ int32_t vout3i = (int32_t) data3[1];
const int32_t vtw1r = (const int32_t) tw1[0];
const int32_t vtw1i = (const int32_t) tw1[1];
@@ -324,18 +324,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x4(
vout3r = vtmp5r - vtmp4i;
vout3i = vtmp5i + vtmp4r;
- out0[0] = (int16_t) vout0r;
- out0[1] = (int16_t) vout0i;
- out1[0] = (int16_t) vout1r;
- out1[1] = (int16_t) vout1i;
- out2[0] = (int16_t) vout2r;
- out2[1] = (int16_t) vout2i;
- out3[0] = (int16_t) vout3r;
- out3[1] = (int16_t) vout3i;
- out0 += 2;
- out1 += 2;
- out2 += 2;
- out3 += 2;
+ data0[0] = (int16_t) vout0r;
+ data0[1] = (int16_t) vout0i;
+ data1[0] = (int16_t) vout1r;
+ data1[1] = (int16_t) vout1i;
+ data2[0] = (int16_t) vout2r;
+ data2[1] = (int16_t) vout2i;
+ data3[0] = (int16_t) vout3r;
+ data3[1] = (int16_t) vout3i;
+ data0 += 2;
+ data1 += 2;
+ data2 += 2;
+ data3 += 2;
} while(--samples != 0);
}
}
diff --git a/src/cs16-bfly4/neon-m1.c b/src/cs16-bfly4/neon-samples1.c
index cb9d60b40..55f663020 100644
--- a/src/cs16-bfly4/neon-m1.c
+++ b/src/cs16-bfly4/neon-samples1.c
@@ -13,12 +13,12 @@
#include <arm_neon.h>
-void xnn_cs16_bfly4m1_ukernel__neon(
+void xnn_cs16_bfly4_samples1_ukernel__neon(
size_t samples,
int16_t* data,
const size_t stride,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
assert(samples == 1);
assert(data != NULL);
assert(stride != 0);
diff --git a/src/cs16-bfly4/scalar-m1.c b/src/cs16-bfly4/scalar-samples1.c
index d0f2af509..2f3226786 100644
--- a/src/cs16-bfly4/scalar-m1.c
+++ b/src/cs16-bfly4/scalar-samples1.c
@@ -11,12 +11,12 @@
#include <xnnpack/fft.h>
-void xnn_cs16_bfly4m1_ukernel__scalar(
+void xnn_cs16_bfly4_samples1_ukernel__scalar(
size_t samples,
int16_t* data,
const size_t stride,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
assert(samples == 1);
assert(data != NULL);
assert(stride != 0);
diff --git a/src/cs16-bfly4/scalar.c.in b/src/cs16-bfly4/scalar.c.in
index 23d5bfcbd..42c8e278c 100644
--- a/src/cs16-bfly4/scalar.c.in
+++ b/src/cs16-bfly4/scalar.c.in
@@ -12,27 +12,21 @@ $assert SAMPLE_TILE >= 1
#include <xnnpack/fft.h>
-$VARIANT = "m%s" % M if M else ""
-void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}(
+void xnn_cs16_bfly4_ukernel__scalar_x${SAMPLE_TILE}(
size_t samples,
int16_t* data,
const size_t stride,
- const int16_t* twiddle) {
-
- $if M != 1:
- const int16_t* tw1 = twiddle;
- const int16_t* tw2 = tw1;
- const int16_t* tw3 = tw1;
- int16_t* out0 = data;
- $if M != 1:
- int16_t* out1 = data + samples * 2;
- int16_t* out2 = data + samples * 4;
- int16_t* out3 = data + samples * 6;
-
- $if M != 0:
- assert(samples == ${M});
- $else:
- assert(samples != 0);
+ const int16_t* twiddle)
+{
+ const int16_t* tw1 = twiddle;
+ const int16_t* tw2 = twiddle;
+ const int16_t* tw3 = twiddle;
+ int16_t* data0 = data;
+ int16_t* data1 = data + samples * 2;
+ int16_t* data2 = data + samples * 4;
+ int16_t* data3 = data + samples * 6;
+
+ assert(samples != 0);
assert(data != NULL);
assert(stride != 0);
assert(twiddle != NULL);
@@ -40,17 +34,17 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}(
$if SAMPLE_TILE > 1:
for (; samples >= ${SAMPLE_TILE}; samples -= ${SAMPLE_TILE}) {
$for C in range(SAMPLE_TILE):
- int32_t vout0r${C} = (int32_t) out0[${C * 2 + 0}];
- int32_t vout0i${C} = (int32_t) out0[${C * 2 + 1}];
+ int32_t vout0r${C} = (int32_t) data0[${C * 2 + 0}];
+ int32_t vout0i${C} = (int32_t) data0[${C * 2 + 1}];
$for C in range(SAMPLE_TILE):
- int32_t vout1r${C} = (int32_t) out1[${C * 2 + 0}];
- int32_t vout1i${C} = (int32_t) out1[${C * 2 + 1}];
+ int32_t vout1r${C} = (int32_t) data1[${C * 2 + 0}];
+ int32_t vout1i${C} = (int32_t) data1[${C * 2 + 1}];
$for C in range(SAMPLE_TILE):
- int32_t vout2r${C} = (int32_t) out2[${C * 2 + 0}];
- int32_t vout2i${C} = (int32_t) out2[${C * 2 + 1}];
+ int32_t vout2r${C} = (int32_t) data2[${C * 2 + 0}];
+ int32_t vout2i${C} = (int32_t) data2[${C * 2 + 1}];
$for C in range(SAMPLE_TILE):
- int32_t vout3r${C} = (int32_t) out3[${C * 2 + 0}];
- int32_t vout3i${C} = (int32_t) out3[${C * 2 + 1}];
+ int32_t vout3r${C} = (int32_t) data3[${C * 2 + 0}];
+ int32_t vout3i${C} = (int32_t) data3[${C * 2 + 1}];
$for C in range(SAMPLE_TILE):
const int32_t vtw1r${C} = (const int32_t) tw1[0];
@@ -130,54 +124,43 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}(
vout3i${C} = vtmp5i${C} + vtmp4r${C};
$for C in range(SAMPLE_TILE):
- out0[${C * 2 + 0}] = (int16_t) vout0r${C};
- out0[${C * 2 + 1}] = (int16_t) vout0i${C};
- out0 += ${SAMPLE_TILE} * 2;
+ data0[${C * 2 + 0}] = (int16_t) vout0r${C};
+ data0[${C * 2 + 1}] = (int16_t) vout0i${C};
+ data0 += ${SAMPLE_TILE} * 2;
$for C in range(SAMPLE_TILE):
- out1[${C * 2 + 0}] = (int16_t) vout1r${C};
- out1[${C * 2 + 1}] = (int16_t) vout1i${C};
- out1 += ${SAMPLE_TILE} * 2;
+ data1[${C * 2 + 0}] = (int16_t) vout1r${C};
+ data1[${C * 2 + 1}] = (int16_t) vout1i${C};
+ data1 += ${SAMPLE_TILE} * 2;
$for C in range(SAMPLE_TILE):
- out2[${C * 2 + 0}] = (int16_t) vout2r${C};
- out2[${C * 2 + 1}] = (int16_t) vout2i${C};
- out2 += ${SAMPLE_TILE} * 2;
+ data2[${C * 2 + 0}] = (int16_t) vout2r${C};
+ data2[${C * 2 + 1}] = (int16_t) vout2i${C};
+ data2 += ${SAMPLE_TILE} * 2;
$for C in range(SAMPLE_TILE):
- out3[${C * 2 + 0}] = (int16_t) vout3r${C};
- out3[${C * 2 + 1}] = (int16_t) vout3i${C};
- out3 += ${SAMPLE_TILE} * 2;
+ data3[${C * 2 + 0}] = (int16_t) vout3r${C};
+ data3[${C * 2 + 1}] = (int16_t) vout3i${C};
+ data3 += ${SAMPLE_TILE} * 2;
}
if XNN_UNLIKELY(samples != 0) {
do {
- $if M == 1:
- int32_t vout0r = (int32_t) out0[0];
- int32_t vout0i = (int32_t) out0[1];
- int32_t vout1r = (int32_t) out0[2];
- int32_t vout1i = (int32_t) out0[3];
- int32_t vout2r = (int32_t) out0[4];
- int32_t vout2i = (int32_t) out0[5];
- int32_t vout3r = (int32_t) out0[6];
- int32_t vout3i = (int32_t) out0[7];
- $else:
- int32_t vout0r = (int32_t) out0[0];
- int32_t vout0i = (int32_t) out0[1];
- int32_t vout1r = (int32_t) out1[0];
- int32_t vout1i = (int32_t) out1[1];
- int32_t vout2r = (int32_t) out2[0];
- int32_t vout2i = (int32_t) out2[1];
- int32_t vout3r = (int32_t) out3[0];
- int32_t vout3i = (int32_t) out3[1];
-
- $if M != 1:
- const int32_t vtw1r = (const int32_t) tw1[0];
- const int32_t vtw1i = (const int32_t) tw1[1];
- const int32_t vtw2r = (const int32_t) tw2[0];
- const int32_t vtw2i = (const int32_t) tw2[1];
- const int32_t vtw3r = (const int32_t) tw3[0];
- const int32_t vtw3i = (const int32_t) tw3[1];
- tw1 += stride * 2;
- tw2 += stride * 4;
- tw3 += stride * 6;
+ int32_t vout0r = (int32_t) data0[0];
+ int32_t vout0i = (int32_t) data0[1];
+ int32_t vout1r = (int32_t) data1[0];
+ int32_t vout1i = (int32_t) data1[1];
+ int32_t vout2r = (int32_t) data2[0];
+ int32_t vout2i = (int32_t) data2[1];
+ int32_t vout3r = (int32_t) data3[0];
+ int32_t vout3i = (int32_t) data3[1];
+
+ const int32_t vtw1r = (const int32_t) tw1[0];
+ const int32_t vtw1i = (const int32_t) tw1[1];
+ const int32_t vtw2r = (const int32_t) tw2[0];
+ const int32_t vtw2i = (const int32_t) tw2[1];
+ const int32_t vtw3r = (const int32_t) tw3[0];
+ const int32_t vtw3i = (const int32_t) tw3[1];
+ tw1 += stride * 2;
+ tw2 += stride * 4;
+ tw3 += stride * 6;
// Note: 32767 / 4 = 8191 is used as the 1/4 scale factor; the exact value should be 8192 (32768 / 4).
vout0r = math_asr_s32(vout0r * 8191 + 16384, 15);
@@ -189,21 +172,12 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}(
vout3r = math_asr_s32(vout3r * 8191 + 16384, 15);
vout3i = math_asr_s32(vout3i * 8191 + 16384, 15);
- $if M == 1:
- // Note 32767 should be 32768 representing a multiply by 1.
- const int32_t vtmp0r = math_asr_s32(vout1r * 32767 + 16384, 15);
- const int32_t vtmp0i = math_asr_s32(vout1i * 32767 + 16384, 15);
- const int32_t vtmp1r = math_asr_s32(vout2r * 32767 + 16384, 15);
- const int32_t vtmp1i = math_asr_s32(vout2i * 32767 + 16384, 15);
- const int32_t vtmp2r = math_asr_s32(vout3r * 32767 + 16384, 15);
- const int32_t vtmp2i = math_asr_s32(vout3i * 32767 + 16384, 15);
- $else:
- const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15);
- const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15);
- const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15);
- const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15);
- const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15);
- const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15);
+ const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15);
+ const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15);
+ const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15);
+ const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15);
+ const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15);
+ const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15);
const int32_t vtmp5r = vout0r - vtmp1r;
const int32_t vtmp5i = vout0i - vtmp1i;
@@ -224,28 +198,18 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}(
vout3r = vtmp5r - vtmp4i;
vout3i = vtmp5i + vtmp4r;
- $if M == 1:
- out0[0] = (int16_t) vout0r;
- out0[1] = (int16_t) vout0i;
- out0[2] = (int16_t) vout1r;
- out0[3] = (int16_t) vout1i;
- out0[4] = (int16_t) vout2r;
- out0[5] = (int16_t) vout2i;
- out0[6] = (int16_t) vout3r;
- out0[7] = (int16_t) vout3i;
- $else:
- out0[0] = (int16_t) vout0r;
- out0[1] = (int16_t) vout0i;
- out1[0] = (int16_t) vout1r;
- out1[1] = (int16_t) vout1i;
- out2[0] = (int16_t) vout2r;
- out2[1] = (int16_t) vout2i;
- out3[0] = (int16_t) vout3r;
- out3[1] = (int16_t) vout3i;
- out0 += 2;
- out1 += 2;
- out2 += 2;
- out3 += 2;
+ data0[0] = (int16_t) vout0r;
+ data0[1] = (int16_t) vout0i;
+ data1[0] = (int16_t) vout1r;
+ data1[1] = (int16_t) vout1i;
+ data2[0] = (int16_t) vout2r;
+ data2[1] = (int16_t) vout2i;
+ data3[0] = (int16_t) vout3r;
+ data3[1] = (int16_t) vout3i;
+ data0 += 2;
+ data1 += 2;
+ data2 += 2;
+ data3 += 2;
} while(--samples != 0);
}
}
diff --git a/src/cs16-fftr/gen/scalar-x1.c b/src/cs16-fftr/gen/scalar-x1.c
index 2131ed918..f04dda53c 100644
--- a/src/cs16-fftr/gen/scalar-x1.c
+++ b/src/cs16-fftr/gen/scalar-x1.c
@@ -18,8 +18,8 @@
void xnn_cs16_fftr_ukernel__scalar_x1(
size_t samples,
int16_t* data,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
assert(samples >= 2);
assert(samples % 2 == 0);
assert(data != NULL);
diff --git a/src/cs16-fftr/gen/scalar-x2.c b/src/cs16-fftr/gen/scalar-x2.c
index 83dc7c20c..b8d279c19 100644
--- a/src/cs16-fftr/gen/scalar-x2.c
+++ b/src/cs16-fftr/gen/scalar-x2.c
@@ -18,8 +18,8 @@
void xnn_cs16_fftr_ukernel__scalar_x2(
size_t samples,
int16_t* data,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
assert(samples >= 2);
assert(samples % 2 == 0);
assert(data != NULL);
diff --git a/src/cs16-fftr/gen/scalar-x3.c b/src/cs16-fftr/gen/scalar-x3.c
index 24d016848..2ab829abe 100644
--- a/src/cs16-fftr/gen/scalar-x3.c
+++ b/src/cs16-fftr/gen/scalar-x3.c
@@ -18,8 +18,8 @@
void xnn_cs16_fftr_ukernel__scalar_x3(
size_t samples,
int16_t* data,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
assert(samples >= 2);
assert(samples % 2 == 0);
assert(data != NULL);
diff --git a/src/cs16-fftr/gen/scalar-x4.c b/src/cs16-fftr/gen/scalar-x4.c
index be23e44c1..16a295aa0 100644
--- a/src/cs16-fftr/gen/scalar-x4.c
+++ b/src/cs16-fftr/gen/scalar-x4.c
@@ -18,8 +18,8 @@
void xnn_cs16_fftr_ukernel__scalar_x4(
size_t samples,
int16_t* data,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
assert(samples >= 2);
assert(samples % 2 == 0);
assert(data != NULL);
diff --git a/src/cs16-fftr/scalar.c.in b/src/cs16-fftr/scalar.c.in
index 8af6c6c88..3c075d259 100644
--- a/src/cs16-fftr/scalar.c.in
+++ b/src/cs16-fftr/scalar.c.in
@@ -15,8 +15,8 @@ $assert SAMPLE_TILE >= 1
void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}(
size_t samples,
int16_t* data,
- const int16_t* twiddle) {
-
+ const int16_t* twiddle)
+{
assert(samples >= 2);
assert(samples % 2 == 0);
assert(data != NULL);
diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h
index 90d9bacc4..de5a75361 100644
--- a/src/xnnpack/fft.h
+++ b/src/xnnpack/fft.h
@@ -26,8 +26,8 @@ DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x1)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x2)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x3)
DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x4)
-DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar)
-DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__neon)
+DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_samples1_ukernel__scalar)
+DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_samples1_ukernel__neon)
#define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
diff --git a/test/cs16-bfly4.cc b/test/cs16-bfly4.cc
index 67e8cb7ed..933b1b3f4 100644
--- a/test/cs16-bfly4.cc
+++ b/test/cs16-bfly4.cc
@@ -18,12 +18,12 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
- TEST(CS16_BFLY4M1__NEON, samples_eq_1) {
+ TEST(CS16_BFLY4_SAMPLES1__NEON, samples_eq_1) {
TEST_REQUIRES_ARM_NEON;
BFly4MicrokernelTester()
.samples(1)
.stride(64)
- .Test(xnn_cs16_bfly4m1_ukernel__neon);
+ .Test(xnn_cs16_bfly4_samples1_ukernel__neon);
}
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
@@ -144,9 +144,9 @@ TEST(CS16_BFLY4__SCALAR_X4, samples_eq_64) {
}
-TEST(CS16_BFLY4M1__SCALAR, samples_eq_1) {
+TEST(CS16_BFLY4_SAMPLES1__SCALAR, samples_eq_1) {
BFly4MicrokernelTester()
.samples(1)
.stride(64)
- .Test(xnn_cs16_bfly4m1_ukernel__scalar);
+ .Test(xnn_cs16_bfly4_samples1_ukernel__scalar);
}
diff --git a/test/cs16-bfly4.yaml b/test/cs16-bfly4.yaml
index 5f24af79a..e42f1a70a 100644
--- a/test/cs16-bfly4.yaml
+++ b/test/cs16-bfly4.yaml
@@ -5,11 +5,11 @@
# NEON
-- name: xnn_cs16_bfly4m1_ukernel__neon
+- name: xnn_cs16_bfly4_samples1_ukernel__neon
# Scalar
- name: xnn_cs16_bfly4_ukernel__scalar_x1
- name: xnn_cs16_bfly4_ukernel__scalar_x2
- name: xnn_cs16_bfly4_ukernel__scalar_x3
- name: xnn_cs16_bfly4_ukernel__scalar_x4
-- name: xnn_cs16_bfly4m1_ukernel__scalar
+- name: xnn_cs16_bfly4_samples1_ukernel__scalar
diff --git a/tools/generate-bfly4-test.py b/tools/generate-bfly4-test.py
index 1bd2a31a7..f24b455ab 100755
--- a/tools/generate-bfly4-test.py
+++ b/tools/generate-bfly4-test.py
@@ -27,17 +27,19 @@ parser.set_defaults(defines=list())
def split_ukernel_name(name):
- m = 0
- samples_tile = 1
- match = re.fullmatch(r"xnn_cs16_bfly4(m(\d+))?_ukernel__(.+)(_x(\d+))?", name)
- assert match is not None
+ match = re.fullmatch(r"xnn_cs16_bfly4(_samples(\d+))?_ukernel__(.+)(_x(\d+))?", name)
+ assert match is not None, name
if match.group(2):
- m = int(match.group(2))
+ samples = int(match.group(2))
+ else:
+ samples = 0
if match.group(5):
samples_tile = int(match.group(5))
+ else:
+ samples_tile = 1
arch, isa = xnncommon.parse_target_name(target_name=match.group(3))
- return m, samples_tile, arch, isa
+ return samples, samples_tile, arch, isa
BFLY4_TEST_TEMPLATE = """\
@@ -50,7 +52,7 @@ TEST(${TEST_NAME}, samples_eq_1) {
.Test(${", ".join(TEST_ARGS)});
}
-$if M == 0:
+$if SAMPLES == 0:
TEST(${TEST_NAME}, samples_eq_4) {
$if ISA_CHECK:
${ISA_CHECK};
@@ -81,12 +83,12 @@ $if M == 0:
"""
-def generate_test_cases(ukernel, m, samples_tile, isa):
+def generate_test_cases(ukernel, samples, samples_tile, isa):
"""Generates all test cases for a BFly4 micro-kernel.
Args:
ukernel: C name of the micro-kernel function.
- m: fixed number of samples for specialized m1 microkernel.
+ samples: fixed number of samples for specialized samples1 microkernel.
samples_tile: Number of samples processed per one iteration of the inner
loop of the micro-kernel.
isa: instruction set required to run the micro-kernel. Generated unit test
@@ -101,7 +103,7 @@ def generate_test_cases(ukernel, m, samples_tile, isa):
"TEST_NAME": test_name.upper().replace("UKERNEL_", ""),
"TEST_ARGS": [ukernel],
"DATATYPE": datatype,
- "M": m,
+ "SAMPLES": samples,
"SAMPLE_TILE": samples_tile,
"ISA_CHECK": xnncommon.generate_isa_check_macro(isa),
"next_prime": next_prime,
@@ -138,12 +140,12 @@ def main(args):
for ukernel_spec in spec_yaml:
name = ukernel_spec["name"]
- m, samples_tile, arch, isa = split_ukernel_name(name)
+ samples, samples_tile, arch, isa = split_ukernel_name(name)
# specification can override architecture
arch = ukernel_spec.get("arch", arch)
- test_case = generate_test_cases(name, m, samples_tile, isa)
+ test_case = generate_test_cases(name, samples, samples_tile, isa)
tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa)
txt_changed = True