diff options
author | Marat Dukhan <maratek@google.com> | 2022-09-01 10:54:21 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-09-01 10:55:16 -0700 |
commit | 0eaea5648189017ea286d1df37c0ca34f8302e25 (patch) | |
tree | 6c8a8bf711bb074d3606e413bf7fadf6c60ac00c | |
parent | 63ba5160296fe81926f56ea89b9fb3ee1a7b7a42 (diff) | |
download | XNNPACK-0eaea5648189017ea286d1df37c0ca34f8302e25.tar.gz |
Harmonize naming of specialized BFLY4 microkernels
- Rename m1 to samples1
- Separate samples1 from microkernel name by underscore
PiperOrigin-RevId: 471574436
-rw-r--r-- | BUILD.bazel | 4 | ||||
-rwxr-xr-x | CMakeLists.txt | 4 | ||||
-rw-r--r-- | bench/cs16-bfly4.cc | 6 | ||||
-rwxr-xr-x | scripts/generate-cs16-bfly4.sh | 8 | ||||
-rw-r--r-- | src/cs16-bfly4/gen/scalar-x1.c | 56 | ||||
-rw-r--r-- | src/cs16-bfly4/gen/scalar-x2.c | 128 | ||||
-rw-r--r-- | src/cs16-bfly4/gen/scalar-x3.c | 160 | ||||
-rw-r--r-- | src/cs16-bfly4/gen/scalar-x4.c | 192 | ||||
-rw-r--r-- | src/cs16-bfly4/neon-samples1.c (renamed from src/cs16-bfly4/neon-m1.c) | 6 | ||||
-rw-r--r-- | src/cs16-bfly4/scalar-samples1.c (renamed from src/cs16-bfly4/scalar-m1.c) | 6 | ||||
-rw-r--r-- | src/cs16-bfly4/scalar.c.in | 172 | ||||
-rw-r--r-- | src/cs16-fftr/gen/scalar-x1.c | 4 | ||||
-rw-r--r-- | src/cs16-fftr/gen/scalar-x2.c | 4 | ||||
-rw-r--r-- | src/cs16-fftr/gen/scalar-x3.c | 4 | ||||
-rw-r--r-- | src/cs16-fftr/gen/scalar-x4.c | 4 | ||||
-rw-r--r-- | src/cs16-fftr/scalar.c.in | 4 | ||||
-rw-r--r-- | src/xnnpack/fft.h | 4 | ||||
-rw-r--r-- | test/cs16-bfly4.cc | 8 | ||||
-rw-r--r-- | test/cs16-bfly4.yaml | 4 | ||||
-rwxr-xr-x | tools/generate-bfly4-test.py | 26 |
20 files changed, 385 insertions, 419 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index f69155098..1cc5b7031 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -622,7 +622,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [ "src/cs16-bfly4/gen/scalar-x2.c", "src/cs16-bfly4/gen/scalar-x3.c", "src/cs16-bfly4/gen/scalar-x4.c", - "src/cs16-bfly4/scalar-m1.c", + "src/cs16-bfly4/scalar-samples1.c", "src/cs16-fftr/gen/scalar-x1.c", "src/cs16-fftr/gen/scalar-x2.c", "src/cs16-fftr/gen/scalar-x3.c", @@ -3153,7 +3153,7 @@ PROD_NEON_MICROKERNEL_SRCS = [ ] ALL_NEON_MICROKERNEL_SRCS = [ - "src/cs16-bfly4/neon-m1.c", + "src/cs16-bfly4/neon-samples1.c", "src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c", "src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c", "src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index a9a7d390b..cb002e3f4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -610,7 +610,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS src/cs16-bfly4/gen/scalar-x2.c src/cs16-bfly4/gen/scalar-x3.c src/cs16-bfly4/gen/scalar-x4.c - src/cs16-bfly4/scalar-m1.c + src/cs16-bfly4/scalar-samples1.c src/cs16-fftr/gen/scalar-x1.c src/cs16-fftr/gen/scalar-x2.c src/cs16-fftr/gen/scalar-x3.c @@ -1647,7 +1647,7 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/x32-zip/x4-neon.c) SET(ALL_NEON_MICROKERNEL_SRCS - src/cs16-bfly4/neon-m1.c + src/cs16-bfly4/neon-samples1.c src/cs16-vsquareabs/gen/neon-mlal-ld128-x4.c src/cs16-vsquareabs/gen/neon-mlal-ld128-x8.c src/cs16-vsquareabs/gen/neon-mlal-ld128-x12.c diff --git a/bench/cs16-bfly4.cc b/bench/cs16-bfly4.cc index 724c85da9..9cfdfa2dc 100644 --- a/bench/cs16-bfly4.cc +++ b/bench/cs16-bfly4.cc @@ -67,7 +67,7 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) b->Args({1024, 1, 256}); } -static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b) +static void BenchmarkSamples1KernelSize(benchmark::internal::Benchmark* b) { b->ArgNames({"fft_size", "samples", "stride"}); b->Args({256, 1, 64}); @@ -75,10 +75,10 @@ static void BenchmarkM1KernelSize(benchmark::internal::Benchmark* b) } #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -BENCHMARK_CAPTURE(cs16_bfly4, cs16_neon_m1, xnn_cs16_bfly4m1_ukernel__neon)->Apply(BenchmarkM1KernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(cs16_bfly4, cs16_neon_m1, xnn_cs16_bfly4_samples1_ukernel__neon)->Apply(BenchmarkSamples1KernelSize)->UseRealTime(); #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4m1_ukernel__scalar)->Apply(BenchmarkM1KernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_m1, xnn_cs16_bfly4_samples1_ukernel__scalar)->Apply(BenchmarkSamples1KernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime(); BENCHMARK_CAPTURE(cs16_bfly4, cs16_scalar_x3, xnn_cs16_bfly4_ukernel__scalar_x3)->Apply(BenchmarkKernelSize)->UseRealTime(); diff --git a/scripts/generate-cs16-bfly4.sh b/scripts/generate-cs16-bfly4.sh index 992f61162..a4f404fa8 100755 --- a/scripts/generate-cs16-bfly4.sh +++ b/scripts/generate-cs16-bfly4.sh @@ -5,10 +5,10 @@ # LICENSE file in the root directory of this source tree. ################################### SCALAR ################################### -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=1 -D M=0 -o src/cs16-bfly4/gen/scalar-x1.c & -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=2 -D M=0 -o src/cs16-bfly4/gen/scalar-x2.c & -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=3 -D M=0 -o src/cs16-bfly4/gen/scalar-x3.c & -tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=4 -D M=0 -o src/cs16-bfly4/gen/scalar-x4.c & +tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=1 -o src/cs16-bfly4/gen/scalar-x1.c & +tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=2 -o src/cs16-bfly4/gen/scalar-x2.c & +tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=3 -o src/cs16-bfly4/gen/scalar-x3.c & +tools/xngen src/cs16-bfly4/scalar.c.in -D SAMPLE_TILE=4 -o src/cs16-bfly4/gen/scalar-x4.c & ################################## Unit tests ################################# tools/generate-bfly4-test.py --spec test/cs16-bfly4.yaml --output test/cs16-bfly4.cc & diff --git a/src/cs16-bfly4/gen/scalar-x1.c b/src/cs16-bfly4/gen/scalar-x1.c index 89dd4b8eb..e38cf64eb 100644 --- a/src/cs16-bfly4/gen/scalar-x1.c +++ b/src/cs16-bfly4/gen/scalar-x1.c @@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x1( size_t samples, int16_t* data, const size_t stride, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ const int16_t* tw1 = twiddle; - const int16_t* tw2 = tw1; - const int16_t* tw3 = tw1; - int16_t* out0 = data; - int16_t* out1 = data + samples * 2; - int16_t* out2 = data + samples * 4; - int16_t* out3 = data + samples * 6; + const int16_t* tw2 = twiddle; + const int16_t* tw3 = twiddle; + int16_t* data0 = data; + int16_t* data1 = data + samples * 2; + int16_t* data2 = data + samples * 4; + int16_t* data3 = data + samples * 6; assert(samples != 0); assert(data != NULL); @@ -37,14 +37,14 @@ void xnn_cs16_bfly4_ukernel__scalar_x1( if XNN_UNLIKELY(samples != 0) { do { - int32_t vout0r = (int32_t) out0[0]; - int32_t vout0i = (int32_t) out0[1]; - int32_t vout1r = (int32_t) out1[0]; - int32_t vout1i = (int32_t) out1[1]; - int32_t vout2r = (int32_t) out2[0]; - int32_t vout2i = (int32_t) out2[1]; - int32_t vout3r = (int32_t) out3[0]; - int32_t vout3i = (int32_t) out3[1]; + int32_t vout0r = (int32_t) data0[0]; + int32_t vout0i = (int32_t) data0[1]; + int32_t vout1r = (int32_t) data1[0]; + int32_t vout1i = (int32_t) data1[1]; + int32_t vout2r = (int32_t) data2[0]; + int32_t vout2i = (int32_t) data2[1]; + int32_t vout3r = (int32_t) data3[0]; + int32_t vout3i = (int32_t) data3[1]; const int32_t vtw1r = (const int32_t) tw1[0]; const int32_t vtw1i = (const int32_t) tw1[1]; @@ -92,18 +92,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x1( vout3r = vtmp5r - vtmp4i; vout3i = vtmp5i + vtmp4r; - out0[0] = (int16_t) vout0r; - out0[1] = (int16_t) vout0i; - out1[0] = (int16_t) vout1r; - out1[1] = (int16_t) vout1i; - out2[0] = (int16_t) vout2r; - out2[1] = (int16_t) vout2i; - out3[0] = (int16_t) vout3r; - out3[1] = (int16_t) vout3i; - out0 += 2; - out1 += 2; - out2 += 2; - out3 += 2; + data0[0] = (int16_t) vout0r; + data0[1] = (int16_t) vout0i; + data1[0] = (int16_t) vout1r; + data1[1] = (int16_t) vout1i; + data2[0] = (int16_t) vout2r; + data2[1] = (int16_t) vout2i; + data3[0] = (int16_t) vout3r; + data3[1] = (int16_t) vout3i; + data0 += 2; + data1 += 2; + data2 += 2; + data3 += 2; } while(--samples != 0); } } diff --git a/src/cs16-bfly4/gen/scalar-x2.c b/src/cs16-bfly4/gen/scalar-x2.c index 99bd8653e..214973041 100644 --- a/src/cs16-bfly4/gen/scalar-x2.c +++ b/src/cs16-bfly4/gen/scalar-x2.c @@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x2( size_t samples, int16_t* data, const size_t stride, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ const int16_t* tw1 = twiddle; - const int16_t* tw2 = tw1; - const int16_t* tw3 = tw1; - int16_t* out0 = data; - int16_t* out1 = data + samples * 2; - int16_t* out2 = data + samples * 4; - int16_t* out3 = data + samples * 6; + const int16_t* tw2 = twiddle; + const int16_t* tw3 = twiddle; + int16_t* data0 = data; + int16_t* data1 = data + samples * 2; + int16_t* data2 = data + samples * 4; + int16_t* data3 = data + samples * 6; assert(samples != 0); assert(data != NULL); @@ -35,22 +35,22 @@ void xnn_cs16_bfly4_ukernel__scalar_x2( assert(twiddle != NULL); for (; samples >= 2; samples -= 2) { - int32_t vout0r0 = (int32_t) out0[0]; - int32_t vout0i0 = (int32_t) out0[1]; - int32_t vout0r1 = (int32_t) out0[2]; - int32_t vout0i1 = (int32_t) out0[3]; - int32_t vout1r0 = (int32_t) out1[0]; - int32_t vout1i0 = (int32_t) out1[1]; - int32_t vout1r1 = (int32_t) out1[2]; - int32_t vout1i1 = (int32_t) out1[3]; - int32_t vout2r0 = (int32_t) out2[0]; - int32_t vout2i0 = (int32_t) out2[1]; - int32_t vout2r1 = (int32_t) out2[2]; - int32_t vout2i1 = (int32_t) out2[3]; - int32_t vout3r0 = (int32_t) out3[0]; - int32_t vout3i0 = (int32_t) out3[1]; - int32_t vout3r1 = (int32_t) out3[2]; - int32_t vout3i1 = (int32_t) out3[3]; + int32_t vout0r0 = (int32_t) data0[0]; + int32_t vout0i0 = (int32_t) data0[1]; + int32_t vout0r1 = (int32_t) data0[2]; + int32_t vout0i1 = (int32_t) data0[3]; + int32_t vout1r0 = (int32_t) data1[0]; + int32_t vout1i0 = (int32_t) data1[1]; + int32_t vout1r1 = (int32_t) data1[2]; + int32_t vout1i1 = (int32_t) data1[3]; + int32_t vout2r0 = (int32_t) data2[0]; + int32_t vout2i0 = (int32_t) data2[1]; + int32_t vout2r1 = (int32_t) data2[2]; + int32_t vout2i1 = (int32_t) data2[3]; + int32_t vout3r0 = (int32_t) data3[0]; + int32_t vout3i0 = (int32_t) data3[1]; + int32_t vout3r1 = (int32_t) data3[2]; + int32_t vout3i1 = (int32_t) data3[3]; const int32_t vtw1r0 = (const int32_t) tw1[0]; const int32_t vtw1i0 = (const int32_t) tw1[1]; @@ -135,38 +135,38 @@ void xnn_cs16_bfly4_ukernel__scalar_x2( vout3i0 = vtmp5i0 + vtmp4r0; vout3i1 = vtmp5i1 + vtmp4r1; - out0[0] = (int16_t) vout0r0; - out0[1] = (int16_t) vout0i0; - out0[2] = (int16_t) vout0r1; - out0[3] = (int16_t) vout0i1; - out0 += 2 * 2; - out1[0] = (int16_t) vout1r0; - out1[1] = (int16_t) vout1i0; - out1[2] = (int16_t) vout1r1; - out1[3] = (int16_t) vout1i1; - out1 += 2 * 2; - out2[0] = (int16_t) vout2r0; - out2[1] = (int16_t) vout2i0; - out2[2] = (int16_t) vout2r1; - out2[3] = (int16_t) vout2i1; - out2 += 2 * 2; - out3[0] = (int16_t) vout3r0; - out3[1] = (int16_t) vout3i0; - out3[2] = (int16_t) vout3r1; - out3[3] = (int16_t) vout3i1; - out3 += 2 * 2; + data0[0] = (int16_t) vout0r0; + data0[1] = (int16_t) vout0i0; + data0[2] = (int16_t) vout0r1; + data0[3] = (int16_t) vout0i1; + data0 += 2 * 2; + data1[0] = (int16_t) vout1r0; + data1[1] = (int16_t) vout1i0; + data1[2] = (int16_t) vout1r1; + data1[3] = (int16_t) vout1i1; + data1 += 2 * 2; + data2[0] = (int16_t) vout2r0; + data2[1] = (int16_t) vout2i0; + data2[2] = (int16_t) vout2r1; + data2[3] = (int16_t) vout2i1; + data2 += 2 * 2; + data3[0] = (int16_t) vout3r0; + data3[1] = (int16_t) vout3i0; + data3[2] = (int16_t) vout3r1; + data3[3] = (int16_t) vout3i1; + data3 += 2 * 2; } if XNN_UNLIKELY(samples != 0) { do { - int32_t vout0r = (int32_t) out0[0]; - int32_t vout0i = (int32_t) out0[1]; - int32_t vout1r = (int32_t) out1[0]; - int32_t vout1i = (int32_t) out1[1]; - int32_t vout2r = (int32_t) out2[0]; - int32_t vout2i = (int32_t) out2[1]; - int32_t vout3r = (int32_t) out3[0]; - int32_t vout3i = (int32_t) out3[1]; + int32_t vout0r = (int32_t) data0[0]; + int32_t vout0i = (int32_t) data0[1]; + int32_t vout1r = (int32_t) data1[0]; + int32_t vout1i = (int32_t) data1[1]; + int32_t vout2r = (int32_t) data2[0]; + int32_t vout2i = (int32_t) data2[1]; + int32_t vout3r = (int32_t) data3[0]; + int32_t vout3i = (int32_t) data3[1]; const int32_t vtw1r = (const int32_t) tw1[0]; const int32_t vtw1i = (const int32_t) tw1[1]; @@ -214,18 +214,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x2( vout3r = vtmp5r - vtmp4i; vout3i = vtmp5i + vtmp4r; - out0[0] = (int16_t) vout0r; - out0[1] = (int16_t) vout0i; - out1[0] = (int16_t) vout1r; - out1[1] = (int16_t) vout1i; - out2[0] = (int16_t) vout2r; - out2[1] = (int16_t) vout2i; - out3[0] = (int16_t) vout3r; - out3[1] = (int16_t) vout3i; - out0 += 2; - out1 += 2; - out2 += 2; - out3 += 2; + data0[0] = (int16_t) vout0r; + data0[1] = (int16_t) vout0i; + data1[0] = (int16_t) vout1r; + data1[1] = (int16_t) vout1i; + data2[0] = (int16_t) vout2r; + data2[1] = (int16_t) vout2i; + data3[0] = (int16_t) vout3r; + data3[1] = (int16_t) vout3i; + data0 += 2; + data1 += 2; + data2 += 2; + data3 += 2; } while(--samples != 0); } } diff --git a/src/cs16-bfly4/gen/scalar-x3.c b/src/cs16-bfly4/gen/scalar-x3.c index fcb487110..68afa72bf 100644 --- a/src/cs16-bfly4/gen/scalar-x3.c +++ b/src/cs16-bfly4/gen/scalar-x3.c @@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x3( size_t samples, int16_t* data, const size_t stride, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ const int16_t* tw1 = twiddle; - const int16_t* tw2 = tw1; - const int16_t* tw3 = tw1; - int16_t* out0 = data; - int16_t* out1 = data + samples * 2; - int16_t* out2 = data + samples * 4; - int16_t* out3 = data + samples * 6; + const int16_t* tw2 = twiddle; + const int16_t* tw3 = twiddle; + int16_t* data0 = data; + int16_t* data1 = data + samples * 2; + int16_t* data2 = data + samples * 4; + int16_t* data3 = data + samples * 6; assert(samples != 0); assert(data != NULL); @@ -35,30 +35,30 @@ void xnn_cs16_bfly4_ukernel__scalar_x3( assert(twiddle != NULL); for (; samples >= 3; samples -= 3) { - int32_t vout0r0 = (int32_t) out0[0]; - int32_t vout0i0 = (int32_t) out0[1]; - int32_t vout0r1 = (int32_t) out0[2]; - int32_t vout0i1 = (int32_t) out0[3]; - int32_t vout0r2 = (int32_t) out0[4]; - int32_t vout0i2 = (int32_t) out0[5]; - int32_t vout1r0 = (int32_t) out1[0]; - int32_t vout1i0 = (int32_t) out1[1]; - int32_t vout1r1 = (int32_t) out1[2]; - int32_t vout1i1 = (int32_t) out1[3]; - int32_t vout1r2 = (int32_t) out1[4]; - int32_t vout1i2 = (int32_t) out1[5]; - int32_t vout2r0 = (int32_t) out2[0]; - int32_t vout2i0 = (int32_t) out2[1]; - int32_t vout2r1 = (int32_t) out2[2]; - int32_t vout2i1 = (int32_t) out2[3]; - int32_t vout2r2 = (int32_t) out2[4]; - int32_t vout2i2 = (int32_t) out2[5]; - int32_t vout3r0 = (int32_t) out3[0]; - int32_t vout3i0 = (int32_t) out3[1]; - int32_t vout3r1 = (int32_t) out3[2]; - int32_t vout3i1 = (int32_t) out3[3]; - int32_t vout3r2 = (int32_t) out3[4]; - int32_t vout3i2 = (int32_t) out3[5]; + int32_t vout0r0 = (int32_t) data0[0]; + int32_t vout0i0 = (int32_t) data0[1]; + int32_t vout0r1 = (int32_t) data0[2]; + int32_t vout0i1 = (int32_t) data0[3]; + int32_t vout0r2 = (int32_t) data0[4]; + int32_t vout0i2 = (int32_t) data0[5]; + int32_t vout1r0 = (int32_t) data1[0]; + int32_t vout1i0 = (int32_t) data1[1]; + int32_t vout1r1 = (int32_t) data1[2]; + int32_t vout1i1 = (int32_t) data1[3]; + int32_t vout1r2 = (int32_t) data1[4]; + int32_t vout1i2 = (int32_t) data1[5]; + int32_t vout2r0 = (int32_t) data2[0]; + int32_t vout2i0 = (int32_t) data2[1]; + int32_t vout2r1 = (int32_t) data2[2]; + int32_t vout2i1 = (int32_t) data2[3]; + int32_t vout2r2 = (int32_t) data2[4]; + int32_t vout2i2 = (int32_t) data2[5]; + int32_t vout3r0 = (int32_t) data3[0]; + int32_t vout3i0 = (int32_t) data3[1]; + int32_t vout3r1 = (int32_t) data3[2]; + int32_t vout3i1 = (int32_t) data3[3]; + int32_t vout3r2 = (int32_t) data3[4]; + int32_t vout3i2 = (int32_t) data3[5]; const int32_t vtw1r0 = (const int32_t) tw1[0]; const int32_t vtw1i0 = (const int32_t) tw1[1]; @@ -182,46 +182,46 @@ void xnn_cs16_bfly4_ukernel__scalar_x3( vout3i1 = vtmp5i1 + vtmp4r1; vout3i2 = vtmp5i2 + vtmp4r2; - out0[0] = (int16_t) vout0r0; - out0[1] = (int16_t) vout0i0; - out0[2] = (int16_t) vout0r1; - out0[3] = (int16_t) vout0i1; - out0[4] = (int16_t) vout0r2; - out0[5] = (int16_t) vout0i2; - out0 += 3 * 2; - out1[0] = (int16_t) vout1r0; - out1[1] = (int16_t) vout1i0; - out1[2] = (int16_t) vout1r1; - out1[3] = (int16_t) vout1i1; - out1[4] = (int16_t) vout1r2; - out1[5] = (int16_t) vout1i2; - out1 += 3 * 2; - out2[0] = (int16_t) vout2r0; - out2[1] = (int16_t) vout2i0; - out2[2] = (int16_t) vout2r1; - out2[3] = (int16_t) vout2i1; - out2[4] = (int16_t) vout2r2; - out2[5] = (int16_t) vout2i2; - out2 += 3 * 2; - out3[0] = (int16_t) vout3r0; - out3[1] = (int16_t) vout3i0; - out3[2] = (int16_t) vout3r1; - out3[3] = (int16_t) vout3i1; - out3[4] = (int16_t) vout3r2; - out3[5] = (int16_t) vout3i2; - out3 += 3 * 2; + data0[0] = (int16_t) vout0r0; + data0[1] = (int16_t) vout0i0; + data0[2] = (int16_t) vout0r1; + data0[3] = (int16_t) vout0i1; + data0[4] = (int16_t) vout0r2; + data0[5] = (int16_t) vout0i2; + data0 += 3 * 2; + data1[0] = (int16_t) vout1r0; + data1[1] = (int16_t) vout1i0; + data1[2] = (int16_t) vout1r1; + data1[3] = (int16_t) vout1i1; + data1[4] = (int16_t) vout1r2; + data1[5] = (int16_t) vout1i2; + data1 += 3 * 2; + data2[0] = (int16_t) vout2r0; + data2[1] = (int16_t) vout2i0; + data2[2] = (int16_t) vout2r1; + data2[3] = (int16_t) vout2i1; + data2[4] = (int16_t) vout2r2; + data2[5] = (int16_t) vout2i2; + data2 += 3 * 2; + data3[0] = (int16_t) vout3r0; + data3[1] = (int16_t) vout3i0; + data3[2] = (int16_t) vout3r1; + data3[3] = (int16_t) vout3i1; + data3[4] = (int16_t) vout3r2; + data3[5] = (int16_t) vout3i2; + data3 += 3 * 2; } if XNN_UNLIKELY(samples != 0) { do { - int32_t vout0r = (int32_t) out0[0]; - int32_t vout0i = (int32_t) out0[1]; - int32_t vout1r = (int32_t) out1[0]; - int32_t vout1i = (int32_t) out1[1]; - int32_t vout2r = (int32_t) out2[0]; - int32_t vout2i = (int32_t) out2[1]; - int32_t vout3r = (int32_t) out3[0]; - int32_t vout3i = (int32_t) out3[1]; + int32_t vout0r = (int32_t) data0[0]; + int32_t vout0i = (int32_t) data0[1]; + int32_t vout1r = (int32_t) data1[0]; + int32_t vout1i = (int32_t) data1[1]; + int32_t vout2r = (int32_t) data2[0]; + int32_t vout2i = (int32_t) data2[1]; + int32_t vout3r = (int32_t) data3[0]; + int32_t vout3i = (int32_t) data3[1]; const int32_t vtw1r = (const int32_t) tw1[0]; const int32_t vtw1i = (const int32_t) tw1[1]; @@ -269,18 +269,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x3( vout3r = vtmp5r - vtmp4i; vout3i = vtmp5i + vtmp4r; - out0[0] = (int16_t) vout0r; - out0[1] = (int16_t) vout0i; - out1[0] = (int16_t) vout1r; - out1[1] = (int16_t) vout1i; - out2[0] = (int16_t) vout2r; - out2[1] = (int16_t) vout2i; - out3[0] = (int16_t) vout3r; - out3[1] = (int16_t) vout3i; - out0 += 2; - out1 += 2; - out2 += 2; - out3 += 2; + data0[0] = (int16_t) vout0r; + data0[1] = (int16_t) vout0i; + data1[0] = (int16_t) vout1r; + data1[1] = (int16_t) vout1i; + data2[0] = (int16_t) vout2r; + data2[1] = (int16_t) vout2i; + data3[0] = (int16_t) vout3r; + data3[1] = (int16_t) vout3i; + data0 += 2; + data1 += 2; + data2 += 2; + data3 += 2; } while(--samples != 0); } } diff --git a/src/cs16-bfly4/gen/scalar-x4.c b/src/cs16-bfly4/gen/scalar-x4.c index ac7117c1e..37c0a8f6b 100644 --- a/src/cs16-bfly4/gen/scalar-x4.c +++ b/src/cs16-bfly4/gen/scalar-x4.c @@ -19,15 +19,15 @@ void xnn_cs16_bfly4_ukernel__scalar_x4( size_t samples, int16_t* data, const size_t stride, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ const int16_t* tw1 = twiddle; - const int16_t* tw2 = tw1; - const int16_t* tw3 = tw1; - int16_t* out0 = data; - int16_t* out1 = data + samples * 2; - int16_t* out2 = data + samples * 4; - int16_t* out3 = data + samples * 6; + const int16_t* tw2 = twiddle; + const int16_t* tw3 = twiddle; + int16_t* data0 = data; + int16_t* data1 = data + samples * 2; + int16_t* data2 = data + samples * 4; + int16_t* data3 = data + samples * 6; assert(samples != 0); assert(data != NULL); @@ -35,38 +35,38 @@ void xnn_cs16_bfly4_ukernel__scalar_x4( assert(twiddle != NULL); for (; samples >= 4; samples -= 4) { - int32_t vout0r0 = (int32_t) out0[0]; - int32_t vout0i0 = (int32_t) out0[1]; - int32_t vout0r1 = (int32_t) out0[2]; - int32_t vout0i1 = (int32_t) out0[3]; - int32_t vout0r2 = (int32_t) out0[4]; - int32_t vout0i2 = (int32_t) out0[5]; - int32_t vout0r3 = (int32_t) out0[6]; - int32_t vout0i3 = (int32_t) out0[7]; - int32_t vout1r0 = (int32_t) out1[0]; - int32_t vout1i0 = (int32_t) out1[1]; - int32_t vout1r1 = (int32_t) out1[2]; - int32_t vout1i1 = (int32_t) out1[3]; - int32_t vout1r2 = (int32_t) out1[4]; - int32_t vout1i2 = (int32_t) out1[5]; - int32_t vout1r3 = (int32_t) out1[6]; - int32_t vout1i3 = (int32_t) out1[7]; - int32_t vout2r0 = (int32_t) out2[0]; - int32_t vout2i0 = (int32_t) out2[1]; - int32_t vout2r1 = (int32_t) out2[2]; - int32_t vout2i1 = (int32_t) out2[3]; - int32_t vout2r2 = (int32_t) out2[4]; - int32_t vout2i2 = (int32_t) out2[5]; - int32_t vout2r3 = (int32_t) out2[6]; - int32_t vout2i3 = (int32_t) out2[7]; - int32_t vout3r0 = (int32_t) out3[0]; - int32_t vout3i0 = (int32_t) out3[1]; - int32_t vout3r1 = (int32_t) out3[2]; - int32_t vout3i1 = (int32_t) out3[3]; - int32_t vout3r2 = (int32_t) out3[4]; - int32_t vout3i2 = (int32_t) out3[5]; - int32_t vout3r3 = (int32_t) out3[6]; - int32_t vout3i3 = (int32_t) out3[7]; + int32_t vout0r0 = (int32_t) data0[0]; + int32_t vout0i0 = (int32_t) data0[1]; + int32_t vout0r1 = (int32_t) data0[2]; + int32_t vout0i1 = (int32_t) data0[3]; + int32_t vout0r2 = (int32_t) data0[4]; + int32_t vout0i2 = (int32_t) data0[5]; + int32_t vout0r3 = (int32_t) data0[6]; + int32_t vout0i3 = (int32_t) data0[7]; + int32_t vout1r0 = (int32_t) data1[0]; + int32_t vout1i0 = (int32_t) data1[1]; + int32_t vout1r1 = (int32_t) data1[2]; + int32_t vout1i1 = (int32_t) data1[3]; + int32_t vout1r2 = (int32_t) data1[4]; + int32_t vout1i2 = (int32_t) data1[5]; + int32_t vout1r3 = (int32_t) data1[6]; + int32_t vout1i3 = (int32_t) data1[7]; + int32_t vout2r0 = (int32_t) data2[0]; + int32_t vout2i0 = (int32_t) data2[1]; + int32_t vout2r1 = (int32_t) data2[2]; + int32_t vout2i1 = (int32_t) data2[3]; + int32_t vout2r2 = (int32_t) data2[4]; + int32_t vout2i2 = (int32_t) data2[5]; + int32_t vout2r3 = (int32_t) data2[6]; + int32_t vout2i3 = (int32_t) data2[7]; + int32_t vout3r0 = (int32_t) data3[0]; + int32_t vout3i0 = (int32_t) data3[1]; + int32_t vout3r1 = (int32_t) data3[2]; + int32_t vout3i1 = (int32_t) data3[3]; + int32_t vout3r2 = (int32_t) data3[4]; + int32_t vout3i2 = (int32_t) data3[5]; + int32_t vout3r3 = (int32_t) data3[6]; + int32_t vout3i3 = (int32_t) data3[7]; const int32_t vtw1r0 = (const int32_t) tw1[0]; const int32_t vtw1i0 = (const int32_t) tw1[1]; @@ -229,54 +229,54 @@ void xnn_cs16_bfly4_ukernel__scalar_x4( vout3i2 = vtmp5i2 + vtmp4r2; vout3i3 = vtmp5i3 + vtmp4r3; - out0[0] = (int16_t) vout0r0; - out0[1] = (int16_t) vout0i0; - out0[2] = (int16_t) vout0r1; - out0[3] = (int16_t) vout0i1; - out0[4] = (int16_t) vout0r2; - out0[5] = (int16_t) vout0i2; - out0[6] = (int16_t) vout0r3; - out0[7] = (int16_t) vout0i3; - out0 += 4 * 2; - out1[0] = (int16_t) vout1r0; - out1[1] = (int16_t) vout1i0; - out1[2] = (int16_t) vout1r1; - out1[3] = (int16_t) vout1i1; - out1[4] = (int16_t) vout1r2; - out1[5] = (int16_t) vout1i2; - out1[6] = (int16_t) vout1r3; - out1[7] = (int16_t) vout1i3; - out1 += 4 * 2; - out2[0] = (int16_t) vout2r0; - out2[1] = (int16_t) vout2i0; - out2[2] = (int16_t) vout2r1; - out2[3] = (int16_t) vout2i1; - out2[4] = (int16_t) vout2r2; - out2[5] = (int16_t) vout2i2; - out2[6] = (int16_t) vout2r3; - out2[7] = (int16_t) vout2i3; - out2 += 4 * 2; - out3[0] = (int16_t) vout3r0; - out3[1] = (int16_t) vout3i0; - out3[2] = (int16_t) vout3r1; - out3[3] = (int16_t) vout3i1; - out3[4] = (int16_t) vout3r2; - out3[5] = (int16_t) vout3i2; - out3[6] = (int16_t) vout3r3; - out3[7] = (int16_t) vout3i3; - out3 += 4 * 2; + data0[0] = (int16_t) vout0r0; + data0[1] = (int16_t) vout0i0; + data0[2] = (int16_t) vout0r1; + data0[3] = (int16_t) vout0i1; + data0[4] = (int16_t) vout0r2; + data0[5] = (int16_t) vout0i2; + data0[6] = (int16_t) vout0r3; + data0[7] = (int16_t) vout0i3; + data0 += 4 * 2; + data1[0] = (int16_t) vout1r0; + data1[1] = (int16_t) vout1i0; + data1[2] = (int16_t) vout1r1; + data1[3] = (int16_t) vout1i1; + data1[4] = (int16_t) vout1r2; + data1[5] = (int16_t) vout1i2; + data1[6] = (int16_t) vout1r3; + data1[7] = (int16_t) vout1i3; + data1 += 4 * 2; + data2[0] = (int16_t) vout2r0; + data2[1] = (int16_t) vout2i0; + data2[2] = (int16_t) vout2r1; + data2[3] = (int16_t) vout2i1; + data2[4] = (int16_t) vout2r2; + data2[5] = (int16_t) vout2i2; + data2[6] = (int16_t) vout2r3; + data2[7] = (int16_t) vout2i3; + data2 += 4 * 2; + data3[0] = (int16_t) vout3r0; + data3[1] = (int16_t) vout3i0; + data3[2] = (int16_t) vout3r1; + data3[3] = (int16_t) vout3i1; + data3[4] = (int16_t) vout3r2; + data3[5] = (int16_t) vout3i2; + data3[6] = (int16_t) vout3r3; + data3[7] = (int16_t) vout3i3; + data3 += 4 * 2; } if XNN_UNLIKELY(samples != 0) { do { - int32_t vout0r = (int32_t) out0[0]; - int32_t vout0i = (int32_t) out0[1]; - int32_t vout1r = (int32_t) out1[0]; - int32_t vout1i = (int32_t) out1[1]; - int32_t vout2r = (int32_t) out2[0]; - int32_t vout2i = (int32_t) out2[1]; - int32_t vout3r = (int32_t) out3[0]; - int32_t vout3i = (int32_t) out3[1]; + int32_t vout0r = (int32_t) data0[0]; + int32_t vout0i = (int32_t) data0[1]; + int32_t vout1r = (int32_t) data1[0]; + int32_t vout1i = (int32_t) data1[1]; + int32_t vout2r = (int32_t) data2[0]; + int32_t vout2i = (int32_t) data2[1]; + int32_t vout3r = (int32_t) data3[0]; + int32_t vout3i = (int32_t) data3[1]; const int32_t vtw1r = (const int32_t) tw1[0]; const int32_t vtw1i = (const int32_t) tw1[1]; @@ -324,18 +324,18 @@ void xnn_cs16_bfly4_ukernel__scalar_x4( vout3r = vtmp5r - vtmp4i; vout3i = vtmp5i + vtmp4r; - out0[0] = (int16_t) vout0r; - out0[1] = (int16_t) vout0i; - out1[0] = (int16_t) vout1r; - out1[1] = (int16_t) vout1i; - out2[0] = (int16_t) vout2r; - out2[1] = (int16_t) vout2i; - out3[0] = (int16_t) vout3r; - out3[1] = (int16_t) vout3i; - out0 += 2; - out1 += 2; - out2 += 2; - out3 += 2; + data0[0] = (int16_t) vout0r; + data0[1] = (int16_t) vout0i; + data1[0] = (int16_t) vout1r; + data1[1] = (int16_t) vout1i; + data2[0] = (int16_t) vout2r; + data2[1] = (int16_t) vout2i; + data3[0] = (int16_t) vout3r; + data3[1] = (int16_t) vout3i; + data0 += 2; + data1 += 2; + data2 += 2; + data3 += 2; } while(--samples != 0); } } diff --git a/src/cs16-bfly4/neon-m1.c b/src/cs16-bfly4/neon-samples1.c index cb9d60b40..55f663020 100644 --- a/src/cs16-bfly4/neon-m1.c +++ b/src/cs16-bfly4/neon-samples1.c @@ -13,12 +13,12 @@ #include <arm_neon.h> -void xnn_cs16_bfly4m1_ukernel__neon( +void xnn_cs16_bfly4_samples1_ukernel__neon( size_t samples, int16_t* data, const size_t stride, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ assert(samples == 1); assert(data != NULL); assert(stride != 0); diff --git a/src/cs16-bfly4/scalar-m1.c b/src/cs16-bfly4/scalar-samples1.c index d0f2af509..2f3226786 100644 --- a/src/cs16-bfly4/scalar-m1.c +++ b/src/cs16-bfly4/scalar-samples1.c @@ -11,12 +11,12 @@ #include <xnnpack/fft.h> -void xnn_cs16_bfly4m1_ukernel__scalar( +void xnn_cs16_bfly4_samples1_ukernel__scalar( size_t samples, int16_t* data, const size_t stride, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ assert(samples == 1); assert(data != NULL); assert(stride != 0); diff --git a/src/cs16-bfly4/scalar.c.in b/src/cs16-bfly4/scalar.c.in index 23d5bfcbd..42c8e278c 100644 --- a/src/cs16-bfly4/scalar.c.in +++ b/src/cs16-bfly4/scalar.c.in @@ -12,27 +12,21 @@ $assert SAMPLE_TILE >= 1 #include <xnnpack/fft.h> -$VARIANT = "m%s" % M if M else "" -void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}( +void xnn_cs16_bfly4_ukernel__scalar_x${SAMPLE_TILE}( size_t samples, int16_t* data, const size_t stride, - const int16_t* twiddle) { - - $if M != 1: - const int16_t* tw1 = twiddle; - const int16_t* tw2 = tw1; - const int16_t* tw3 = tw1; - int16_t* out0 = data; - $if M != 1: - int16_t* out1 = data + samples * 2; - int16_t* out2 = data + samples * 4; - int16_t* out3 = data + samples * 6; - - $if M != 0: - assert(samples == ${M}); - $else: - assert(samples != 0); + const int16_t* twiddle) +{ + const int16_t* tw1 = twiddle; + const int16_t* tw2 = twiddle; + const int16_t* tw3 = twiddle; + int16_t* data0 = data; + int16_t* data1 = data + samples * 2; + int16_t* data2 = data + samples * 4; + int16_t* data3 = data + samples * 6; + + assert(samples != 0); assert(data != NULL); assert(stride != 0); assert(twiddle != NULL); @@ -40,17 +34,17 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}( $if SAMPLE_TILE > 1: for (; samples >= ${SAMPLE_TILE}; samples -= ${SAMPLE_TILE}) { $for C in range(SAMPLE_TILE): - int32_t vout0r${C} = (int32_t) out0[${C * 2 + 0}]; - int32_t vout0i${C} = (int32_t) out0[${C * 2 + 1}]; + int32_t vout0r${C} = (int32_t) data0[${C * 2 + 0}]; + int32_t vout0i${C} = (int32_t) data0[${C * 2 + 1}]; $for C in range(SAMPLE_TILE): - int32_t vout1r${C} = (int32_t) out1[${C * 2 + 0}]; - int32_t vout1i${C} = (int32_t) out1[${C * 2 + 1}]; + int32_t vout1r${C} = (int32_t) data1[${C * 2 + 0}]; + int32_t vout1i${C} = (int32_t) data1[${C * 2 + 1}]; $for C in range(SAMPLE_TILE): - int32_t vout2r${C} = (int32_t) out2[${C * 2 + 0}]; - int32_t vout2i${C} = (int32_t) out2[${C * 2 + 1}]; + int32_t vout2r${C} = (int32_t) data2[${C * 2 + 0}]; + int32_t vout2i${C} = (int32_t) data2[${C * 2 + 1}]; $for C in range(SAMPLE_TILE): - int32_t vout3r${C} = (int32_t) out3[${C * 2 + 0}]; - int32_t vout3i${C} = (int32_t) out3[${C * 2 + 1}]; + int32_t vout3r${C} = (int32_t) data3[${C * 2 + 0}]; + int32_t vout3i${C} = (int32_t) data3[${C * 2 + 1}]; $for C in range(SAMPLE_TILE): const int32_t vtw1r${C} = (const int32_t) tw1[0]; @@ -130,54 +124,43 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}( vout3i${C} = vtmp5i${C} + vtmp4r${C}; $for C in range(SAMPLE_TILE): - out0[${C * 2 + 0}] = (int16_t) vout0r${C}; - out0[${C * 2 + 1}] = (int16_t) vout0i${C}; - out0 += ${SAMPLE_TILE} * 2; + data0[${C * 2 + 0}] = (int16_t) vout0r${C}; + data0[${C * 2 + 1}] = (int16_t) vout0i${C}; + data0 += ${SAMPLE_TILE} * 2; $for C in range(SAMPLE_TILE): - out1[${C * 2 + 0}] = (int16_t) vout1r${C}; - out1[${C * 2 + 1}] = (int16_t) vout1i${C}; - out1 += ${SAMPLE_TILE} * 2; + data1[${C * 2 + 0}] = (int16_t) vout1r${C}; + data1[${C * 2 + 1}] = (int16_t) vout1i${C}; + data1 += ${SAMPLE_TILE} * 2; $for C in range(SAMPLE_TILE): - out2[${C * 2 + 0}] = (int16_t) vout2r${C}; - out2[${C * 2 + 1}] = (int16_t) vout2i${C}; - out2 += ${SAMPLE_TILE} * 2; + data2[${C * 2 + 0}] = (int16_t) vout2r${C}; + data2[${C * 2 + 1}] = (int16_t) vout2i${C}; + data2 += ${SAMPLE_TILE} * 2; $for C in range(SAMPLE_TILE): - out3[${C * 2 + 0}] = (int16_t) vout3r${C}; - out3[${C * 2 + 1}] = (int16_t) vout3i${C}; - out3 += ${SAMPLE_TILE} * 2; + data3[${C * 2 + 0}] = (int16_t) vout3r${C}; + data3[${C * 2 + 1}] = (int16_t) vout3i${C}; + data3 += ${SAMPLE_TILE} * 2; } if XNN_UNLIKELY(samples != 0) { do { - $if M == 1: - int32_t vout0r = (int32_t) out0[0]; - int32_t vout0i = (int32_t) out0[1]; - int32_t vout1r = (int32_t) out0[2]; - int32_t vout1i = (int32_t) out0[3]; - int32_t vout2r = (int32_t) out0[4]; - int32_t vout2i = (int32_t) out0[5]; - int32_t vout3r = (int32_t) out0[6]; - int32_t vout3i = (int32_t) out0[7]; - $else: - int32_t vout0r = (int32_t) out0[0]; - int32_t vout0i = (int32_t) out0[1]; - int32_t vout1r = (int32_t) out1[0]; - int32_t vout1i = (int32_t) out1[1]; - int32_t vout2r = (int32_t) out2[0]; - int32_t vout2i = (int32_t) out2[1]; - int32_t vout3r = (int32_t) out3[0]; - int32_t vout3i = (int32_t) out3[1]; - - $if M != 1: - const int32_t vtw1r = (const int32_t) tw1[0]; - const int32_t vtw1i = (const int32_t) tw1[1]; - const int32_t vtw2r = (const int32_t) tw2[0]; - const int32_t vtw2i = (const int32_t) tw2[1]; - const int32_t vtw3r = (const int32_t) tw3[0]; - const int32_t vtw3i = (const int32_t) tw3[1]; - tw1 += stride * 2; - tw2 += stride * 4; - tw3 += stride * 6; + int32_t vout0r = (int32_t) data0[0]; + int32_t vout0i = (int32_t) data0[1]; + int32_t vout1r = (int32_t) data1[0]; + int32_t vout1i = (int32_t) data1[1]; + int32_t vout2r = (int32_t) data2[0]; + int32_t vout2i = (int32_t) data2[1]; + int32_t vout3r = (int32_t) data3[0]; + int32_t vout3i = (int32_t) data3[1]; + + const int32_t vtw1r = (const int32_t) tw1[0]; + const int32_t vtw1i = (const int32_t) tw1[1]; + const int32_t vtw2r = (const int32_t) tw2[0]; + const int32_t vtw2i = (const int32_t) tw2[1]; + const int32_t vtw3r = (const int32_t) tw3[0]; + const int32_t vtw3i = (const int32_t) tw3[1]; + tw1 += stride * 2; + tw2 += stride * 4; + tw3 += stride * 6; // Note 32767 / 4 = 8191. Should be 8192. vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); @@ -189,21 +172,12 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}( vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); - $if M == 1: - // Note 32767 should be 32768 representing a multiply by 1. - const int32_t vtmp0r = math_asr_s32(vout1r * 32767 + 16384, 15); - const int32_t vtmp0i = math_asr_s32(vout1i * 32767 + 16384, 15); - const int32_t vtmp1r = math_asr_s32(vout2r * 32767 + 16384, 15); - const int32_t vtmp1i = math_asr_s32(vout2i * 32767 + 16384, 15); - const int32_t vtmp2r = math_asr_s32(vout3r * 32767 + 16384, 15); - const int32_t vtmp2i = math_asr_s32(vout3i * 32767 + 16384, 15); - $else: - const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15); - const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15); - const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15); - const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15); - const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15); - const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15); + const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15); + const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15); + const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15); + const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15); + const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15); + const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15); const int32_t vtmp5r = vout0r - vtmp1r; const int32_t vtmp5i = vout0i - vtmp1i; @@ -224,28 +198,18 @@ void xnn_cs16_bfly4${VARIANT}_ukernel__scalar_x${SAMPLE_TILE}( vout3r = vtmp5r - vtmp4i; vout3i = vtmp5i + vtmp4r; - $if M == 1: - out0[0] = (int16_t) vout0r; - out0[1] = (int16_t) vout0i; - out0[2] = (int16_t) vout1r; - out0[3] = (int16_t) vout1i; - out0[4] = (int16_t) vout2r; - out0[5] = (int16_t) vout2i; - out0[6] = (int16_t) vout3r; - out0[7] = (int16_t) vout3i; - $else: - out0[0] = (int16_t) vout0r; - out0[1] = (int16_t) vout0i; - out1[0] = (int16_t) vout1r; - out1[1] = (int16_t) vout1i; - out2[0] = (int16_t) vout2r; - out2[1] = (int16_t) vout2i; - out3[0] = (int16_t) vout3r; - out3[1] = (int16_t) vout3i; - out0 += 2; - out1 += 2; - out2 += 2; - out3 += 2; + data0[0] = (int16_t) vout0r; + data0[1] = (int16_t) vout0i; + data1[0] = (int16_t) vout1r; + data1[1] = (int16_t) vout1i; + data2[0] = (int16_t) vout2r; + data2[1] = (int16_t) vout2i; + data3[0] = (int16_t) vout3r; + data3[1] = (int16_t) vout3i; + data0 += 2; + data1 += 2; + data2 += 2; + data3 += 2; } while(--samples != 0); } } diff --git a/src/cs16-fftr/gen/scalar-x1.c b/src/cs16-fftr/gen/scalar-x1.c index 2131ed918..f04dda53c 100644 --- a/src/cs16-fftr/gen/scalar-x1.c +++ b/src/cs16-fftr/gen/scalar-x1.c @@ -18,8 +18,8 @@ void xnn_cs16_fftr_ukernel__scalar_x1( size_t samples, int16_t* data, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ assert(samples >= 2); assert(samples % 2 == 0); assert(data != NULL); diff --git a/src/cs16-fftr/gen/scalar-x2.c b/src/cs16-fftr/gen/scalar-x2.c index 83dc7c20c..b8d279c19 100644 --- a/src/cs16-fftr/gen/scalar-x2.c +++ b/src/cs16-fftr/gen/scalar-x2.c @@ -18,8 +18,8 @@ void xnn_cs16_fftr_ukernel__scalar_x2( size_t samples, int16_t* data, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ assert(samples >= 2); assert(samples % 2 == 0); assert(data != NULL); diff --git a/src/cs16-fftr/gen/scalar-x3.c b/src/cs16-fftr/gen/scalar-x3.c index 24d016848..2ab829abe 100644 --- a/src/cs16-fftr/gen/scalar-x3.c +++ b/src/cs16-fftr/gen/scalar-x3.c @@ -18,8 +18,8 @@ void xnn_cs16_fftr_ukernel__scalar_x3( size_t samples, int16_t* data, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ assert(samples >= 2); assert(samples % 2 == 0); assert(data != NULL); diff --git a/src/cs16-fftr/gen/scalar-x4.c b/src/cs16-fftr/gen/scalar-x4.c index be23e44c1..16a295aa0 100644 --- a/src/cs16-fftr/gen/scalar-x4.c +++ b/src/cs16-fftr/gen/scalar-x4.c @@ -18,8 +18,8 @@ void xnn_cs16_fftr_ukernel__scalar_x4( size_t samples, int16_t* data, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ assert(samples >= 2); assert(samples % 2 == 0); assert(data != NULL); diff --git a/src/cs16-fftr/scalar.c.in b/src/cs16-fftr/scalar.c.in index 8af6c6c88..3c075d259 100644 --- a/src/cs16-fftr/scalar.c.in +++ b/src/cs16-fftr/scalar.c.in @@ -15,8 +15,8 @@ $assert SAMPLE_TILE >= 1 void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}( size_t samples, int16_t* data, - const int16_t* twiddle) { - + const int16_t* twiddle) +{ assert(samples >= 2); assert(samples % 2 == 0); assert(data != NULL); diff --git a/src/xnnpack/fft.h b/src/xnnpack/fft.h index 90d9bacc4..de5a75361 100644 --- a/src/xnnpack/fft.h +++ b/src/xnnpack/fft.h @@ -26,8 +26,8 @@ DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x1) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x2) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x3) DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_ukernel__scalar_x4) -DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__scalar) -DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4m1_ukernel__neon) +DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_samples1_ukernel__scalar) +DECLARE_CS16_BFLY4_UKERNEL_FUNCTION(xnn_cs16_bfly4_samples1_ukernel__neon) #define DECLARE_CS16_FFTR_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/test/cs16-bfly4.cc b/test/cs16-bfly4.cc index 67e8cb7ed..933b1b3f4 100644 --- a/test/cs16-bfly4.cc +++ b/test/cs16-bfly4.cc @@ -18,12 +18,12 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(CS16_BFLY4M1__NEON, samples_eq_1) { + TEST(CS16_BFLY4_SAMPLES1__NEON, samples_eq_1) { TEST_REQUIRES_ARM_NEON; BFly4MicrokernelTester() .samples(1) .stride(64) - .Test(xnn_cs16_bfly4m1_ukernel__neon); + .Test(xnn_cs16_bfly4_samples1_ukernel__neon); } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -144,9 +144,9 @@ TEST(CS16_BFLY4__SCALAR_X4, samples_eq_64) { } -TEST(CS16_BFLY4M1__SCALAR, samples_eq_1) { +TEST(CS16_BFLY4_SAMPLES1__SCALAR, samples_eq_1) { BFly4MicrokernelTester() .samples(1) .stride(64) - .Test(xnn_cs16_bfly4m1_ukernel__scalar); + .Test(xnn_cs16_bfly4_samples1_ukernel__scalar); } diff --git a/test/cs16-bfly4.yaml b/test/cs16-bfly4.yaml index 5f24af79a..e42f1a70a 100644 --- a/test/cs16-bfly4.yaml +++ b/test/cs16-bfly4.yaml @@ -5,11 +5,11 @@ # NEON -- name: xnn_cs16_bfly4m1_ukernel__neon +- name: xnn_cs16_bfly4_samples1_ukernel__neon # Scalar - name: xnn_cs16_bfly4_ukernel__scalar_x1 - name: xnn_cs16_bfly4_ukernel__scalar_x2 - name: xnn_cs16_bfly4_ukernel__scalar_x3 - name: xnn_cs16_bfly4_ukernel__scalar_x4 -- name: xnn_cs16_bfly4m1_ukernel__scalar +- name: xnn_cs16_bfly4_samples1_ukernel__scalar diff --git a/tools/generate-bfly4-test.py b/tools/generate-bfly4-test.py index 1bd2a31a7..f24b455ab 100755 --- a/tools/generate-bfly4-test.py +++ b/tools/generate-bfly4-test.py @@ -27,17 +27,19 @@ parser.set_defaults(defines=list()) def split_ukernel_name(name): - m = 0 - samples_tile = 1 - match = re.fullmatch(r"xnn_cs16_bfly4(m(\d+))?_ukernel__(.+)(_x(\d+))?", name) - assert match is not None + match = re.fullmatch(r"xnn_cs16_bfly4(_samples(\d+))?_ukernel__(.+)(_x(\d+))?", name) + assert match is not None, name if match.group(2): - m = int(match.group(2)) + samples = int(match.group(2)) + else: + samples = 0 if match.group(5): samples_tile = int(match.group(5)) + else: + samples_tile = 1 arch, isa = xnncommon.parse_target_name(target_name=match.group(3)) - return m, samples_tile, arch, isa + return samples, samples_tile, arch, isa BFLY4_TEST_TEMPLATE = """\ @@ -50,7 +52,7 @@ TEST(${TEST_NAME}, samples_eq_1) { .Test(${", ".join(TEST_ARGS)}); } -$if M == 0: +$if SAMPLES == 0: TEST(${TEST_NAME}, samples_eq_4) { $if ISA_CHECK: ${ISA_CHECK}; @@ -81,12 +83,12 @@ $if M == 0: """ -def generate_test_cases(ukernel, m, samples_tile, isa): +def generate_test_cases(ukernel, samples, samples_tile, isa): """Generates all tests cases for a BFly4 micro-kernel. Args: ukernel: C name of the micro-kernel function. - m: fixed number of samples for specialized m1 microkernel. + samples: fixed number of samples for specialized samples1 microkernel. samples_tile: Number of samples processed per one iteration of the inner loop of the micro-kernel. isa: instruction set required to run the micro-kernel. Generated unit test @@ -101,7 +103,7 @@ def generate_test_cases(ukernel, m, samples_tile, isa): "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), "TEST_ARGS": [ukernel], "DATATYPE": datatype, - "M": m, + "SAMPLES": samples, "SAMPLE_TILE": samples_tile, "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), "next_prime": next_prime, @@ -138,12 +140,12 @@ def main(args): for ukernel_spec in spec_yaml: name = ukernel_spec["name"] - m, samples_tile, arch, isa = split_ukernel_name(name) + samples, samples_tile, arch, isa = split_ukernel_name(name) # specification can override architecture arch = ukernel_spec.get("arch", arch) - test_case = generate_test_cases(name, m, samples_tile, isa) + test_case = generate_test_cases(name, samples, samples_tile, isa) tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) txt_changed = True |