diff options
author | Alan Kelly <alankelly@google.com> | 2022-08-23 07:35:09 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-23 07:36:05 -0700 |
commit | ddbb37a87d134781f23d35fed4ad8262241a50ad (patch) | |
tree | f29f21ad65a094b32df3e7b0d94dcad3fa706e38 | |
parent | 0f51d3524b7d5ea4d40f24ab3aa9a5c0215bea50 (diff) | |
download | XNNPACK-ddbb37a87d134781f23d35fed4ad8262241a50ad.tar.gz |
Variable size transpose ukernels no longer assume that input and output element strides are equal to the element size.
PiperOrigin-RevId: 469454228
-rw-r--r-- | bench/xx-transpose.cc | 4 | ||||
-rw-r--r-- | src/operator-run.c | 20 | ||||
-rw-r--r-- | src/xnnpack/microfnptr.h | 6 | ||||
-rw-r--r-- | src/xnnpack/transpose.h | 16 | ||||
-rw-r--r-- | src/xx-transpose/1x1-memcpy.c | 18 | ||||
-rw-r--r-- | test/transpose-microkernel-tester.h | 44 | ||||
-rw-r--r-- | test/x16-transpose.cc | 1284 | ||||
-rw-r--r-- | test/x24-transpose.cc | 416 | ||||
-rw-r--r-- | test/x32-transpose.cc | 1774 | ||||
-rw-r--r-- | test/x64-transpose.cc | 610 | ||||
-rw-r--r-- | test/x8-transpose.cc | 850 | ||||
-rw-r--r-- | test/xx-transpose.cc | 111 |
12 files changed, 5121 insertions, 32 deletions
diff --git a/bench/xx-transpose.cc b/bench/xx-transpose.cc index a6441c4e1..ae596bbb6 100644 --- a/bench/xx-transpose.cc +++ b/bench/xx-transpose.cc @@ -41,8 +41,8 @@ void transpose( std::fill(y.begin(), y.end(), 0); for (auto _ : state) { - transpose(x.data(), y.data(), tile_wbytes, tile_hbytes, element_size, width, - height); + transpose(x.data(), y.data(), tile_wbytes, tile_hbytes, element_size, + element_size, element_size, width, height); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); diff --git a/src/operator-run.c b/src/operator-run.c index ce3b4d362..ab4d5ace5 100644 --- a/src/operator-run.c +++ b/src/operator-run.c @@ -159,13 +159,15 @@ void xnn_compute_transposev_2d( const size_t ld_output = context->output_stride[0]; const void* x = (const void*) ((uintptr_t) context->x + i * context->input_stride[0] + j * ld_input); - void* y = (void*) ((uintptr_t) context->y + element_size * j + i * context->output_stride[0]); + void* y = (void*) ((uintptr_t) context->y + context->output_stride[1] * j + i * context->output_stride[0]); context->variable_size_ukernel( x, y, ld_input, ld_output, + context->input_stride[0], + context->output_stride[1], element_size, tile_i, tile_j); @@ -185,13 +187,15 @@ void xnn_compute_transposev_3d( const void* x = (const void*)((uintptr_t)context->x + i * context->input_stride[0] + j * context->input_stride[1] + k * ld_input); void* y = (void*)((uintptr_t)context->y + i * context->output_stride[0] + j * context->output_stride[1] + - k * element_size); + k * context->output_stride[2]); context->variable_size_ukernel( x, y, ld_input, ld_output, + context->input_stride[1], + context->output_stride[2], element_size, tile_j, tile_k); @@ -211,7 +215,7 @@ void xnn_compute_transposev_4d( const size_t ld_output = context->output_stride[2]; const void* x = (const void*)((uintptr_t)context->x + i * context->input_stride[0] + j * context->input_stride[1] + k * context->input_stride[2] + l * ld_input); - void* y = 
(void*)((uintptr_t)context->y + element_size * l + i * context->output_stride[0] + + void* y = (void*)((uintptr_t)context->y + context->output_stride[3] * l + i * context->output_stride[0] + j * context->output_stride[1] + k * context->output_stride[2]); context->variable_size_ukernel( @@ -219,6 +223,8 @@ void xnn_compute_transposev_4d( y, ld_input, ld_output, + context->input_stride[2], + context->output_stride[3], element_size, tile_k, tile_l); @@ -239,7 +245,7 @@ void xnn_compute_transposev_5d( const size_t ld_output = context->output_stride[3]; const void* x = (const void*)((uintptr_t)context->x + i * context->input_stride[0] + j * context->input_stride[1] + k * context->input_stride[2] + l * context->input_stride[3] + m * ld_input); - void* y = (void*)((uintptr_t)context->y + element_size * m + i * context->output_stride[0] + + void* y = (void*)((uintptr_t)context->y + context->output_stride[4] * m + i * context->output_stride[0] + j * context->output_stride[1] + k * context->output_stride[2] + l * context->output_stride[3]); context->variable_size_ukernel( @@ -247,6 +253,8 @@ void xnn_compute_transposev_5d( y, ld_input, ld_output, + context->input_stride[3], + context->output_stride[4], element_size, tile_l, tile_m); @@ -269,7 +277,7 @@ void xnn_compute_transposev_6d( const void* x = (const void*)((uintptr_t)context->x + i * context->input_stride[0] + j * context->input_stride[1] + k * context->input_stride[2] + l * context->input_stride[3] + m * context->input_stride[4] + n * ld_input); - void* y = (void*)((uintptr_t)context->y + element_size * n + i * context->output_stride[0] + + void* y = (void*)((uintptr_t)context->y + context->output_stride[5] * n + i * context->output_stride[0] + j * context->output_stride[1] + k * context->output_stride[2] + l * context->output_stride[3] + m * context->output_stride[4]); @@ -278,6 +286,8 @@ void xnn_compute_transposev_6d( y, ld_input, ld_output, + context->input_stride[4], + context->output_stride[5], element_size, 
tile_m, tile_n); diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 8555ea44c..93525b6fe 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1009,8 +1009,10 @@ typedef void (*xnn_transposec_ukernel_function)( typedef void (*xnn_transposev_ukernel_function)( const void* input, void* output, - size_t input_stride, - size_t output_stride, + size_t input_row_stride, + size_t output_row_stride, + size_t input_element_stride, + size_t output_element_stride, size_t element_size, size_t block_width, size_t block_height); diff --git a/src/xnnpack/transpose.h b/src/xnnpack/transpose.h index 3634f6c8d..5e2a8afa9 100644 --- a/src/xnnpack/transpose.h +++ b/src/xnnpack/transpose.h @@ -14,13 +14,15 @@ extern "C" { #endif -#define DECLARE_XX_TRANSPOSEV_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name(const void* input, \ - void* output, \ - size_t input_stride, \ - size_t output_stride, \ - size_t element_size, \ - size_t block_width, \ +#define DECLARE_XX_TRANSPOSEV_UKERNEL_FUNCTION(fn_name) \ + XNN_INTERNAL void fn_name(const void* input, \ + void* output, \ + size_t input_row_stride, \ + size_t output_row_stride, \ + size_t input_element_stride, \ + size_t output_element_stride, \ + size_t element_size, \ + size_t block_width, \ size_t block_height); DECLARE_XX_TRANSPOSEV_UKERNEL_FUNCTION(xnn_xx_transposev_ukernel__1x1_memcpy) diff --git a/src/xx-transpose/1x1-memcpy.c b/src/xx-transpose/1x1-memcpy.c index e8e5cdb31..47a39beae 100644 --- a/src/xx-transpose/1x1-memcpy.c +++ b/src/xx-transpose/1x1-memcpy.c @@ -13,18 +13,16 @@ void xnn_xx_transposev_ukernel__1x1_memcpy( const void* input, void* output, - size_t input_stride, - size_t output_stride, + size_t input_row_stride, + size_t output_row_stride, + size_t input_element_stride, + size_t output_element_stride, size_t element_size, size_t block_width, size_t block_height) { - const size_t tile_height = 1; - const size_t tile_width = 1; - const size_t tile_wbytes = tile_width * 
element_size; - const size_t input_reset = tile_wbytes - block_height * input_stride; - const size_t output_reset = tile_width * output_stride - block_height * element_size; - const size_t input_offset = tile_height * input_stride; + const size_t input_reset = input_element_stride - block_height * input_row_stride; + const size_t output_reset = output_row_stride - block_height * output_element_stride; const void* i = (const void*) input; void* o = (void*) output; @@ -33,8 +31,8 @@ void xnn_xx_transposev_ukernel__1x1_memcpy( size_t bh = block_height; for (; bh >= 1; bh -= 1) { memcpy(o, i, element_size); - i = (const void*) ((uintptr_t) i + input_offset); - o = (void*) ((uintptr_t) o + element_size); + i = (const void*) ((uintptr_t) i + input_row_stride); + o = (void*) ((uintptr_t) o + output_element_stride); } i = (const void*) ((uintptr_t) i + input_reset); diff --git a/test/transpose-microkernel-tester.h b/test/transpose-microkernel-tester.h index 7fd79259d..c9c7f9f51 100644 --- a/test/transpose-microkernel-tester.h +++ b/test/transpose-microkernel-tester.h @@ -58,6 +58,34 @@ class TransposeMicrokernelTester { inline size_t output_stride() const { return this->output_stride_; } + inline TransposeMicrokernelTester& input_element_stride(size_t input_element_stride) { + assert(input_element_stride >= element_size_); + this->input_element_stride_ = input_element_stride; + return *this; + } + + inline size_t input_element_stride() const { + if (input_element_stride_ == 0) { + return element_size_; + } else { + return input_element_stride_; + } + } + + inline TransposeMicrokernelTester& output_element_stride(size_t output_element_stride) { + assert(output_element_stride >= element_size_); + this->output_element_stride_ = output_element_stride; + return *this; + } + + inline size_t output_element_stride() const { + if (output_element_stride_ == 0) { + return element_size_; + } else { + return output_element_stride_; + } + } + inline TransposeMicrokernelTester& 
iterations(size_t iterations) { this->iterations_ = iterations; return *this; @@ -66,16 +94,18 @@ class TransposeMicrokernelTester { inline size_t iterations() const { return this->iterations_; } void Test(xnn_transposev_ukernel_function transpose) const { - std::vector<uint8_t> input(input_stride() * block_height() * element_size() + XNN_EXTRA_BYTES); - std::vector<uint8_t> output(output_stride() * block_width() * element_size()); + std::vector<uint8_t> input(input_stride() * block_height() * input_element_stride() + XNN_EXTRA_BYTES); + std::vector<uint8_t> output(output_stride() * block_width() * output_element_stride()); std::iota(input.begin(), input.end(), 0); std::fill(output.begin(), output.end(), UINT8_C(0xA5)); // Call optimized micro-kernel. transpose(input.data(), output.data(), - input_stride() * element_size(), - output_stride() * element_size(), + input_stride() * input_element_stride(), + output_stride() * output_element_stride(), + input_element_stride(), + output_element_stride(), element_size(), block_width(), block_height()); @@ -83,8 +113,8 @@ class TransposeMicrokernelTester { // Verify results. 
for (size_t c = 0; c < block_width(); c++) { for (size_t r = 0; r < block_height(); r++) { - ASSERT_EQ(std::memcmp(&input[element_size() * (c+ r * input_stride())], - &output[element_size() * (r + c * output_stride())], + ASSERT_EQ(std::memcmp(&input[input_element_stride() * (c+ r * input_stride())], + &output[output_element_stride() * (r + c * output_stride())], element_size()), 0) << "at row " << r << " / " << block_height() << ", at column " << c << " / " << block_width(); @@ -226,6 +256,8 @@ class TransposeMicrokernelTester { size_t element_size_ = 1; size_t input_stride_ = 1; size_t output_stride_ = 1; + size_t input_element_stride_ = 0; + size_t output_element_stride_ = 0; size_t block_height_ = 1; size_t block_width_ = 1; size_t iterations_ = 15; diff --git a/test/x16-transpose.cc b/test/x16-transpose.cc index 5098ce75c..a40285df4 100644 --- a/test/x16-transpose.cc +++ b/test/x16-transpose.cc @@ -165,6 +165,43 @@ TEST(X16_TRANSPOSEC__1X2_SCALAR_INT_2, bh_1_bw_2_is_4_os_2) { .Test(xnn_x16_transposec_ukernel__1x2_scalar_int); } +TEST(X16_TRANSPOSEC__1X2_SCALAR_INT_2, bh_17_bw_38_ies_13) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(17) + .block_width(38) + .block_height(17) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__1x2_scalar_int); +} + +TEST(X16_TRANSPOSEC__1X2_SCALAR_INT_2, bh_3_bw_10_oes_13) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(3) + .block_width(10) + .block_height(3) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__1x2_scalar_int); +} + +TEST(X16_TRANSPOSEC__1X2_SCALAR_INT_2, bh_7_bw_46_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(13) + .block_width(46) + .block_height(7) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__1x2_scalar_int); +} + 
TEST(X16_TRANSPOSEC__1X4_SCALAR_INT_2, bh_1_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -313,6 +350,43 @@ TEST(X16_TRANSPOSEC__1X4_SCALAR_INT_2, bh_1_bw_4_is_8_os_2) { .Test(xnn_x16_transposec_ukernel__1x4_scalar_int); } +TEST(X16_TRANSPOSEC__1X4_SCALAR_INT_2, bh_17_bw_76_ies_13) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(17) + .block_width(76) + .block_height(17) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__1x4_scalar_int); +} + +TEST(X16_TRANSPOSEC__1X4_SCALAR_INT_2, bh_3_bw_20_oes_13) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(3) + .block_width(20) + .block_height(3) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__1x4_scalar_int); +} + +TEST(X16_TRANSPOSEC__1X4_SCALAR_INT_2, bh_7_bw_92_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(13) + .block_width(92) + .block_height(7) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__1x4_scalar_int); +} + TEST(X16_TRANSPOSEC__2X1_SCALAR_INT_2, bh_2_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -461,6 +535,43 @@ TEST(X16_TRANSPOSEC__2X1_SCALAR_INT_2, bh_2_bw_1_is_2_os_4) { .Test(xnn_x16_transposec_ukernel__2x1_scalar_int); } +TEST(X16_TRANSPOSEC__2X1_SCALAR_INT_2, bh_34_bw_19_ies_13) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(34) + .block_width(19) + .block_height(34) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x1_scalar_int); +} + +TEST(X16_TRANSPOSEC__2X1_SCALAR_INT_2, bh_6_bw_5_oes_13) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(6) + .block_width(5) + .block_height(6) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x1_scalar_int); +} + 
+TEST(X16_TRANSPOSEC__2X1_SCALAR_INT_2, bh_14_bw_23_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(20) + .block_width(23) + .block_height(14) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x1_scalar_int); +} + TEST(X16_TRANSPOSEC__2X2_SCALAR_INT_2, bh_2_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -609,6 +720,43 @@ TEST(X16_TRANSPOSEC__2X2_SCALAR_INT_2, bh_2_bw_2_is_4_os_4) { .Test(xnn_x16_transposec_ukernel__2x2_scalar_int); } +TEST(X16_TRANSPOSEC__2X2_SCALAR_INT_2, bh_34_bw_38_ies_13) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x2_scalar_int); +} + +TEST(X16_TRANSPOSEC__2X2_SCALAR_INT_2, bh_6_bw_10_oes_13) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x2_scalar_int); +} + +TEST(X16_TRANSPOSEC__2X2_SCALAR_INT_2, bh_14_bw_46_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x2_scalar_int); +} + TEST(X16_TRANSPOSEC__2X4_SCALAR_INT_2, bh_2_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -757,6 +905,43 @@ TEST(X16_TRANSPOSEC__2X4_SCALAR_INT_2, bh_2_bw_4_is_8_os_4) { .Test(xnn_x16_transposec_ukernel__2x4_scalar_int); } +TEST(X16_TRANSPOSEC__2X4_SCALAR_INT_2, bh_34_bw_76_ies_13) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(34) + .block_width(76) + .block_height(34) + .element_size(2) + .input_element_stride(13) + .iterations(1) + 
.Test(xnn_x16_transposec_ukernel__2x4_scalar_int); +} + +TEST(X16_TRANSPOSEC__2X4_SCALAR_INT_2, bh_6_bw_20_oes_13) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(6) + .block_width(20) + .block_height(6) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x4_scalar_int); +} + +TEST(X16_TRANSPOSEC__2X4_SCALAR_INT_2, bh_14_bw_92_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(20) + .block_width(92) + .block_height(14) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__2x4_scalar_int); +} + TEST(X16_TRANSPOSEC__4X1_SCALAR_INT_2, bh_4_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -905,6 +1090,43 @@ TEST(X16_TRANSPOSEC__4X1_SCALAR_INT_2, bh_4_bw_1_is_2_os_8) { .Test(xnn_x16_transposec_ukernel__4x1_scalar_int); } +TEST(X16_TRANSPOSEC__4X1_SCALAR_INT_2, bh_68_bw_19_ies_13) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(68) + .block_width(19) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x1_scalar_int); +} + +TEST(X16_TRANSPOSEC__4X1_SCALAR_INT_2, bh_12_bw_5_oes_13) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(12) + .block_width(5) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x1_scalar_int); +} + +TEST(X16_TRANSPOSEC__4X1_SCALAR_INT_2, bh_28_bw_23_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(34) + .block_width(23) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x1_scalar_int); +} + TEST(X16_TRANSPOSEC__4X2_SCALAR_INT_2, bh_4_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1053,6 +1275,43 @@ TEST(X16_TRANSPOSEC__4X2_SCALAR_INT_2, 
bh_4_bw_2_is_4_os_8) { .Test(xnn_x16_transposec_ukernel__4x2_scalar_int); } +TEST(X16_TRANSPOSEC__4X2_SCALAR_INT_2, bh_68_bw_38_ies_13) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(68) + .block_width(38) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x2_scalar_int); +} + +TEST(X16_TRANSPOSEC__4X2_SCALAR_INT_2, bh_12_bw_10_oes_13) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(12) + .block_width(10) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x2_scalar_int); +} + +TEST(X16_TRANSPOSEC__4X2_SCALAR_INT_2, bh_28_bw_46_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(34) + .block_width(46) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x2_scalar_int); +} + TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -1201,6 +1460,43 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .Test(xnn_x16_transposec_ukernel__4x4_scalar_int); } +TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_68_bw_76_ies_13) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_scalar_int); +} + +TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_12_bw_20_oes_13) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_scalar_int); +} + +TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_28_bw_92_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + 
.block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_scalar_int); +} + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(X16_TRANSPOSEC__4X8_SSE2_2, bh_4_bw_8) { TEST_REQUIRES_X86_SSE2; @@ -1361,6 +1657,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x8_sse2); } + + TEST(X16_TRANSPOSEC__4X8_SSE2_2, bh_68_bw_152_ies_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(68) + .block_width(152) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x8_sse2); + } + + TEST(X16_TRANSPOSEC__4X8_SSE2_2, bh_12_bw_40_oes_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(12) + .block_width(40) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x8_sse2); + } + + TEST(X16_TRANSPOSEC__4X8_SSE2_2, bh_28_bw_184_ies_19_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(34) + .block_width(184) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x8_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1524,6 +1860,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_sse2); } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_SSE2_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_SSE2_2, 
bh_24_bw_40_oes_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_SSE2_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1687,6 +2063,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_sse2); } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_SSE2_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_SSE2_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_SSE2_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1850,6 +2266,46 @@ 
TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_sse2); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_SSE2_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_SSE2_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_SSE2_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -2013,6 +2469,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_SSE2_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_SSE2_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + 
.Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_SSE2_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -2176,6 +2672,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_sse2); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_SWITCH_SSE2_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_SWITCH_SSE2_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_sse2); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_SWITCH_SSE2_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -2327,6 +2863,43 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_wasmsimd); } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_WASMSIMD_2, bh_136_bw_152_ies_13) { + 
TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_WASMSIMD_2, bh_24_bw_40_oes_13) { + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_WASMSIMD_2, bh_56_bw_184_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -2478,6 +3051,43 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_wasmsimd); } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_WASMSIMD_2, bh_136_bw_152_ies_13) { + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_WASMSIMD_2, bh_24_bw_40_oes_13) { + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_WASMSIMD_2, bh_56_bw_184_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + 
.input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -2629,6 +3239,43 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_wasmsimd); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_WASMSIMD_2, bh_136_bw_152_ies_13) { + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_WASMSIMD_2, bh_24_bw_40_oes_13) { + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_WASMSIMD_2, bh_56_bw_184_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -2780,6 +3427,43 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_wasmsimd); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_WASMSIMD_2, bh_136_bw_152_ies_13) { + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_WASMSIMD_2, bh_24_bw_40_oes_13) { + TransposeMicrokernelTester() + 
.input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_wasmsimd); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_WASMSIMD_2, bh_56_bw_184_ies_19_oes_15) { + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -2943,6 +3627,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x4_multi_dec_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_MULTI_DEC_ZIP_NEON_2, bh_68_bw_76_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_DEC_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_DEC_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3106,6 +3830,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) 
.Test(xnn_x16_transposec_ukernel__4x4_multi_mov_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_MULTI_MOV_ZIP_NEON_2, bh_68_bw_76_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_MOV_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_MOV_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3269,6 +4033,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x4_multi_multi_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_MULTI_MULTI_ZIP_NEON_2, bh_68_bw_76_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_multi_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_MULTI_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + 
.Test(xnn_x16_transposec_ukernel__4x4_multi_multi_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_MULTI_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3432,6 +4236,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x4_multi_switch_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_MULTI_SWITCH_ZIP_NEON_2, bh_68_bw_76_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_SWITCH_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_MULTI_SWITCH_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_multi_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3595,6 +4439,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x4_reuse_dec_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_REUSE_DEC_ZIP_NEON_2, bh_68_bw_76_ies_13) { + 
TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_DEC_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_DEC_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3758,6 +4642,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x4_reuse_mov_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_REUSE_MOV_ZIP_NEON_2, bh_68_bw_76_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_MOV_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_MOV_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + 
.input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3921,6 +4845,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x4_reuse_multi_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_REUSE_MULTI_ZIP_NEON_2, bh_68_bw_76_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_multi_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_MULTI_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_multi_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_MULTI_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -4084,6 +5048,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__4x4_reuse_switch_zip_neon); } + + TEST(X16_TRANSPOSEC__4X4_REUSE_SWITCH_ZIP_NEON_2, bh_68_bw_76_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(2) + .input_element_stride(13) + .iterations(1) + 
.Test(xnn_x16_transposec_ukernel__4x4_reuse_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_SWITCH_ZIP_NEON_2, bh_12_bw_20_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__4X4_REUSE_SWITCH_ZIP_NEON_2, bh_28_bw_92_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__4x4_reuse_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -4247,6 +5251,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_multi_dec_zip_neon); } + + TEST(X16_TRANSPOSEC__8X8_MULTI_DEC_ZIP_NEON_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_DEC_ZIP_NEON_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_DEC_ZIP_NEON_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + 
.Test(xnn_x16_transposec_ukernel__8x8_multi_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -4410,6 +5454,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_zip_neon); } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_ZIP_NEON_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_ZIP_NEON_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_MOV_ZIP_NEON_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -4573,6 +5657,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_zip_neon); } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_ZIP_NEON_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_ZIP_NEON_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_ARM_NEON; + 
TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_MULTI_SWITCH_ZIP_NEON_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_multi_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -4736,6 +5860,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_DEC_ZIP_NEON_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_DEC_ZIP_NEON_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_DEC_ZIP_NEON_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -4899,6 +6063,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, 
bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_zip_neon); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_ZIP_NEON_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_ZIP_NEON_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MOV_ZIP_NEON_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5062,6 +6266,46 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_zip_neon); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_ZIP_NEON_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_ZIP_NEON_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + 
.Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_MULTI_ZIP_NEON_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5225,4 +6469,44 @@ TEST(X16_TRANSPOSEC__4X4_SCALAR_INT_2, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_zip_neon); } + + TEST(X16_TRANSPOSEC__8X8_REUSE_SWITCH_ZIP_NEON_2, bh_136_bw_152_ies_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(2) + .input_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_SWITCH_ZIP_NEON_2, bh_24_bw_40_oes_13) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(2) + .output_element_stride(13) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_zip_neon); + } + + TEST(X16_TRANSPOSEC__8X8_REUSE_SWITCH_ZIP_NEON_2, bh_56_bw_184_ies_19_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(2) + .input_element_stride(19) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x16_transposec_ukernel__8x8_reuse_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/test/x24-transpose.cc b/test/x24-transpose.cc index f02c913e2..bf0a151d4 100644 --- a/test/x24-transpose.cc +++ b/test/x24-transpose.cc @@ -165,6 +165,43 @@ TEST(X24_TRANSPOSEC__1X2_SCALAR_3, bh_1_bw_2_is_4_os_2) 
{ .Test(xnn_x24_transposec_ukernel__1x2_scalar); } +TEST(X24_TRANSPOSEC__1X2_SCALAR_3, bh_17_bw_38_ies_14) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(17) + .block_width(38) + .block_height(17) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__1x2_scalar); +} + +TEST(X24_TRANSPOSEC__1X2_SCALAR_3, bh_3_bw_10_oes_14) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(3) + .block_width(10) + .block_height(3) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__1x2_scalar); +} + +TEST(X24_TRANSPOSEC__1X2_SCALAR_3, bh_7_bw_46_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(13) + .block_width(46) + .block_height(7) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__1x2_scalar); +} + TEST(X24_TRANSPOSEC__1X4_SCALAR_3, bh_1_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -313,6 +350,43 @@ TEST(X24_TRANSPOSEC__1X4_SCALAR_3, bh_1_bw_4_is_8_os_2) { .Test(xnn_x24_transposec_ukernel__1x4_scalar); } +TEST(X24_TRANSPOSEC__1X4_SCALAR_3, bh_17_bw_76_ies_14) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(17) + .block_width(76) + .block_height(17) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__1x4_scalar); +} + +TEST(X24_TRANSPOSEC__1X4_SCALAR_3, bh_3_bw_20_oes_14) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(3) + .block_width(20) + .block_height(3) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__1x4_scalar); +} + +TEST(X24_TRANSPOSEC__1X4_SCALAR_3, bh_7_bw_92_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(13) + .block_width(92) + .block_height(7) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + 
.iterations(1) + .Test(xnn_x24_transposec_ukernel__1x4_scalar); +} + TEST(X24_TRANSPOSEC__2X1_SCALAR_3, bh_2_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -461,6 +535,43 @@ TEST(X24_TRANSPOSEC__2X1_SCALAR_3, bh_2_bw_1_is_2_os_4) { .Test(xnn_x24_transposec_ukernel__2x1_scalar); } +TEST(X24_TRANSPOSEC__2X1_SCALAR_3, bh_34_bw_19_ies_14) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(34) + .block_width(19) + .block_height(34) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x1_scalar); +} + +TEST(X24_TRANSPOSEC__2X1_SCALAR_3, bh_6_bw_5_oes_14) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(6) + .block_width(5) + .block_height(6) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x1_scalar); +} + +TEST(X24_TRANSPOSEC__2X1_SCALAR_3, bh_14_bw_23_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(20) + .block_width(23) + .block_height(14) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x1_scalar); +} + TEST(X24_TRANSPOSEC__2X2_SCALAR_3, bh_2_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -609,6 +720,43 @@ TEST(X24_TRANSPOSEC__2X2_SCALAR_3, bh_2_bw_2_is_4_os_4) { .Test(xnn_x24_transposec_ukernel__2x2_scalar); } +TEST(X24_TRANSPOSEC__2X2_SCALAR_3, bh_34_bw_38_ies_14) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x2_scalar); +} + +TEST(X24_TRANSPOSEC__2X2_SCALAR_3, bh_6_bw_10_oes_14) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x2_scalar); +} + 
+TEST(X24_TRANSPOSEC__2X2_SCALAR_3, bh_14_bw_46_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x2_scalar); +} + TEST(X24_TRANSPOSEC__2X4_SCALAR_3, bh_2_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -757,6 +905,43 @@ TEST(X24_TRANSPOSEC__2X4_SCALAR_3, bh_2_bw_4_is_8_os_4) { .Test(xnn_x24_transposec_ukernel__2x4_scalar); } +TEST(X24_TRANSPOSEC__2X4_SCALAR_3, bh_34_bw_76_ies_14) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(34) + .block_width(76) + .block_height(34) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x4_scalar); +} + +TEST(X24_TRANSPOSEC__2X4_SCALAR_3, bh_6_bw_20_oes_14) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(6) + .block_width(20) + .block_height(6) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x4_scalar); +} + +TEST(X24_TRANSPOSEC__2X4_SCALAR_3, bh_14_bw_92_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(20) + .block_width(92) + .block_height(14) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x4_scalar); +} + TEST(X24_TRANSPOSEC__4X1_SCALAR_3, bh_4_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -905,6 +1090,43 @@ TEST(X24_TRANSPOSEC__4X1_SCALAR_3, bh_4_bw_1_is_2_os_8) { .Test(xnn_x24_transposec_ukernel__4x1_scalar); } +TEST(X24_TRANSPOSEC__4X1_SCALAR_3, bh_68_bw_19_ies_14) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(68) + .block_width(19) + .block_height(68) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x1_scalar); +} + +TEST(X24_TRANSPOSEC__4X1_SCALAR_3, 
bh_12_bw_5_oes_14) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(12) + .block_width(5) + .block_height(12) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x1_scalar); +} + +TEST(X24_TRANSPOSEC__4X1_SCALAR_3, bh_28_bw_23_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(34) + .block_width(23) + .block_height(28) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x1_scalar); +} + TEST(X24_TRANSPOSEC__4X2_SCALAR_3, bh_4_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1053,6 +1275,43 @@ TEST(X24_TRANSPOSEC__4X2_SCALAR_3, bh_4_bw_2_is_4_os_8) { .Test(xnn_x24_transposec_ukernel__4x2_scalar); } +TEST(X24_TRANSPOSEC__4X2_SCALAR_3, bh_68_bw_38_ies_14) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(68) + .block_width(38) + .block_height(68) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x2_scalar); +} + +TEST(X24_TRANSPOSEC__4X2_SCALAR_3, bh_12_bw_10_oes_14) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(12) + .block_width(10) + .block_height(12) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x2_scalar); +} + +TEST(X24_TRANSPOSEC__4X2_SCALAR_3, bh_28_bw_46_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(34) + .block_width(46) + .block_height(28) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x2_scalar); +} + TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_4_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -1201,6 +1460,43 @@ TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_4_bw_4_is_8_os_8) { .Test(xnn_x24_transposec_ukernel__4x4_scalar); } +TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_68_bw_76_ies_14) { + 
TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_scalar); +} + +TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_12_bw_20_oes_14) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_scalar); +} + +TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_28_bw_92_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_scalar); +} + #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(X24_TRANSPOSEC__2X2_NEON_TBL_3, bh_2_bw_2) { TEST_REQUIRES_ARM_NEON; @@ -1361,6 +1657,46 @@ TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x24_transposec_ukernel__2x2_neon_tbl); } + + TEST(X24_TRANSPOSEC__2X2_NEON_TBL_3, bh_34_bw_38_ies_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x2_neon_tbl); + } + + TEST(X24_TRANSPOSEC__2X2_NEON_TBL_3, bh_6_bw_10_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x2_neon_tbl); + } + + TEST(X24_TRANSPOSEC__2X2_NEON_TBL_3, bh_14_bw_46_ies_20_oes_16) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(3) + .input_element_stride(20) + 
.output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__2x2_neon_tbl); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -1524,6 +1860,46 @@ TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x24_transposec_ukernel__4x4_aarch64_neon_tbl); } + + TEST(X24_TRANSPOSEC__4X4_AARCH64_NEON_TBL_3, bh_68_bw_76_ies_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_aarch64_neon_tbl); + } + + TEST(X24_TRANSPOSEC__4X4_AARCH64_NEON_TBL_3, bh_12_bw_20_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_aarch64_neon_tbl); + } + + TEST(X24_TRANSPOSEC__4X4_AARCH64_NEON_TBL_3, bh_28_bw_92_ies_20_oes_16) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_aarch64_neon_tbl); + } #endif // XNN_ARCH_ARM64 @@ -1687,4 +2063,44 @@ TEST(X24_TRANSPOSEC__4X4_SCALAR_3, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x24_transposec_ukernel__4x4_ssse3); } + + TEST(X24_TRANSPOSEC__4X4_SSSE3_3, bh_68_bw_76_ies_14) { + TEST_REQUIRES_X86_SSSE3; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_ssse3); + } + + TEST(X24_TRANSPOSEC__4X4_SSSE3_3, bh_12_bw_20_oes_14) { + TEST_REQUIRES_X86_SSSE3; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + 
.block_height(12) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_ssse3); + } + + TEST(X24_TRANSPOSEC__4X4_SSSE3_3, bh_28_bw_92_ies_20_oes_16) { + TEST_REQUIRES_X86_SSSE3; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_x24_transposec_ukernel__4x4_ssse3); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/x32-transpose.cc b/test/x32-transpose.cc index 16e859013..d22307b83 100644 --- a/test/x32-transpose.cc +++ b/test/x32-transpose.cc @@ -165,6 +165,43 @@ TEST(X32_TRANSPOSEC__1X2_SCALAR_FLOAT_4, bh_1_bw_2_is_4_os_2) { .Test(xnn_x32_transposec_ukernel__1x2_scalar_float); } +TEST(X32_TRANSPOSEC__1X2_SCALAR_FLOAT_4, bh_17_bw_38_ies_15) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(17) + .block_width(38) + .block_height(17) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x2_scalar_float); +} + +TEST(X32_TRANSPOSEC__1X2_SCALAR_FLOAT_4, bh_3_bw_10_oes_15) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(3) + .block_width(10) + .block_height(3) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x2_scalar_float); +} + +TEST(X32_TRANSPOSEC__1X2_SCALAR_FLOAT_4, bh_7_bw_46_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(13) + .block_width(46) + .block_height(7) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x2_scalar_float); +} + TEST(X32_TRANSPOSEC__1X2_SCALAR_INT_4, bh_1_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -313,6 +350,43 @@ TEST(X32_TRANSPOSEC__1X2_SCALAR_INT_4, bh_1_bw_2_is_4_os_2) { .Test(xnn_x32_transposec_ukernel__1x2_scalar_int); } 
+TEST(X32_TRANSPOSEC__1X2_SCALAR_INT_4, bh_17_bw_38_ies_15) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(17) + .block_width(38) + .block_height(17) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x2_scalar_int); +} + +TEST(X32_TRANSPOSEC__1X2_SCALAR_INT_4, bh_3_bw_10_oes_15) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(3) + .block_width(10) + .block_height(3) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x2_scalar_int); +} + +TEST(X32_TRANSPOSEC__1X2_SCALAR_INT_4, bh_7_bw_46_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(13) + .block_width(46) + .block_height(7) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x2_scalar_int); +} + TEST(X32_TRANSPOSEC__1X4_SCALAR_FLOAT_4, bh_1_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -461,6 +535,43 @@ TEST(X32_TRANSPOSEC__1X4_SCALAR_FLOAT_4, bh_1_bw_4_is_8_os_2) { .Test(xnn_x32_transposec_ukernel__1x4_scalar_float); } +TEST(X32_TRANSPOSEC__1X4_SCALAR_FLOAT_4, bh_17_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(17) + .block_width(76) + .block_height(17) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x4_scalar_float); +} + +TEST(X32_TRANSPOSEC__1X4_SCALAR_FLOAT_4, bh_3_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(3) + .block_width(20) + .block_height(3) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x4_scalar_float); +} + +TEST(X32_TRANSPOSEC__1X4_SCALAR_FLOAT_4, bh_7_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(13) + .block_width(92) + .block_height(7) + .element_size(4) + .input_element_stride(21) + 
.output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x4_scalar_float); +} + TEST(X32_TRANSPOSEC__1X4_SCALAR_INT_4, bh_1_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -609,6 +720,43 @@ TEST(X32_TRANSPOSEC__1X4_SCALAR_INT_4, bh_1_bw_4_is_8_os_2) { .Test(xnn_x32_transposec_ukernel__1x4_scalar_int); } +TEST(X32_TRANSPOSEC__1X4_SCALAR_INT_4, bh_17_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(17) + .block_width(76) + .block_height(17) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x4_scalar_int); +} + +TEST(X32_TRANSPOSEC__1X4_SCALAR_INT_4, bh_3_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(3) + .block_width(20) + .block_height(3) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x4_scalar_int); +} + +TEST(X32_TRANSPOSEC__1X4_SCALAR_INT_4, bh_7_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(13) + .block_width(92) + .block_height(7) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__1x4_scalar_int); +} + TEST(X32_TRANSPOSEC__2X1_SCALAR_FLOAT_4, bh_2_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -757,6 +905,43 @@ TEST(X32_TRANSPOSEC__2X1_SCALAR_FLOAT_4, bh_2_bw_1_is_2_os_4) { .Test(xnn_x32_transposec_ukernel__2x1_scalar_float); } +TEST(X32_TRANSPOSEC__2X1_SCALAR_FLOAT_4, bh_34_bw_19_ies_15) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(34) + .block_width(19) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x1_scalar_float); +} + +TEST(X32_TRANSPOSEC__2X1_SCALAR_FLOAT_4, bh_6_bw_5_oes_15) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(6) + .block_width(5) + .block_height(6) + .element_size(4) + 
.output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x1_scalar_float); +} + +TEST(X32_TRANSPOSEC__2X1_SCALAR_FLOAT_4, bh_14_bw_23_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(20) + .block_width(23) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x1_scalar_float); +} + TEST(X32_TRANSPOSEC__2X1_SCALAR_INT_4, bh_2_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -905,6 +1090,43 @@ TEST(X32_TRANSPOSEC__2X1_SCALAR_INT_4, bh_2_bw_1_is_2_os_4) { .Test(xnn_x32_transposec_ukernel__2x1_scalar_int); } +TEST(X32_TRANSPOSEC__2X1_SCALAR_INT_4, bh_34_bw_19_ies_15) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(34) + .block_width(19) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x1_scalar_int); +} + +TEST(X32_TRANSPOSEC__2X1_SCALAR_INT_4, bh_6_bw_5_oes_15) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(6) + .block_width(5) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x1_scalar_int); +} + +TEST(X32_TRANSPOSEC__2X1_SCALAR_INT_4, bh_14_bw_23_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(20) + .block_width(23) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x1_scalar_int); +} + TEST(X32_TRANSPOSEC__2X2_SCALAR_FLOAT_4, bh_2_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1053,6 +1275,43 @@ TEST(X32_TRANSPOSEC__2X2_SCALAR_FLOAT_4, bh_2_bw_2_is_4_os_4) { .Test(xnn_x32_transposec_ukernel__2x2_scalar_float); } +TEST(X32_TRANSPOSEC__2X2_SCALAR_FLOAT_4, bh_34_bw_38_ies_15) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + 
.block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_scalar_float); +} + +TEST(X32_TRANSPOSEC__2X2_SCALAR_FLOAT_4, bh_6_bw_10_oes_15) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_scalar_float); +} + +TEST(X32_TRANSPOSEC__2X2_SCALAR_FLOAT_4, bh_14_bw_46_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_scalar_float); +} + TEST(X32_TRANSPOSEC__2X2_SCALAR_INT_4, bh_2_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1201,6 +1460,43 @@ TEST(X32_TRANSPOSEC__2X2_SCALAR_INT_4, bh_2_bw_2_is_4_os_4) { .Test(xnn_x32_transposec_ukernel__2x2_scalar_int); } +TEST(X32_TRANSPOSEC__2X2_SCALAR_INT_4, bh_34_bw_38_ies_15) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_scalar_int); +} + +TEST(X32_TRANSPOSEC__2X2_SCALAR_INT_4, bh_6_bw_10_oes_15) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_scalar_int); +} + +TEST(X32_TRANSPOSEC__2X2_SCALAR_INT_4, bh_14_bw_46_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_scalar_int); +} + TEST(X32_TRANSPOSEC__2X4_SCALAR_FLOAT_4, bh_2_bw_4) { 
TransposeMicrokernelTester() .input_stride(8) @@ -1349,6 +1645,43 @@ TEST(X32_TRANSPOSEC__2X4_SCALAR_FLOAT_4, bh_2_bw_4_is_8_os_4) { .Test(xnn_x32_transposec_ukernel__2x4_scalar_float); } +TEST(X32_TRANSPOSEC__2X4_SCALAR_FLOAT_4, bh_34_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(34) + .block_width(76) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x4_scalar_float); +} + +TEST(X32_TRANSPOSEC__2X4_SCALAR_FLOAT_4, bh_6_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(6) + .block_width(20) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x4_scalar_float); +} + +TEST(X32_TRANSPOSEC__2X4_SCALAR_FLOAT_4, bh_14_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(20) + .block_width(92) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x4_scalar_float); +} + TEST(X32_TRANSPOSEC__2X4_SCALAR_INT_4, bh_2_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -1497,6 +1830,43 @@ TEST(X32_TRANSPOSEC__2X4_SCALAR_INT_4, bh_2_bw_4_is_8_os_4) { .Test(xnn_x32_transposec_ukernel__2x4_scalar_int); } +TEST(X32_TRANSPOSEC__2X4_SCALAR_INT_4, bh_34_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(34) + .block_width(76) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x4_scalar_int); +} + +TEST(X32_TRANSPOSEC__2X4_SCALAR_INT_4, bh_6_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(6) + .block_width(20) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x4_scalar_int); +} + +TEST(X32_TRANSPOSEC__2X4_SCALAR_INT_4, 
bh_14_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(20) + .block_width(92) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x4_scalar_int); +} + TEST(X32_TRANSPOSEC__4X1_SCALAR_FLOAT_4, bh_4_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -1645,6 +2015,43 @@ TEST(X32_TRANSPOSEC__4X1_SCALAR_FLOAT_4, bh_4_bw_1_is_2_os_8) { .Test(xnn_x32_transposec_ukernel__4x1_scalar_float); } +TEST(X32_TRANSPOSEC__4X1_SCALAR_FLOAT_4, bh_68_bw_19_ies_15) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(68) + .block_width(19) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x1_scalar_float); +} + +TEST(X32_TRANSPOSEC__4X1_SCALAR_FLOAT_4, bh_12_bw_5_oes_15) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(12) + .block_width(5) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x1_scalar_float); +} + +TEST(X32_TRANSPOSEC__4X1_SCALAR_FLOAT_4, bh_28_bw_23_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(34) + .block_width(23) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x1_scalar_float); +} + TEST(X32_TRANSPOSEC__4X1_SCALAR_INT_4, bh_4_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -1793,6 +2200,43 @@ TEST(X32_TRANSPOSEC__4X1_SCALAR_INT_4, bh_4_bw_1_is_2_os_8) { .Test(xnn_x32_transposec_ukernel__4x1_scalar_int); } +TEST(X32_TRANSPOSEC__4X1_SCALAR_INT_4, bh_68_bw_19_ies_15) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(68) + .block_width(19) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x1_scalar_int); +} + 
+TEST(X32_TRANSPOSEC__4X1_SCALAR_INT_4, bh_12_bw_5_oes_15) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(12) + .block_width(5) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x1_scalar_int); +} + +TEST(X32_TRANSPOSEC__4X1_SCALAR_INT_4, bh_28_bw_23_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(34) + .block_width(23) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x1_scalar_int); +} + TEST(X32_TRANSPOSEC__4X2_SCALAR_FLOAT_4, bh_4_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1941,6 +2385,43 @@ TEST(X32_TRANSPOSEC__4X2_SCALAR_FLOAT_4, bh_4_bw_2_is_4_os_8) { .Test(xnn_x32_transposec_ukernel__4x2_scalar_float); } +TEST(X32_TRANSPOSEC__4X2_SCALAR_FLOAT_4, bh_68_bw_38_ies_15) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(68) + .block_width(38) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x2_scalar_float); +} + +TEST(X32_TRANSPOSEC__4X2_SCALAR_FLOAT_4, bh_12_bw_10_oes_15) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(12) + .block_width(10) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x2_scalar_float); +} + +TEST(X32_TRANSPOSEC__4X2_SCALAR_FLOAT_4, bh_28_bw_46_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(34) + .block_width(46) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x2_scalar_float); +} + TEST(X32_TRANSPOSEC__4X2_SCALAR_INT_4, bh_4_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -2089,6 +2570,43 @@ TEST(X32_TRANSPOSEC__4X2_SCALAR_INT_4, bh_4_bw_2_is_4_os_8) { 
.Test(xnn_x32_transposec_ukernel__4x2_scalar_int); } +TEST(X32_TRANSPOSEC__4X2_SCALAR_INT_4, bh_68_bw_38_ies_15) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(68) + .block_width(38) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x2_scalar_int); +} + +TEST(X32_TRANSPOSEC__4X2_SCALAR_INT_4, bh_12_bw_10_oes_15) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(12) + .block_width(10) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x2_scalar_int); +} + +TEST(X32_TRANSPOSEC__4X2_SCALAR_INT_4, bh_28_bw_46_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(34) + .block_width(46) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x2_scalar_int); +} + TEST(X32_TRANSPOSEC__4X4_SCALAR_FLOAT_4, bh_4_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -2237,6 +2755,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_FLOAT_4, bh_4_bw_4_is_8_os_8) { .Test(xnn_x32_transposec_ukernel__4x4_scalar_float); } +TEST(X32_TRANSPOSEC__4X4_SCALAR_FLOAT_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_scalar_float); +} + +TEST(X32_TRANSPOSEC__4X4_SCALAR_FLOAT_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_scalar_float); +} + +TEST(X32_TRANSPOSEC__4X4_SCALAR_FLOAT_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + 
.element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_scalar_float); +} + TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -2385,6 +2940,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .Test(xnn_x32_transposec_ukernel__4x4_scalar_int); } +TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_scalar_int); +} + +TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_scalar_int); +} + +TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_scalar_int); +} + #if XNN_ARCH_ARM64 TEST(X32_TRANSPOSEC__4X4_AARCH64_NEON_TBL_4, bh_4_bw_4) { TEST_REQUIRES_ARM_NEON; @@ -2545,6 +3137,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl); } + + TEST(X32_TRANSPOSEC__4X4_AARCH64_NEON_TBL_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl); + } + + TEST(X32_TRANSPOSEC__4X4_AARCH64_NEON_TBL_4, bh_12_bw_20_oes_15) { + 
TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl); + } + + TEST(X32_TRANSPOSEC__4X4_AARCH64_NEON_TBL_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl); + } #endif // XNN_ARCH_ARM64 @@ -2708,6 +3340,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_sse2); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_SSE2_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_SSE2_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_SSE2_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -2871,6 +3543,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) 
.Test(xnn_x32_transposec_ukernel__4x4_multi_multi_sse2); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_SSE2_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_SSE2_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_SSE2_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -3034,6 +3746,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_sse2); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_SSE2_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_SSE2_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_sse2); + } + + 
TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_SSE2_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -3197,6 +3949,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_sse2); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_SSE2_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_SSE2_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_SSE2_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -3360,6 +4152,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_sse2); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_SSE2_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + 
.block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_SSE2_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_SSE2_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -3523,6 +4355,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_sse2); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_SSE2_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_SSE2_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_sse2); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_SSE2_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + 
.output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -3686,6 +4558,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_sse); } + + TEST(X32_TRANSPOSEC__4X4_SSE_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_X86_SSE; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_sse); + } + + TEST(X32_TRANSPOSEC__4X4_SSE_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_X86_SSE; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_sse); + } + + TEST(X32_TRANSPOSEC__4X4_SSE_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_X86_SSE; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_sse); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -3837,6 +4749,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_wasmsimd); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_WASMSIMD_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_WASMSIMD_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + 
.output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_WASMSIMD_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -3988,6 +4937,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_wasmsimd); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_WASMSIMD_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_WASMSIMD_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_WASMSIMD_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -4139,6 +5125,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_wasmsimd); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_WASMSIMD_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + 
.input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_WASMSIMD_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_WASMSIMD_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -4290,6 +5313,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_wasmsimd); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_WASMSIMD_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_WASMSIMD_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_WASMSIMD_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + 
.iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -4441,6 +5501,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_wasmsimd); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_WASMSIMD_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_WASMSIMD_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_WASMSIMD_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -4592,6 +5689,43 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_wasmsimd); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_WASMSIMD_4, bh_68_bw_76_ies_15) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_WASMSIMD_4, bh_12_bw_20_oes_15) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + 
.block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_wasmsimd); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_WASMSIMD_4, bh_28_bw_92_ies_21_oes_17) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -4755,6 +5889,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_multi_dec_zip_neon); } + + TEST(X32_TRANSPOSEC__2X2_MULTI_DEC_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_DEC_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_DEC_ZIP_NEON_4, bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -4918,6 +6092,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_multi_mov_zip_neon); } + + 
TEST(X32_TRANSPOSEC__2X2_MULTI_MOV_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_MOV_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_MOV_ZIP_NEON_4, bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5081,6 +6295,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_multi_multi_zip_neon); } + + TEST(X32_TRANSPOSEC__2X2_MULTI_MULTI_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_MULTI_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_MULTI_ZIP_NEON_4, 
bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5244,6 +6498,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_multi_switch_zip_neon); } + + TEST(X32_TRANSPOSEC__2X2_MULTI_SWITCH_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_switch_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_SWITCH_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_switch_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_MULTI_SWITCH_ZIP_NEON_4, bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_multi_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5407,6 +6701,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_reuse_dec_zip_neon); } + + TEST(X32_TRANSPOSEC__2X2_REUSE_DEC_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + 
.block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_REUSE_DEC_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_REUSE_DEC_ZIP_NEON_4, bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5570,6 +6904,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_reuse_mov_zip_neon); } + + TEST(X32_TRANSPOSEC__2X2_REUSE_MOV_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_REUSE_MOV_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_REUSE_MOV_ZIP_NEON_4, bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + 
.input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5733,6 +7107,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_reuse_multi_zip_neon); } + + TEST(X32_TRANSPOSEC__2X2_REUSE_MULTI_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_REUSE_MULTI_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_REUSE_MULTI_ZIP_NEON_4, bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -5896,6 +7310,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__2x2_reuse_switch_zip_neon); } + + TEST(X32_TRANSPOSEC__2X2_REUSE_SWITCH_ZIP_NEON_4, bh_34_bw_38_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_switch_zip_neon); + } + + 
TEST(X32_TRANSPOSEC__2X2_REUSE_SWITCH_ZIP_NEON_4, bh_6_bw_10_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_switch_zip_neon); + } + + TEST(X32_TRANSPOSEC__2X2_REUSE_SWITCH_ZIP_NEON_4, bh_14_bw_46_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__2x2_reuse_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -6059,6 +7513,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_dec_zip_neon); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_DEC_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_DEC_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_DEC_ZIP_NEON_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_dec_zip_neon); + } #endif // XNN_ARCH_ARM 
|| XNN_ARCH_ARM64 @@ -6222,6 +7716,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_zip_neon); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MOV_ZIP_NEON_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -6385,6 +7919,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_zip_neon); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + 
.element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_MULTI_ZIP_NEON_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -6548,6 +8122,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_zip_neon); } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_MULTI_SWITCH_ZIP_NEON_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_multi_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -6711,6 +8325,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon); } + + 
TEST(X32_TRANSPOSEC__4X4_REUSE_DEC_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_DEC_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_DEC_ZIP_NEON_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -6874,6 +8528,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_zip_neon); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MOV_ZIP_NEON_4, 
bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -7037,6 +8731,46 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_zip_neon); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_MULTI_ZIP_NEON_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -7200,4 +8934,44 @@ TEST(X32_TRANSPOSEC__4X4_SCALAR_INT_4, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_zip_neon); } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_ZIP_NEON_4, bh_68_bw_76_ies_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + 
.block_height(68) + .element_size(4) + .input_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_ZIP_NEON_4, bh_12_bw_20_oes_15) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(4) + .output_element_stride(15) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_zip_neon); + } + + TEST(X32_TRANSPOSEC__4X4_REUSE_SWITCH_ZIP_NEON_4, bh_28_bw_92_ies_21_oes_17) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(4) + .input_element_stride(21) + .output_element_stride(17) + .iterations(1) + .Test(xnn_x32_transposec_ukernel__4x4_reuse_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/test/x64-transpose.cc b/test/x64-transpose.cc index cadc6f09a..e0363b728 100644 --- a/test/x64-transpose.cc +++ b/test/x64-transpose.cc @@ -165,6 +165,43 @@ TEST(X64_TRANSPOSEC__1X2_SCALAR_FLOAT_8, bh_1_bw_2_is_4_os_2) { .Test(xnn_x64_transposec_ukernel__1x2_scalar_float); } +TEST(X64_TRANSPOSEC__1X2_SCALAR_FLOAT_8, bh_17_bw_38_ies_19) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(17) + .block_width(38) + .block_height(17) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__1x2_scalar_float); +} + +TEST(X64_TRANSPOSEC__1X2_SCALAR_FLOAT_8, bh_3_bw_10_oes_19) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(3) + .block_width(10) + .block_height(3) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__1x2_scalar_float); +} + +TEST(X64_TRANSPOSEC__1X2_SCALAR_FLOAT_8, bh_7_bw_46_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(13) + .block_width(46) + .block_height(7) + 
.element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__1x2_scalar_float); +} + TEST(X64_TRANSPOSEC__1X2_SCALAR_INT_8, bh_1_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -313,6 +350,43 @@ TEST(X64_TRANSPOSEC__1X2_SCALAR_INT_8, bh_1_bw_2_is_4_os_2) { .Test(xnn_x64_transposec_ukernel__1x2_scalar_int); } +TEST(X64_TRANSPOSEC__1X2_SCALAR_INT_8, bh_17_bw_38_ies_19) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(17) + .block_width(38) + .block_height(17) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__1x2_scalar_int); +} + +TEST(X64_TRANSPOSEC__1X2_SCALAR_INT_8, bh_3_bw_10_oes_19) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(3) + .block_width(10) + .block_height(3) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__1x2_scalar_int); +} + +TEST(X64_TRANSPOSEC__1X2_SCALAR_INT_8, bh_7_bw_46_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(13) + .block_width(46) + .block_height(7) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__1x2_scalar_int); +} + TEST(X64_TRANSPOSEC__2X1_SCALAR_FLOAT_8, bh_2_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -461,6 +535,43 @@ TEST(X64_TRANSPOSEC__2X1_SCALAR_FLOAT_8, bh_2_bw_1_is_2_os_4) { .Test(xnn_x64_transposec_ukernel__2x1_scalar_float); } +TEST(X64_TRANSPOSEC__2X1_SCALAR_FLOAT_8, bh_34_bw_19_ies_19) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(34) + .block_width(19) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x1_scalar_float); +} + +TEST(X64_TRANSPOSEC__2X1_SCALAR_FLOAT_8, bh_6_bw_5_oes_19) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(6) + .block_width(5) + 
.block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x1_scalar_float); +} + +TEST(X64_TRANSPOSEC__2X1_SCALAR_FLOAT_8, bh_14_bw_23_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(20) + .block_width(23) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x1_scalar_float); +} + TEST(X64_TRANSPOSEC__2X1_SCALAR_INT_8, bh_2_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -609,6 +720,43 @@ TEST(X64_TRANSPOSEC__2X1_SCALAR_INT_8, bh_2_bw_1_is_2_os_4) { .Test(xnn_x64_transposec_ukernel__2x1_scalar_int); } +TEST(X64_TRANSPOSEC__2X1_SCALAR_INT_8, bh_34_bw_19_ies_19) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(34) + .block_width(19) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x1_scalar_int); +} + +TEST(X64_TRANSPOSEC__2X1_SCALAR_INT_8, bh_6_bw_5_oes_19) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(6) + .block_width(5) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x1_scalar_int); +} + +TEST(X64_TRANSPOSEC__2X1_SCALAR_INT_8, bh_14_bw_23_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(20) + .block_width(23) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x1_scalar_int); +} + TEST(X64_TRANSPOSEC__2X2_SCALAR_FLOAT_8, bh_2_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -757,6 +905,43 @@ TEST(X64_TRANSPOSEC__2X2_SCALAR_FLOAT_8, bh_2_bw_2_is_4_os_4) { .Test(xnn_x64_transposec_ukernel__2x2_scalar_float); } +TEST(X64_TRANSPOSEC__2X2_SCALAR_FLOAT_8, bh_34_bw_38_ies_19) { + TransposeMicrokernelTester() + .input_stride(38) + 
.output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_scalar_float); +} + +TEST(X64_TRANSPOSEC__2X2_SCALAR_FLOAT_8, bh_6_bw_10_oes_19) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_scalar_float); +} + +TEST(X64_TRANSPOSEC__2X2_SCALAR_FLOAT_8, bh_14_bw_46_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_scalar_float); +} + TEST(X64_TRANSPOSEC__2X2_SCALAR_INT_8, bh_2_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -905,6 +1090,43 @@ TEST(X64_TRANSPOSEC__2X2_SCALAR_INT_8, bh_2_bw_2_is_4_os_4) { .Test(xnn_x64_transposec_ukernel__2x2_scalar_int); } +TEST(X64_TRANSPOSEC__2X2_SCALAR_INT_8, bh_34_bw_38_ies_19) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_scalar_int); +} + +TEST(X64_TRANSPOSEC__2X2_SCALAR_INT_8, bh_6_bw_10_oes_19) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_scalar_int); +} + +TEST(X64_TRANSPOSEC__2X2_SCALAR_INT_8, bh_14_bw_46_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_scalar_int); +} + 
TEST(X64_TRANSPOSEC__4X1_SCALAR_FLOAT_8, bh_4_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -1053,6 +1275,43 @@ TEST(X64_TRANSPOSEC__4X1_SCALAR_FLOAT_8, bh_4_bw_1_is_2_os_8) { .Test(xnn_x64_transposec_ukernel__4x1_scalar_float); } +TEST(X64_TRANSPOSEC__4X1_SCALAR_FLOAT_8, bh_68_bw_19_ies_19) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(68) + .block_width(19) + .block_height(68) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x1_scalar_float); +} + +TEST(X64_TRANSPOSEC__4X1_SCALAR_FLOAT_8, bh_12_bw_5_oes_19) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(12) + .block_width(5) + .block_height(12) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x1_scalar_float); +} + +TEST(X64_TRANSPOSEC__4X1_SCALAR_FLOAT_8, bh_28_bw_23_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(34) + .block_width(23) + .block_height(28) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x1_scalar_float); +} + TEST(X64_TRANSPOSEC__4X1_SCALAR_INT_8, bh_4_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -1201,6 +1460,43 @@ TEST(X64_TRANSPOSEC__4X1_SCALAR_INT_8, bh_4_bw_1_is_2_os_8) { .Test(xnn_x64_transposec_ukernel__4x1_scalar_int); } +TEST(X64_TRANSPOSEC__4X1_SCALAR_INT_8, bh_68_bw_19_ies_19) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(68) + .block_width(19) + .block_height(68) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x1_scalar_int); +} + +TEST(X64_TRANSPOSEC__4X1_SCALAR_INT_8, bh_12_bw_5_oes_19) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(12) + .block_width(5) + .block_height(12) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x1_scalar_int); +} + 
+TEST(X64_TRANSPOSEC__4X1_SCALAR_INT_8, bh_28_bw_23_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(34) + .block_width(23) + .block_height(28) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x1_scalar_int); +} + TEST(X64_TRANSPOSEC__4X2_SCALAR_FLOAT_8, bh_4_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1349,6 +1645,43 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_FLOAT_8, bh_4_bw_2_is_4_os_8) { .Test(xnn_x64_transposec_ukernel__4x2_scalar_float); } +TEST(X64_TRANSPOSEC__4X2_SCALAR_FLOAT_8, bh_68_bw_38_ies_19) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(68) + .block_width(38) + .block_height(68) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x2_scalar_float); +} + +TEST(X64_TRANSPOSEC__4X2_SCALAR_FLOAT_8, bh_12_bw_10_oes_19) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(12) + .block_width(10) + .block_height(12) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x2_scalar_float); +} + +TEST(X64_TRANSPOSEC__4X2_SCALAR_FLOAT_8, bh_28_bw_46_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(34) + .block_width(46) + .block_height(28) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x2_scalar_float); +} + TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1497,6 +1830,43 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2_is_4_os_8) { .Test(xnn_x64_transposec_ukernel__4x2_scalar_int); } +TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_68_bw_38_ies_19) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(68) + .block_width(38) + .block_height(68) + .element_size(8) + .input_element_stride(19) + .iterations(1) + 
.Test(xnn_x64_transposec_ukernel__4x2_scalar_int); +} + +TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_12_bw_10_oes_19) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(12) + .block_width(10) + .block_height(12) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x2_scalar_int); +} + +TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_28_bw_46_ies_25_oes_21) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(34) + .block_width(46) + .block_height(28) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__4x2_scalar_int); +} + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(X64_TRANSPOSEC__2X2_MULTI_MOV_SSE2_8, bh_2_bw_2) { TEST_REQUIRES_X86_SSE2; @@ -1657,6 +2027,46 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2_is_4_os_8) { .iterations(1) .Test(xnn_x64_transposec_ukernel__2x2_multi_mov_sse2); } + + TEST(X64_TRANSPOSEC__2X2_MULTI_MOV_SSE2_8, bh_34_bw_38_ies_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_mov_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_MULTI_MOV_SSE2_8, bh_6_bw_10_oes_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_mov_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_MULTI_MOV_SSE2_8, bh_14_bw_46_ies_25_oes_21) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_mov_sse2); + } #endif // 
XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1820,6 +2230,46 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2_is_4_os_8) { .iterations(1) .Test(xnn_x64_transposec_ukernel__2x2_multi_multi_sse2); } + + TEST(X64_TRANSPOSEC__2X2_MULTI_MULTI_SSE2_8, bh_34_bw_38_ies_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_multi_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_MULTI_MULTI_SSE2_8, bh_6_bw_10_oes_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_multi_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_MULTI_MULTI_SSE2_8, bh_14_bw_46_ies_25_oes_21) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_multi_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1983,6 +2433,46 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2_is_4_os_8) { .iterations(1) .Test(xnn_x64_transposec_ukernel__2x2_multi_switch_sse2); } + + TEST(X64_TRANSPOSEC__2X2_MULTI_SWITCH_SSE2_8, bh_34_bw_38_ies_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_switch_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_MULTI_SWITCH_SSE2_8, bh_6_bw_10_oes_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + 
.output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_switch_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_MULTI_SWITCH_SSE2_8, bh_14_bw_46_ies_25_oes_21) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_multi_switch_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -2146,6 +2636,46 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2_is_4_os_8) { .iterations(1) .Test(xnn_x64_transposec_ukernel__2x2_reuse_mov_sse2); } + + TEST(X64_TRANSPOSEC__2X2_REUSE_MOV_SSE2_8, bh_34_bw_38_ies_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_mov_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_REUSE_MOV_SSE2_8, bh_6_bw_10_oes_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_mov_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_REUSE_MOV_SSE2_8, bh_14_bw_46_ies_25_oes_21) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_mov_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -2309,6 +2839,46 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2_is_4_os_8) { .iterations(1) .Test(xnn_x64_transposec_ukernel__2x2_reuse_multi_sse2); } + + TEST(X64_TRANSPOSEC__2X2_REUSE_MULTI_SSE2_8, bh_34_bw_38_ies_19) { + 
TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_multi_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_REUSE_MULTI_SSE2_8, bh_6_bw_10_oes_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_multi_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_REUSE_MULTI_SSE2_8, bh_14_bw_46_ies_25_oes_21) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_multi_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -2472,4 +3042,44 @@ TEST(X64_TRANSPOSEC__4X2_SCALAR_INT_8, bh_4_bw_2_is_4_os_8) { .iterations(1) .Test(xnn_x64_transposec_ukernel__2x2_reuse_switch_sse2); } + + TEST(X64_TRANSPOSEC__2X2_REUSE_SWITCH_SSE2_8, bh_34_bw_38_ies_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(8) + .input_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_switch_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_REUSE_SWITCH_SSE2_8, bh_6_bw_10_oes_19) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(8) + .output_element_stride(19) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_switch_sse2); + } + + TEST(X64_TRANSPOSEC__2X2_REUSE_SWITCH_SSE2_8, bh_14_bw_46_ies_25_oes_21) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(51) + 
.output_stride(20) + .block_width(46) + .block_height(14) + .element_size(8) + .input_element_stride(25) + .output_element_stride(21) + .iterations(1) + .Test(xnn_x64_transposec_ukernel__2x2_reuse_switch_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/x8-transpose.cc b/test/x8-transpose.cc index 13cf9ce4d..5eca8cc54 100644 --- a/test/x8-transpose.cc +++ b/test/x8-transpose.cc @@ -165,6 +165,43 @@ TEST(X8_TRANSPOSEC__1X2_SCALAR_INT_1, bh_1_bw_2_is_4_os_2) { .Test(xnn_x8_transposec_ukernel__1x2_scalar_int); } +TEST(X8_TRANSPOSEC__1X2_SCALAR_INT_1, bh_17_bw_38_ies_12) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(17) + .block_width(38) + .block_height(17) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__1x2_scalar_int); +} + +TEST(X8_TRANSPOSEC__1X2_SCALAR_INT_1, bh_3_bw_10_oes_12) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(3) + .block_width(10) + .block_height(3) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__1x2_scalar_int); +} + +TEST(X8_TRANSPOSEC__1X2_SCALAR_INT_1, bh_7_bw_46_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(13) + .block_width(46) + .block_height(7) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__1x2_scalar_int); +} + TEST(X8_TRANSPOSEC__1X4_SCALAR_INT_1, bh_1_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -313,6 +350,43 @@ TEST(X8_TRANSPOSEC__1X4_SCALAR_INT_1, bh_1_bw_4_is_8_os_2) { .Test(xnn_x8_transposec_ukernel__1x4_scalar_int); } +TEST(X8_TRANSPOSEC__1X4_SCALAR_INT_1, bh_17_bw_76_ies_12) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(17) + .block_width(76) + .block_height(17) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__1x4_scalar_int); +} + 
+TEST(X8_TRANSPOSEC__1X4_SCALAR_INT_1, bh_3_bw_20_oes_12) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(3) + .block_width(20) + .block_height(3) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__1x4_scalar_int); +} + +TEST(X8_TRANSPOSEC__1X4_SCALAR_INT_1, bh_7_bw_92_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(13) + .block_width(92) + .block_height(7) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__1x4_scalar_int); +} + TEST(X8_TRANSPOSEC__2X1_SCALAR_INT_1, bh_2_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -461,6 +535,43 @@ TEST(X8_TRANSPOSEC__2X1_SCALAR_INT_1, bh_2_bw_1_is_2_os_4) { .Test(xnn_x8_transposec_ukernel__2x1_scalar_int); } +TEST(X8_TRANSPOSEC__2X1_SCALAR_INT_1, bh_34_bw_19_ies_12) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(34) + .block_width(19) + .block_height(34) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x1_scalar_int); +} + +TEST(X8_TRANSPOSEC__2X1_SCALAR_INT_1, bh_6_bw_5_oes_12) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(6) + .block_width(5) + .block_height(6) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x1_scalar_int); +} + +TEST(X8_TRANSPOSEC__2X1_SCALAR_INT_1, bh_14_bw_23_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(20) + .block_width(23) + .block_height(14) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x1_scalar_int); +} + TEST(X8_TRANSPOSEC__2X2_SCALAR_INT_1, bh_2_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -609,6 +720,43 @@ TEST(X8_TRANSPOSEC__2X2_SCALAR_INT_1, bh_2_bw_2_is_4_os_4) { .Test(xnn_x8_transposec_ukernel__2x2_scalar_int); } 
+TEST(X8_TRANSPOSEC__2X2_SCALAR_INT_1, bh_34_bw_38_ies_12) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(34) + .block_width(38) + .block_height(34) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x2_scalar_int); +} + +TEST(X8_TRANSPOSEC__2X2_SCALAR_INT_1, bh_6_bw_10_oes_12) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(6) + .block_width(10) + .block_height(6) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x2_scalar_int); +} + +TEST(X8_TRANSPOSEC__2X2_SCALAR_INT_1, bh_14_bw_46_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(20) + .block_width(46) + .block_height(14) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x2_scalar_int); +} + TEST(X8_TRANSPOSEC__2X4_SCALAR_INT_1, bh_2_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -757,6 +905,43 @@ TEST(X8_TRANSPOSEC__2X4_SCALAR_INT_1, bh_2_bw_4_is_8_os_4) { .Test(xnn_x8_transposec_ukernel__2x4_scalar_int); } +TEST(X8_TRANSPOSEC__2X4_SCALAR_INT_1, bh_34_bw_76_ies_12) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(34) + .block_width(76) + .block_height(34) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x4_scalar_int); +} + +TEST(X8_TRANSPOSEC__2X4_SCALAR_INT_1, bh_6_bw_20_oes_12) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(6) + .block_width(20) + .block_height(6) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__2x4_scalar_int); +} + +TEST(X8_TRANSPOSEC__2X4_SCALAR_INT_1, bh_14_bw_92_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(20) + .block_width(92) + .block_height(14) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) 
+ .Test(xnn_x8_transposec_ukernel__2x4_scalar_int); +} + TEST(X8_TRANSPOSEC__4X1_SCALAR_INT_1, bh_4_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -905,6 +1090,43 @@ TEST(X8_TRANSPOSEC__4X1_SCALAR_INT_1, bh_4_bw_1_is_2_os_8) { .Test(xnn_x8_transposec_ukernel__4x1_scalar_int); } +TEST(X8_TRANSPOSEC__4X1_SCALAR_INT_1, bh_68_bw_19_ies_12) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(68) + .block_width(19) + .block_height(68) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x1_scalar_int); +} + +TEST(X8_TRANSPOSEC__4X1_SCALAR_INT_1, bh_12_bw_5_oes_12) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(12) + .block_width(5) + .block_height(12) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x1_scalar_int); +} + +TEST(X8_TRANSPOSEC__4X1_SCALAR_INT_1, bh_28_bw_23_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(34) + .block_width(23) + .block_height(28) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x1_scalar_int); +} + TEST(X8_TRANSPOSEC__4X2_SCALAR_INT_1, bh_4_bw_2) { TransposeMicrokernelTester() .input_stride(4) @@ -1053,6 +1275,43 @@ TEST(X8_TRANSPOSEC__4X2_SCALAR_INT_1, bh_4_bw_2_is_4_os_8) { .Test(xnn_x8_transposec_ukernel__4x2_scalar_int); } +TEST(X8_TRANSPOSEC__4X2_SCALAR_INT_1, bh_68_bw_38_ies_12) { + TransposeMicrokernelTester() + .input_stride(38) + .output_stride(68) + .block_width(38) + .block_height(68) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x2_scalar_int); +} + +TEST(X8_TRANSPOSEC__4X2_SCALAR_INT_1, bh_12_bw_10_oes_12) { + TransposeMicrokernelTester() + .input_stride(10) + .output_stride(12) + .block_width(10) + .block_height(12) + .element_size(1) + .output_element_stride(12) + .iterations(1) + 
.Test(xnn_x8_transposec_ukernel__4x2_scalar_int); +} + +TEST(X8_TRANSPOSEC__4X2_SCALAR_INT_1, bh_28_bw_46_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(51) + .output_stride(34) + .block_width(46) + .block_height(28) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x2_scalar_int); +} + TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4) { TransposeMicrokernelTester() .input_stride(8) @@ -1201,6 +1460,43 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .Test(xnn_x8_transposec_ukernel__4x4_scalar_int); } +TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_68_bw_76_ies_12) { + TransposeMicrokernelTester() + .input_stride(76) + .output_stride(68) + .block_width(76) + .block_height(68) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x4_scalar_int); +} + +TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_12_bw_20_oes_12) { + TransposeMicrokernelTester() + .input_stride(20) + .output_stride(12) + .block_width(20) + .block_height(12) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x4_scalar_int); +} + +TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_28_bw_92_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(97) + .output_stride(34) + .block_width(92) + .block_height(28) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__4x4_scalar_int); +} + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_SSE2_1, bh_16_bw_16) { TEST_REQUIRES_X86_SSE2; @@ -1361,6 +1657,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2); } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_SSE2_1, bh_272_bw_304_ies_12) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(304) + .output_stride(272) + 
.block_width(304) + .block_height(272) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_SSE2_1, bh_48_bw_80_oes_12) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(80) + .output_stride(48) + .block_width(80) + .block_height(48) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_SSE2_1, bh_112_bw_368_ies_18_oes_14) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(373) + .output_stride(118) + .block_width(368) + .block_height(112) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1524,6 +1860,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_sse2); } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_SSE2_1, bh_272_bw_304_ies_12) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(304) + .output_stride(272) + .block_width(304) + .block_height(272) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_sse2); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_SSE2_1, bh_48_bw_80_oes_12) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(80) + .output_stride(48) + .block_width(80) + .block_height(48) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_sse2); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_SSE2_1, bh_112_bw_368_ies_18_oes_14) { + TEST_REQUIRES_X86_SSE2; + TransposeMicrokernelTester() + .input_stride(373) + .output_stride(118) + .block_width(368) + .block_height(112) + 
.element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_sse2); + } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1675,6 +2051,43 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd); } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_WASMSIMD_1, bh_272_bw_304_ies_12) { + TransposeMicrokernelTester() + .input_stride(304) + .output_stride(272) + .block_width(304) + .block_height(272) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_WASMSIMD_1, bh_48_bw_80_oes_12) { + TransposeMicrokernelTester() + .input_stride(80) + .output_stride(48) + .block_width(80) + .block_height(48) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_WASMSIMD_1, bh_112_bw_368_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(373) + .output_stride(118) + .block_width(368) + .block_height(112) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1826,6 +2239,43 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_wasmsimd); } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_WASMSIMD_1, bh_272_bw_304_ies_12) { + TransposeMicrokernelTester() + .input_stride(304) + .output_stride(272) + .block_width(304) + .block_height(272) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_wasmsimd); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_WASMSIMD_1, bh_48_bw_80_oes_12) { + 
TransposeMicrokernelTester() + .input_stride(80) + .output_stride(48) + .block_width(80) + .block_height(48) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_wasmsimd); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_WASMSIMD_1, bh_112_bw_368_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(373) + .output_stride(118) + .block_width(368) + .block_height(112) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_wasmsimd); + } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1989,6 +2439,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__8x8_multi_dec_zip_neon); } + + TEST(X8_TRANSPOSEC__8X8_MULTI_DEC_ZIP_NEON_1, bh_136_bw_152_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_dec_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_MULTI_DEC_ZIP_NEON_1, bh_24_bw_40_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_dec_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_MULTI_DEC_ZIP_NEON_1, bh_56_bw_184_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -2152,6 +2642,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, 
bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__8x8_multi_mov_zip_neon); } + + TEST(X8_TRANSPOSEC__8X8_MULTI_MOV_ZIP_NEON_1, bh_136_bw_152_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_mov_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_MULTI_MOV_ZIP_NEON_1, bh_24_bw_40_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_mov_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_MULTI_MOV_ZIP_NEON_1, bh_56_bw_184_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -2315,6 +2845,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__8x8_multi_switch_zip_neon); } + + TEST(X8_TRANSPOSEC__8X8_MULTI_SWITCH_ZIP_NEON_1, bh_136_bw_152_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_switch_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_MULTI_SWITCH_ZIP_NEON_1, bh_24_bw_40_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(1) + .output_element_stride(12) + .iterations(1) + 
.Test(xnn_x8_transposec_ukernel__8x8_multi_switch_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_MULTI_SWITCH_ZIP_NEON_1, bh_56_bw_184_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_multi_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -2478,6 +3048,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__8x8_reuse_dec_zip_neon); } + + TEST(X8_TRANSPOSEC__8X8_REUSE_DEC_ZIP_NEON_1, bh_136_bw_152_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_dec_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_DEC_ZIP_NEON_1, bh_24_bw_40_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_dec_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_DEC_ZIP_NEON_1, bh_56_bw_184_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -2641,6 +3251,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__8x8_reuse_mov_zip_neon); } + + TEST(X8_TRANSPOSEC__8X8_REUSE_MOV_ZIP_NEON_1, bh_136_bw_152_ies_12) { + 
TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_mov_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_MOV_ZIP_NEON_1, bh_24_bw_40_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_mov_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_MOV_ZIP_NEON_1, bh_56_bw_184_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -2804,6 +3454,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__8x8_reuse_multi_zip_neon); } + + TEST(X8_TRANSPOSEC__8X8_REUSE_MULTI_ZIP_NEON_1, bh_136_bw_152_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_multi_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_MULTI_ZIP_NEON_1, bh_24_bw_40_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_multi_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_MULTI_ZIP_NEON_1, bh_56_bw_184_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + 
TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_multi_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -2967,6 +3657,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__8x8_reuse_switch_zip_neon); } + + TEST(X8_TRANSPOSEC__8X8_REUSE_SWITCH_ZIP_NEON_1, bh_136_bw_152_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(152) + .output_stride(136) + .block_width(152) + .block_height(136) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_switch_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_SWITCH_ZIP_NEON_1, bh_24_bw_40_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(40) + .output_stride(24) + .block_width(40) + .block_height(24) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_switch_zip_neon); + } + + TEST(X8_TRANSPOSEC__8X8_REUSE_SWITCH_ZIP_NEON_1, bh_56_bw_184_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(189) + .output_stride(62) + .block_width(184) + .block_height(56) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__8x8_reuse_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3130,6 +3860,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon); } + + TEST(X8_TRANSPOSEC__16X16_REUSE_DEC_ZIP_NEON_1, bh_272_bw_304_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(304) + .output_stride(272) + .block_width(304) + .block_height(272) + .element_size(1) + 
.input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_DEC_ZIP_NEON_1, bh_48_bw_80_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(80) + .output_stride(48) + .block_width(80) + .block_height(48) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_DEC_ZIP_NEON_1, bh_112_bw_368_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(373) + .output_stride(118) + .block_width(368) + .block_height(112) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3293,6 +4063,46 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_zip_neon); } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_ZIP_NEON_1, bh_272_bw_304_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(304) + .output_stride(272) + .block_width(304) + .block_height(272) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_zip_neon); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_ZIP_NEON_1, bh_48_bw_80_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(80) + .output_stride(48) + .block_width(80) + .block_height(48) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_zip_neon); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_MOV_ZIP_NEON_1, bh_112_bw_368_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(373) + .output_stride(118) + .block_width(368) + .block_height(112) + .element_size(1) + .input_element_stride(18) + 
.output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_mov_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -3456,4 +4266,44 @@ TEST(X8_TRANSPOSEC__4X4_SCALAR_INT_1, bh_4_bw_4_is_8_os_8) { .iterations(1) .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_zip_neon); } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_ZIP_NEON_1, bh_272_bw_304_ies_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(304) + .output_stride(272) + .block_width(304) + .block_height(272) + .element_size(1) + .input_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_zip_neon); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_ZIP_NEON_1, bh_48_bw_80_oes_12) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(80) + .output_stride(48) + .block_width(80) + .block_height(48) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_zip_neon); + } + + TEST(X8_TRANSPOSEC__16X16_REUSE_SWITCH_ZIP_NEON_1, bh_112_bw_368_ies_18_oes_14) { + TEST_REQUIRES_ARM_NEON; + TransposeMicrokernelTester() + .input_stride(373) + .output_stride(118) + .block_width(368) + .block_height(112) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_x8_transposec_ukernel__16x16_reuse_switch_zip_neon); + } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/test/xx-transpose.cc b/test/xx-transpose.cc index 8b52b72c3..6b26cc2d3 100644 --- a/test/xx-transpose.cc +++ b/test/xx-transpose.cc @@ -164,6 +164,43 @@ TEST(XX_TRANSPOSEV__1X1_MEMCPY_1, bh_1_bw_1_is_2_os_2) { .iterations(1) .Test(xnn_xx_transposev_ukernel__1x1_memcpy); } + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_1, bh_17_bw_19_ies_12) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(17) + .block_width(19) + .block_height(17) + .element_size(1) + .input_element_stride(12) + .iterations(1) + 
.Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_1, bh_3_bw_5_oes_12) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(3) + .block_width(5) + .block_height(3) + .element_size(1) + .output_element_stride(12) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_1, bh_7_bw_23_ies_18_oes_14) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(13) + .block_width(23) + .block_height(7) + .element_size(1) + .input_element_stride(18) + .output_element_stride(14) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} TEST(XX_TRANSPOSEV__1X1_MEMCPY_3, bh_1_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -311,6 +348,43 @@ TEST(XX_TRANSPOSEV__1X1_MEMCPY_3, bh_1_bw_1_is_2_os_2) { .iterations(1) .Test(xnn_xx_transposev_ukernel__1x1_memcpy); } + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_3, bh_17_bw_19_ies_14) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(17) + .block_width(19) + .block_height(17) + .element_size(3) + .input_element_stride(14) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_3, bh_3_bw_5_oes_14) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(3) + .block_width(5) + .block_height(3) + .element_size(3) + .output_element_stride(14) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_3, bh_7_bw_23_ies_20_oes_16) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(13) + .block_width(23) + .block_height(7) + .element_size(3) + .input_element_stride(20) + .output_element_stride(16) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} TEST(XX_TRANSPOSEV__1X1_MEMCPY_5, bh_1_bw_1) { TransposeMicrokernelTester() .input_stride(2) @@ -457,4 +531,41 @@ TEST(XX_TRANSPOSEV__1X1_MEMCPY_5, bh_1_bw_1_is_2_os_2) { .element_size(5) .iterations(1) 
.Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_5, bh_17_bw_19_ies_16) { + TransposeMicrokernelTester() + .input_stride(19) + .output_stride(17) + .block_width(19) + .block_height(17) + .element_size(5) + .input_element_stride(16) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_5, bh_3_bw_5_oes_16) { + TransposeMicrokernelTester() + .input_stride(5) + .output_stride(3) + .block_width(5) + .block_height(3) + .element_size(5) + .output_element_stride(16) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); +} + +TEST(XX_TRANSPOSEV__1X1_MEMCPY_5, bh_7_bw_23_ies_22_oes_18) { + TransposeMicrokernelTester() + .input_stride(28) + .output_stride(13) + .block_width(23) + .block_height(7) + .element_size(5) + .input_element_stride(22) + .output_element_stride(18) + .iterations(1) + .Test(xnn_xx_transposev_ukernel__1x1_memcpy); }
\ No newline at end of file |