author     T.J. Alumbaugh <talumbau@google.com>            2021-05-11 07:36:33 -0700
committer  Copybara-Service <copybara-worker@google.com>   2021-05-11 07:36:58 -0700
commit     d37128311b445e758136b8602d1bbd2a755e115d (patch)
tree       abc5abbb09eaf02a55a12c5d12f53ac531895839
parent     3c93cda8211efa01128d48950f0d6ee5233c5b9b (diff)
download   ruy-d37128311b445e758136b8602d1bbd2a755e115d.tar.gz
Fork Neon Float kernel for X1
PiperOrigin-RevId: 373147434
-rw-r--r--  ruy/kernel_arm.h    |   5
-rw-r--r--  ruy/kernel_arm64.cc | 466
2 files changed, 471 insertions, 0 deletions
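The header change carries the dispatch logic: ruy picks an asm kernel at runtime from the detected Tuning value, and both float Kernel specializations (Path::kNeon and Path::kNeonDotprod) gain a Tuning::kX1 branch that routes to the new fork. Condensed into a standalone sketch (the wrapper function name is ours; the real code lives in the Run() methods shown in the diff below):

    // Hypothetical condensation of the dispatch this commit adds to the
    // Run() methods in ruy/kernel_arm.h.
    #include "ruy/kernel_arm.h"  // kernel declarations, KernelParamsFloat
    #include "ruy/tune.h"        // Tuning

    namespace ruy {
    void RunFloatKernel(Tuning tuning, const KernelParamsFloat<8, 8>& params) {
      if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
        KernelFloatNeonA55ish(params);  // in-order cores keep the A55ish kernel
      } else if (tuning == Tuning::kX1) {
        KernelFloatNeonX1(params);      // new: X1 fork without manual unrolling
      } else {
        KernelFloatNeon(params);        // default out-of-order kernel
      }
    }
    }  // namespace ruy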
diff --git a/ruy/kernel_arm.h b/ruy/kernel_arm.h
index 09c3206..15a5a89 100644
--- a/ruy/kernel_arm.h
+++ b/ruy/kernel_arm.h
@@ -133,6 +133,7 @@ struct Kernel<Path::kNeonDotprod, std::int8_t, std::int8_t, std::int32_t,
 #endif
 void KernelFloatNeon(const KernelParamsFloat<8, 8>& params);
+void KernelFloatNeonX1(const KernelParamsFloat<8, 8>& params);
 void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params);
 void KernelFloat32Neon(const KernelParamsFloat<8, 4>& params);
 void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params);
@@ -154,6 +155,8 @@ struct Kernel<Path::kNeon, float, float, float, float> {
         end_col, dst, &params);
     if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
       KernelFloatNeonA55ish(params);
+    } else if (tuning == Tuning::kX1) {
+      KernelFloatNeonX1(params);
     } else {
       KernelFloatNeon(params);
     }
@@ -202,6 +205,8 @@ struct Kernel<Path::kNeonDotprod, float, float, float, float> {
         end_col, dst, &params);
     if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
       KernelFloatNeonDotprodA55ish(params);
+    } else if (tuning == Tuning::kX1) {
+      KernelFloatNeonX1(params);
     } else {
       KernelFloatNeon(params);
     }
diff --git a/ruy/kernel_arm64.cc b/ruy/kernel_arm64.cc
index f6769ba..b06a06e 100644
--- a/ruy/kernel_arm64.cc
+++ b/ruy/kernel_arm64.cc
@@ -8324,6 +8324,472 @@ void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) {
       "v26", "v27", "v28", "v29", "v30", "v31");
 }
 
+// A fork of the standard float kernel where we omit the manual loop unrolling
+// to recover performance on the X1. For now, the X1 core is the only CPU that
+// uses this kernel.
+void KernelFloatNeonX1(const KernelParamsFloat<8, 8>& params) {
+  CheckOffsetsInKernelParamsFloat(params);
+  profiler::ScopeLabel label("Kernel (kNeon) X1");
+
+  const float* lhs_col_ptr = params.lhs_base_ptr;
+  const float* rhs_col_ptr = params.rhs_base_ptr;
+  const float* lhs_ptr = lhs_col_ptr;
+  const float* rhs_ptr = rhs_col_ptr;
+  float* dst_col_ptr = params.dst_base_ptr;
+  float* dst_ptr = dst_col_ptr;
+  int row = params.start_row;
+  int col = params.start_col;
+
+  // The asm kernel below has the following NEON register allocation:
+  //
+  // v16 -- v31 are accumulators.
+  // During accumulation, v0 -- v15 are used to load data from LHS and RHS.
+  // At least v0 and v1 are used to load an 8x1 block of LHS, and v2 and
+  // v3 are used to load a 1x8 block of RHS, like this:
+  //
+  //                                          RHS 1x8 block
+  //                           /-----------------------------------------|
+  //                           |v2.s[0] ... v2.s[3]   v3.s[0] ... v3.s[3]|
+  //                           \-----------------------------------------/
+  //        LHS 8x1 block
+  //  /---------------------\  /-----------------------------------------|
+  //  |       v0.s[0]       |  |v16.s[0]           ...           v30.s[0]|
+  //  |        ...          |  |  ...                              ...   |
+  //  |       v0.s[3]       |  |v16.s[3]           ...           v30.s[3]|
+  //  |       v1.s[0]       |  |v17.s[0]           ...           v31.s[0]|
+  //  |        ...          |  |  ...                              ...   |
+  //  |       v1.s[3]       |  |v17.s[3]           ...           v31.s[3]|
+  //  \---------------------/  \-----------------------------------------/
+  //                                      accumulators 8x8 block
+  //
+  // In the RUY_OPT_MAX_STREAMING part of the kernel, this elementary step
+  // is repeated 4 times, using 4x more registers for LHS and RHS, so that
+  // is where instead of using v0 -- v3 for LHS and RHS, we use v0 -- v15.
+  //
+  // Outside of the RUY_OPT_MAX_STREAMING part of the kernel, v4 -- v7 are
+  // unused, and v8 -- v15 are used for loading parameters used for the
+  // post-accumulation part of the kernel.
+  asm volatile(
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
+
+      // clang-format off
+
+      // Load some parameters into registers.
+ "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n" + "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n" + "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n" + "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n" + "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n" + "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n" + "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n" + "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n" + + // Load the first 32 bytes of LHS and RHS data. + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v2.4s}, [%[rhs_ptr]], #16\n" + "ld1 {v3.4s}, [%[rhs_ptr]], #16\n" + + // Clear accumulators. + RUY_MAKE_ZERO(v16) + RUY_MAKE_ZERO(v17) + RUY_MAKE_ZERO(v18) + RUY_MAKE_ZERO(v19) + RUY_MAKE_ZERO(v20) + RUY_MAKE_ZERO(v21) + RUY_MAKE_ZERO(v22) + RUY_MAKE_ZERO(v23) + RUY_MAKE_ZERO(v24) + RUY_MAKE_ZERO(v25) + RUY_MAKE_ZERO(v26) + RUY_MAKE_ZERO(v27) + RUY_MAKE_ZERO(v28) + RUY_MAKE_ZERO(v29) + RUY_MAKE_ZERO(v30) + RUY_MAKE_ZERO(v31) + + // w1 is the number of levels of depth that we have already loaded + // LHS and RHS data for. Corresponding to the initial ld1 instructions + // above, this is currently 1. + "mov w1, #1\n" + + // Main loop of the whole GEMM, over rows and columns of the + // destination matrix. + "1:\n" + + "fmla v16.4s, v0.4s, v2.s[0]\n" + "fmla v18.4s, v0.4s, v2.s[1]\n" + "fmla v20.4s, v0.4s, v2.s[2]\n" + "fmla v22.4s, v0.4s, v2.s[3]\n" + + // Accumulation loop + "cmp w1, w12\n" + "beq 79f\n" + + "2:\n" + "fmla v24.4s, v0.4s, v3.s[0]\n" + "fmla v26.4s, v0.4s, v3.s[1]\n" + "ld1 {v4.4s}, [%[rhs_ptr]], #16\n" + "fmla v28.4s, v0.4s, v3.s[2]\n" + "fmla v30.4s, v0.4s, v3.s[3]\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "fmla v25.4s, v1.4s, v3.s[0]\n" + "fmla v27.4s, v1.4s, v3.s[1]\n" + "add w1, w1, #1\n" + "fmla v29.4s, v1.4s, v3.s[2]\n" + "fmla v31.4s, v1.4s, v3.s[3]\n" + "ld1 {v3.4s}, [%[rhs_ptr]], #16\n" + "fmla v17.4s, v1.4s, v2.s[0]\n" + "fmla v19.4s, v1.4s, v2.s[1]\n" + "cmp w1, w12\n" + "fmla v21.4s, v1.4s, v2.s[2]\n" + "fmla v23.4s, v1.4s, v2.s[3]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "fmla v16.4s, v0.4s, v4.s[0]\n" + "fmla v18.4s, v0.4s, v4.s[1]\n" + "mov v2.16b, v4.16b\n" + "fmla v20.4s, v0.4s, v4.s[2]\n" + "fmla v22.4s, v0.4s, v4.s[3]\n" + "blt 2b\n" + + "79:\n" + + // End of the inner loop on depth. Now perform the remaining + // multiply-adds of the last level of depth, for which the LHS + // and RHS data is already loaded. + + "fmla v24.4s, v0.4s, v3.s[0]\n" + "fmla v26.4s, v0.4s, v3.s[1]\n" + "fmla v28.4s, v0.4s, v3.s[2]\n" + "fmla v30.4s, v0.4s, v3.s[3]\n" + "fmla v25.4s, v1.4s, v3.s[0]\n" + "fmla v27.4s, v1.4s, v3.s[1]\n" + "fmla v29.4s, v1.4s, v3.s[2]\n" + "fmla v31.4s, v1.4s, v3.s[3]\n" + "fmla v17.4s, v1.4s, v2.s[0]\n" + "fmla v19.4s, v1.4s, v2.s[1]\n" + "fmla v21.4s, v1.4s, v2.s[2]\n" + "fmla v23.4s, v1.4s, v2.s[3]\n" + + // End of accumulation. The registers v16 -- v31 contain the final + // int32 accumulator values of the current 8x8 destination block. + // We now have to compute the final 8-bit values from these int32 + // accumulators, and advance to the next 8x8 block. We intertwine + // these two aspects whenever possible for optimal pipelining, both + // at the data flow level (prefetch data for next block as early as + // possible) and instruction pipelining level (some of the next-block + // work can dual-issue with some of the final work on the current + // block). 
+
+      // Logic to advance to the next block in preparation for the next
+      // iteration of the main loop. For now, we only want to compute
+      // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
+      // not yet ready to update the values of row and col, as we still need
+      // the current values for the rest of the work on the current block.
+
+      "cmp %w[row], w7\n"  // Have we finished the last row?
+      "bge 4f\n"           // If finished last row, go to 4
+      // Not finished last row: then advance to next row.
+      "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
+      "b 5f\n"
+      "4:\n"  // Finished last row...
+      "mov %[lhs_col_ptr], x5\n"  // Go back to first row
+      // Now we need to advance to the next column. If we already
+      // finished the last column, then in principle we are done, however
+      // we can't just return here, as we need to allow the end work of the
+      // current block to complete. The good news is that at this point it
+      // doesn't matter what data we load for the next column, since
+      // we will exit from the main loop below before actually storing
+      // anything computed from that data.
+      "cmp %w[col], w8\n"  // Have we finished the last column?
+      "bge 5f\n"  // If yes, just carry on without updating the column pointer.
+      // Not finished last column: then advance to next column.
+      "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
+      "5:\n"
+
+      // Set the LHS and RHS data pointers to the start of the columns just
+      // computed.
+      "mov %[lhs_ptr], %[lhs_col_ptr]\n"
+      "mov %[rhs_ptr], %[rhs_col_ptr]\n"
+
+      // Load some parameters needed for the end work on current block.
+      "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+      "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
+
+      // Determine the channel index.
+      "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
+      "csel w3, %w[row], %w[col], eq\n"
+
+      // Offset the bias pointer as needed given the current row, col.
+      "add x5, x1, x3, lsl #2\n"
+
+      // If there is no bias, use no offset, just address the passed zero
+      // data.
+      "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
+      "csel x1, x1, x5, eq\n"
+
+      // Load 8 bias values.
+      "ld1 {v14.4s}, [x1], #16\n"
+      "ld1 {v15.4s}, [x1]\n"
+
+      // Now that we know what LHS and RHS data the next iteration of the
+      // main loop will need to load, we start loading the first 32 bytes of
+      // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
+      // in the rest of the work on the current block.
+      "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
+      "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
+      "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
+      "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
+
+      // Perform the bias-addition.
+      // Jump based on channel dimension.
+ "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n" + "bne 6f\n" + // Case where channels are rows + "fadd v16.4s, v16.4s, v14.4s\n" + "fadd v17.4s, v17.4s, v15.4s\n" + "fadd v18.4s, v18.4s, v14.4s\n" + "fadd v19.4s, v19.4s, v15.4s\n" + "fadd v20.4s, v20.4s, v14.4s\n" + "fadd v21.4s, v21.4s, v15.4s\n" + "fadd v22.4s, v22.4s, v14.4s\n" + "fadd v23.4s, v23.4s, v15.4s\n" + "fadd v24.4s, v24.4s, v14.4s\n" + "fadd v25.4s, v25.4s, v15.4s\n" + "fadd v26.4s, v26.4s, v14.4s\n" + "fadd v27.4s, v27.4s, v15.4s\n" + "fadd v28.4s, v28.4s, v14.4s\n" + "fadd v29.4s, v29.4s, v15.4s\n" + "fadd v30.4s, v30.4s, v14.4s\n" + "fadd v31.4s, v31.4s, v15.4s\n" + "b 7f\n" + + "6:\n" + // Case where channels are columns + "dup v8.4s, v14.s[0]\n" + "dup v9.4s, v14.s[1]\n" + "dup v10.4s, v14.s[2]\n" + "dup v11.4s, v14.s[3]\n" + "dup v12.4s, v15.s[0]\n" + "dup v13.4s, v15.s[1]\n" + "dup v14.4s, v15.s[2]\n" + "dup v15.4s, v15.s[3]\n" + "fadd v16.4s, v16.4s, v8.4s\n" + "fadd v17.4s, v17.4s, v8.4s\n" + "fadd v18.4s, v18.4s, v9.4s\n" + "fadd v19.4s, v19.4s, v9.4s\n" + "fadd v20.4s, v20.4s, v10.4s\n" + "fadd v21.4s, v21.4s, v10.4s\n" + "fadd v22.4s, v22.4s, v11.4s\n" + "fadd v23.4s, v23.4s, v11.4s\n" + "fadd v24.4s, v24.4s, v12.4s\n" + "fadd v25.4s, v25.4s, v12.4s\n" + "fadd v26.4s, v26.4s, v13.4s\n" + "fadd v27.4s, v27.4s, v13.4s\n" + "fadd v28.4s, v28.4s, v14.4s\n" + "fadd v29.4s, v29.4s, v14.4s\n" + "fadd v30.4s, v30.4s, v15.4s\n" + "fadd v31.4s, v31.4s, v15.4s\n" + "7:\n" + + // Load the clamp_min, clamp_max bounds + "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n" + "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n" + "dup v14.4s, w2\n" // clamp_min + "dup v15.4s, w3\n" // clamp_max + + // Apply the clamp_min bound + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmax v20.4s, v20.4s, v14.4s\n" + "fmax v21.4s, v21.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v14.4s\n" + "fmax v23.4s, v23.4s, v14.4s\n" + "fmax v24.4s, v24.4s, v14.4s\n" + "fmax v25.4s, v25.4s, v14.4s\n" + "fmax v26.4s, v26.4s, v14.4s\n" + "fmax v27.4s, v27.4s, v14.4s\n" + "fmax v28.4s, v28.4s, v14.4s\n" + "fmax v29.4s, v29.4s, v14.4s\n" + "fmax v30.4s, v30.4s, v14.4s\n" + "fmax v31.4s, v31.4s, v14.4s\n" + + // Apply the clamp_max bound + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "fmin v20.4s, v20.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v15.4s\n" + "fmin v22.4s, v22.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v15.4s\n" + "fmin v24.4s, v24.4s, v15.4s\n" + "fmin v25.4s, v25.4s, v15.4s\n" + "fmin v26.4s, v26.4s, v15.4s\n" + "fmin v27.4s, v27.4s, v15.4s\n" + "fmin v28.4s, v28.4s, v15.4s\n" + "fmin v29.4s, v29.4s, v15.4s\n" + "fmin v30.4s, v30.4s, v15.4s\n" + "fmin v31.4s, v31.4s, v15.4s\n" + + // Compute how much of the 8x8 block of destination 8bit values that + // we have computed, fit in the destination matrix. Typically, all of + // it fits, but when the destination matrix shape is not a multiple + // of 8x8, there are some 8x8 blocks along the boundaries that do + // not fit entirely. + "sub w1, %w[dst_rows], %w[row]\n" + "sub w2, %w[dst_cols], %w[col]\n" + "mov w3, #8\n" + "cmp w1, #8\n" + // Compute w1 = how many rows of the 8x8 block fit + "csel w1, w1, w3, le\n" + "cmp w2, #8\n" + // Compute w2 = how many cols of the 8x8 block fit + "csel w2, w2, w3, le\n" + + // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits. 
+ "cmp w1, w3\n" + "ccmp w2, w3, 0, eq\n" + // Yes, all of the 8x8 block fits, go to fast path. + "beq 30f\n" + // Not all of the 8x8 block fits. + // Set (x3 address, x4 stride) to write to dst_tmp_buf + "mov x3, %[dst_tmp_buf]\n" + "mov x4, #32\n" + "b 31f\n" + "30:\n" + // Yes, all of the 8x8 block fits. + // Set (x3 address, x4 stride) to write directly to destination matrix. + "mov x3, %[dst_ptr]\n" + "mov x4, x11\n" + "31:\n" + + // Write our 8bit values to the destination described by + // (x3 address, x4 stride). + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + "str q16, [x3, #0]\n" + "str q17, [x3, #16]\n" + "add x3, x3, x4\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + RUY_MAKE_ZERO(v16) + RUY_MAKE_ZERO(v17) + "str q18, [x3, #0]\n" + "str q19, [x3, #16]\n" + "add x3, x3, x4\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + RUY_MAKE_ZERO(v18) + RUY_MAKE_ZERO(v19) + "str q20, [x3, #0]\n" + "str q21, [x3, #16]\n" + "add x3, x3, x4\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + RUY_MAKE_ZERO(v20) + RUY_MAKE_ZERO(v21) + "str q22, [x3, #0]\n" + "str q23, [x3, #16]\n" + "add x3, x3, x4\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + RUY_MAKE_ZERO(v22) + RUY_MAKE_ZERO(v23) + "str q24, [x3, #0]\n" + "str q25, [x3, #16]\n" + "add x3, x3, x4\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + RUY_MAKE_ZERO(v24) + RUY_MAKE_ZERO(v25) + "str q26, [x3, #0]\n" + "str q27, [x3, #16]\n" + "add x3, x3, x4\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + RUY_MAKE_ZERO(v26) + RUY_MAKE_ZERO(v27) + "str q28, [x3, #0]\n" + "str q29, [x3, #16]\n" + "add x3, x3, x4\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n") + RUY_MAKE_ZERO(v28) + RUY_MAKE_ZERO(v29) + "str q30, [x3, #0]\n" + "str q31, [x3, #16]\n" + RUY_MAKE_ZERO(v30) + RUY_MAKE_ZERO(v31) + + // If all of the 8x8 block fits, we just finished writing it to the + // destination, so we skip the next part. + "beq 41f\n" + // Not all of the 8x8 block fits in the destination matrix. We just + // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over + // it to copy into the destination matrix the part that fits. + "mov x3, %[dst_tmp_buf]\n" + "mov x4, %[dst_ptr]\n" + "mov w6, #0\n" + "50:\n" + RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n") + "mov w5, #0\n" + "51:\n" + "ldr w7, [x3, x5, lsl #2]\n" + "str w7, [x4, x5, lsl #2]\n" + "add w5, w5, #1\n" + "cmp w5, w1\n" + "blt 51b\n" + "add w6, w6, #1\n" + "add x3, x3, #32\n" + "add x4, x4, x11\n" + "cmp w6, w2\n" + "blt 50b\n" + "41:\n" + "add %[dst_ptr], %[dst_ptr], #32\n" + // At this point we have completely finished writing values to the + // destination matrix for the current block. + + // Reload some params --- we had used x5 -- x7 for a few other things + // since the last time we had loaded them. + "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n" + "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n" + "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n" + + // Move to the next block of the destination matrix, for the next iter + // of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already + // been updated earlier. + // Have we reached the end row? + "cmp %w[row], w7\n" + "beq 20f\n" // yes, end row. + // Not end row. Move to the next row. + "add %w[row], %w[row], #8\n" + "b 21f\n" + "20:\n" + // Was already at end row. + "mov %w[row], w6\n" // Move back to first row. + "add %w[col], %w[col], #8\n" // Move to the next column. 
+ "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n" + "mov %[dst_ptr], %[dst_col_ptr]\n" + "21:\n" + + // Main loop exit condition: have we hit the end column? + "cmp %w[col], w8\n" + + // w1 is the number of levels of depth that we have already loaded + // LHS and RHS data for. Corresponding to the initial ld1 instructions + // above, this is currently 1. + "mov w1, #1\n" + + "ble 1b\n" + + // clang-format on + + : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr), + [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), + [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col) + : [ params ] "r"(¶ms), [dst_rows] "r"(params.dst_rows), + [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf) + : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", + "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); +} + // Variant of KernelFloatNeon tuned for in-order CPUs that do not // support dotprod (while dotprod by itself is not relevant to floating-point, // this additional bit of information that we have about the target happens to |