about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marat Dukhan <maratek@google.com> 2022-08-31 16:43:46 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-31 16:44:44 -0700
commit: 7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88 (patch)
tree: 75d61524a93611f0cfea59bb1a991e797f3e9c33
parent: d04252a13ed3cef5bd204d1e327fadae5020de43 (diff)
download: XNNPACK-7497ff1ee99c4296c3bcb1d3765fa7e9851dfe88.tar.gz
Minor optimization in NEON U32 FILTERBANK-ACCUMULATE
PiperOrigin-RevId: 471388090
-rw-r--r-- src/u32-filterbank-accumulate/gen/neon-x1.c | 10
-rw-r--r-- src/u32-filterbank-accumulate/gen/neon-x2.c | 12
-rw-r--r-- src/u32-filterbank-accumulate/neon.c.in | 15
3 files changed, 13 insertions, 24 deletions
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
index 1fa5ba7a1..306bc191e 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x1.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -30,12 +30,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
assert(weights != NULL);
assert(output != NULL);
- uint64x2_t weight_accumulator = vdupq_n_u64(0);
-
-
// Compute unweight as initial weight
size_t n = (size_t) *weight_widths++;
assert(n != 0);
+ uint64x2_t weight_accumulator = vdupq_n_u64(0);
do {
const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -45,11 +43,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
} while (--n != 0);
- weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
-
do {
- n = (size_t) *weight_widths++;
+ size_t n = (size_t) *weight_widths++;
assert(n != 0);
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
do {
const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -60,7 +57,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
} while (--n != 0);
vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1;
- weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
} while (--rows != 0);
}
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index a88a1cdd4..b0804d592 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -30,12 +30,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
assert(weights != NULL);
assert(output != NULL);
- uint64x2_t weight_accumulator = vdupq_n_u64(0);
-
-
// Compute unweight as initial weight
size_t n = (size_t) *weight_widths++;
assert(n != 0);
+ uint64x2_t weight_accumulator = vdupq_n_u64(0);
do {
const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -45,13 +43,12 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
} while (--n != 0);
- weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
-
do {
- n = (size_t) *weight_widths++;
+ size_t n = (size_t) *weight_widths++;
assert(n != 0);
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
- for (;n >= 2; n -= 2) {
+ for (; n >= 2; n -= 2) {
const uint32x2_t vi = vld1_u32(input); input += 2;
const uint16x4_t vw = vld1_u16(weights); weights += 4;
const uint32x4_t vw32 = vmovl_u16(vw);
@@ -69,7 +66,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
}
vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1;
- weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
} while (--rows != 0);
}
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index c45572f5a..c4e95eba6 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert BATCH_TILE in [1, 2]
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
@@ -26,12 +27,10 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
assert(weights != NULL);
assert(output != NULL);
- uint64x2_t weight_accumulator = vdupq_n_u64(0);
-
-
// Compute unweight as initial weight
size_t n = (size_t) *weight_widths++;
assert(n != 0);
+ uint64x2_t weight_accumulator = vdupq_n_u64(0);
do {
const uint32x2_t vi = vld1_dup_u32(input); input += 1;
@@ -41,14 +40,13 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
} while (--n != 0);
- weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
-
do {
size_t n = (size_t) *weight_widths++;
assert(n != 0);
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
- $if BATCH_TILE > 1:
- for (;n >= 2; n -= 2) {
+ $if BATCH_TILE == 2:
+ for (; n >= 2; n -= 2) {
const uint32x2_t vi = vld1_u32(input); input += 2;
const uint16x4_t vw = vld1_u16(weights); weights += 4;
const uint32x4_t vw32 = vmovl_u16(vw);
@@ -64,7 +62,7 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
}
- $else:
+ $elif BATCH_TILE == 1:
do {
const uint32x2_t vi = vld1_dup_u32(input); input += 1;
const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
@@ -74,7 +72,6 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
} while (--n != 0);
vst1_u64(output, vget_low_u64(weight_accumulator)); output += 1;
- weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
} while (--rows != 0);
}