aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2022-08-31 14:56:25 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-31 14:57:31 -0700
commit: d44af71e28bc26e2df6fb291977a1879a642f078 (patch)
tree: 548d2cf061388b7a4ceb36b7a248e83df5cda9dc
parent: b56380ce0a10dd528507198c3f720c4b85f020e3 (diff)
download: XNNPACK-d44af71e28bc26e2df6fb291977a1879a642f078.tar.gz
filterbank_accumulate: output one fewer value
- The initial accumulation value is not output. - Pass 1 less for rows; rows is the number of output values. PiperOrigin-RevId: 471363593
-rw-r--r-- bench/u32-filterbank-accumulate.cc 7
-rw-r--r-- src/u32-filterbank-accumulate/aarch32-arm-x1.S 34
-rw-r--r-- src/u32-filterbank-accumulate/aarch32-neon-x1.S 37
-rw-r--r-- src/u32-filterbank-accumulate/aarch32-neon-x2.S 27
-rw-r--r-- src/u32-filterbank-accumulate/gen/neon-x1.c 17
-rw-r--r-- src/u32-filterbank-accumulate/gen/neon-x2.c 17
-rw-r--r-- src/u32-filterbank-accumulate/gen/scalar-x1.c 14
-rw-r--r-- src/u32-filterbank-accumulate/neon.c.in 15
-rw-r--r-- src/u32-filterbank-accumulate/scalar.c.in 14
-rw-r--r-- test/filterbank-accumulate-microkernel-tester.h 8
10 files changed, 168 insertions, 22 deletions
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc
index 5f2818b7b..90d74c732 100644
--- a/bench/u32-filterbank-accumulate.cc
+++ b/bench/u32-filterbank-accumulate.cc
@@ -29,10 +29,11 @@ void filterbank_accumulate(
}
const size_t rows = state.range(0);
const size_t batch = state.range(1);
+ const size_t input_size = (rows + 1) * batch;
- std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> input(batch);
- std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> weight_widths(rows);
- std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(batch * 2);
+ std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> input(input_size);
+ std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> weight_widths(rows + 1);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(input_size * 2);
std::vector<uint64_t, AlignedAllocator<uint64_t, 64>> output(rows);
std::iota(input.begin(), input.end(), 0);
std::fill(weight_widths.begin(), weight_widths.end(), batch);
diff --git a/src/u32-filterbank-accumulate/aarch32-arm-x1.S b/src/u32-filterbank-accumulate/aarch32-arm-x1.S
index ba349bca0..2474082e2 100644
--- a/src/u32-filterbank-accumulate/aarch32-arm-x1.S
+++ b/src/u32-filterbank-accumulate/aarch32-arm-x1.S
@@ -31,13 +31,29 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1
#endif
LDR r12, [sp] // output
PUSH {r4,r5,r6,r7,r8,r9,r10,r11} // push 32 bytes
+
MOV r8, 0 // weight_accumulator
MOV r9, 0
+
+ // Compute unweight as initial weight
+ LDRB r4, [r2], #1 // weight_widths
0:
+ LDR r5, [r3], #4 // weight+unweight
+ LDR r6, [r1], #4 // input
+ SUBS r4, r4, #1
+ UXTH r5, r5, ror #16 // unweight
+ UMLAL r8, r9, r6, r5 // initial weight_accumulator
+ BHI 0b
+
+ SUBS r0, r0, #1
+ BLS 3f
+
+1:
LDRB r4, [r2], #1 // weight_widths
MOV r10, 0 // unweight_accumulator
MOV r11, 0
-1:
+
+2:
LDR r5, [r3], #4 // weight+unweight
LDR r6, [r1], #4 // input
SUBS r4, r4, #1
@@ -45,13 +61,25 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1
UXTH r5, r5, ror #16 // unweight
UMLAL r8, r9, r6, r7 // weight_accumulator
UMLAL r10, r11, r6, r5 // unweight_accumulator
- BHI 1b
+ BHI 2b
STMIA r12!, {r8, r9}
SUBS r0, r0, #1
MOV r8, r10 // weight_accumulator = unweight_accumulator
MOV r9, r11
- BNE 0b
+ BHI 1b
+
+3:
+ LDRB r4, [r2], #1 // weight_widths
+4:
+ LDR r5, [r3], #4 // weight+unweight
+ LDR r6, [r1], #4 // input
+ SUBS r4, r4, #1
+ UXTH r7, r5 // weight
+ UMLAL r8, r9, r6, r7 // weight_accumulator
+ BHI 4b
+
+ STMIA r12!, {r8, r9}
POP {r4,r5,r6,r7,r8,r9,r10,r11}
BX lr
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
index 55011c80d..b3c4cd10e 100644
--- a/src/u32-filterbank-accumulate/aarch32-neon-x1.S
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
@@ -31,22 +31,51 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
#endif
LDR r12, [sp] // output
PUSH {r4,lr} // push 8 bytes
+
VMOV.U8 d0, #0 // weight_accumulator
-0:
+
+ // Compute unweight as initial weight
LDRB r4, [r2], #1 // weight_widths
VMOV.U8 d1, #0 // unweight_accumulator
+0:
+ VLD1.32 {d3[]}, [r3]! // weight+unweight
+ VLD1.32 {d2[]}, [r1]! // input
+ SUBS r4, r4, #1
+ VMOVL.U16 q2, d3
+ VMLAL.U32 q0, d2, d4[1] // unweight
+ BHI 0b
+
+ SUBS r0, r0, #1
+ BLS 3f
+
1:
- VLD1.32 {d3[]}, [r3]! // weights
+ LDRB r4, [r2], #1 // weight_widths
+ VMOV.U8 d1, #0 // unweight_accumulator
+2:
+ VLD1.32 {d3[]}, [r3]! // weight+unweight
VLD1.32 {d2[]}, [r1]! // input
SUBS r4, r4, #1
VMOVL.U16 q2, d3
VMLAL.U32 q0, d4, d2
- BHI 1b
+ BHI 2b
VST1.64 {d0}, [r12]!
SUBS r0, r0, #1
VMOV d0, d1
- BNE 0b
+ BNE 1b
+
+3:
+ // Final row only compute weight
+ LDRB r4, [r2], #1 // weight_widths
+4:
+ VLD1.32 {d3[]}, [r3]! // weight+unweight
+ VLD1.32 {d2[]}, [r1]! // input
+ SUBS r4, r4, #1
+ VMOVL.U16 q2, d3
+ VMLAL.U32 q0, d2, d4[0] // weight
+ BHI 4b
+
+ VST1.64 {d0}, [r12]!
POP {r4,pc}
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
index 840d7c9f3..0f2419897 100644
--- a/src/u32-filterbank-accumulate/aarch32-neon-x2.S
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
@@ -32,33 +32,46 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
LDR r12, [sp] // output
PUSH {r4,lr} // push 8 bytes
VMOV.U8 d0, #0 // weight_accumulator
+
+ // Compute unweight as initial weight
+ LDRB r4, [r2], #1 // weight_widths
+ VMOV.U8 d1, #0 // unweight_accumulator
0:
+ VLD1.32 {d3[]}, [r3]! // weight+unweight
+ VLD1.32 {d2[]}, [r1]! // input
+ SUBS r4, r4, #1
+ VMOVL.U16 q2, d3
+ VMLAL.U32 q0, d2, d4[1] // unweight
+ BHI 0b
+
+1:
LDRB r4, [r2], #1 // weight_widths
SUBS r4, r4, #1
VMOV.U8 d1, #0 // unweight_accumulator
- BLS 2f // less than 2 weights?
+ BLS 3f // less than 2 weights?
-1:
+2:
VLD1.16 {d3}, [r3]! // weights
VLD1.32 {d2}, [r1]! // input
SUBS r4, r4, #2
VMOVL.U16 q2, d3
VMLAL.U32 q0, d4, d2[0]
VMLAL.U32 q0, d5, d2[1]
- BHI 1b
+ BHI 2b
- BLO 3f // is there a remainder?
-2:
+ BLO 4f // is there a remainder?
+3:
VLD1.32 {d3[]}, [r3]! // weights
VLD1.32 {d2[]}, [r1]! // input
VMOVL.U16 q2, d3
VMLAL.U32 q0, d4, d2
-3:
+4:
VST1.64 {d0}, [r12]!
+
SUBS r0, r0, #1
VMOV d0, d1
- BNE 0b
+ BNE 1b
POP {r4,pc}
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
index 0a59096c4..1fa5ba7a1 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x1.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -32,8 +32,23 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+ // Compute unweight as initial weight
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+
+ do {
+ const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+ const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+ const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+ weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+ } while (--n != 0);
+
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
do {
- size_t n = (size_t) *weight_widths++;
+ n = (size_t) *weight_widths++;
assert(n != 0);
do {
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index 743244a83..a88a1cdd4 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -32,8 +32,23 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+ // Compute unweight as initial weight
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+
+ do {
+ const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+ const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+ const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+ weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+ } while (--n != 0);
+
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
do {
- size_t n = (size_t) *weight_widths++;
+ n = (size_t) *weight_widths++;
assert(n != 0);
for (;n >= 2; n -= 2) {
diff --git a/src/u32-filterbank-accumulate/gen/scalar-x1.c b/src/u32-filterbank-accumulate/gen/scalar-x1.c
index 8ec5c0771..cdffb619c 100644
--- a/src/u32-filterbank-accumulate/gen/scalar-x1.c
+++ b/src/u32-filterbank-accumulate/gen/scalar-x1.c
@@ -31,6 +31,20 @@ void xnn_u32_filterbank_accumulate_ukernel__scalar_x1(
uint64_t weight_accumulator = 0;
uint64_t unweight_accumulator = 0;
+ // compute unweight as initial weight
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+ do {
+ const uint32_t vi = *input++;
+ const uint32_t vu = (uint32_t) weights[1]; // unweight
+ weights += 2;
+
+ const uint64_t vuacc = math_mulext_u32(vi, vu);
+
+ weight_accumulator += vuacc;
+
+ } while (--n != 0);
+
do {
size_t n = (size_t) *weight_widths++;
assert(n != 0);
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index 77d88bea6..c45572f5a 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -28,6 +28,21 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
uint64x2_t weight_accumulator = vdupq_n_u64(0);
+
+ // Compute unweight as initial weight
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+
+ do {
+ const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+ const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+ const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+ weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+ } while (--n != 0);
+
+ weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
do {
size_t n = (size_t) *weight_widths++;
assert(n != 0);
diff --git a/src/u32-filterbank-accumulate/scalar.c.in b/src/u32-filterbank-accumulate/scalar.c.in
index b9dc80c1a..903351a6a 100644
--- a/src/u32-filterbank-accumulate/scalar.c.in
+++ b/src/u32-filterbank-accumulate/scalar.c.in
@@ -28,6 +28,20 @@ void xnn_u32_filterbank_accumulate_ukernel__scalar_x${BATCH_TILE}(
uint64_t weight_accumulator = 0;
uint64_t unweight_accumulator = 0;
+ // compute unweight as initial weight
+ size_t n = (size_t) *weight_widths++;
+ assert(n != 0);
+ do {
+ const uint32_t vi = *input++;
+ const uint32_t vu = (uint32_t) weights[1]; // unweight
+ weights += 2;
+
+ const uint64_t vuacc = math_mulext_u32(vi, vu);
+
+ weight_accumulator += vuacc;
+
+ } while (--n != 0);
+
do {
size_t n = (size_t) *weight_widths++;
assert(n != 0);
diff --git a/test/filterbank-accumulate-microkernel-tester.h b/test/filterbank-accumulate-microkernel-tester.h
index 61e427893..68bbd4a82 100644
--- a/test/filterbank-accumulate-microkernel-tester.h
+++ b/test/filterbank-accumulate-microkernel-tester.h
@@ -48,7 +48,7 @@ class FilterbankAccumulateMicrokernelTester {
auto u16rng = std::bind(std::uniform_int_distribution<uint16_t>(), std::ref(rng));
auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), std::ref(rng));
- std::vector<uint8_t> filterbank_widths(rows());
+ std::vector<uint8_t> filterbank_widths(rows() + 1);
std::vector<uint64_t> output(rows());
std::vector<uint64_t> output_ref(rows());
@@ -65,14 +65,16 @@ class FilterbankAccumulateMicrokernelTester {
uint64_t weight_accumulator = 0;
uint64_t unweight_accumulator = 0;
size_t i = 0;
- for (size_t m = 0; m < rows(); m++) {
+ for (size_t m = 0; m <= rows(); m++) {
const size_t weight_width = filterbank_widths[m];
for (size_t n = 0; n < weight_width; n++) {
weight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2]);
unweight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2 + 1]);
i += 1;
}
- output_ref[m] = weight_accumulator;
+ if (m != 0) {
+ output_ref[m - 1] = weight_accumulator;
+ }
weight_accumulator = unweight_accumulator;
unweight_accumulator = 0;
}