diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-31 14:56:25 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-31 14:57:31 -0700 |
commit | d44af71e28bc26e2df6fb291977a1879a642f078 (patch) | |
tree | 548d2cf061388b7a4ceb36b7a248e83df5cda9dc | |
parent | b56380ce0a10dd528507198c3f720c4b85f020e3 (diff) | |
download | XNNPACK-d44af71e28bc26e2df6fb291977a1879a642f078.tar.gz |
filterbank_accumulate: output 1 fewer value
- The initial accumulation value is not output.
- Pass 1 less for rows; rows is the number of output values.
PiperOrigin-RevId: 471363593
-rw-r--r-- | bench/u32-filterbank-accumulate.cc | 7 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/aarch32-arm-x1.S | 34 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/aarch32-neon-x1.S | 37 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/aarch32-neon-x2.S | 27 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/gen/neon-x1.c | 17 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/gen/neon-x2.c | 17 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/gen/scalar-x1.c | 14 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/neon.c.in | 15 | ||||
-rw-r--r-- | src/u32-filterbank-accumulate/scalar.c.in | 14 | ||||
-rw-r--r-- | test/filterbank-accumulate-microkernel-tester.h | 8 |
10 files changed, 168 insertions, 22 deletions
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc index 5f2818b7b..90d74c732 100644 --- a/bench/u32-filterbank-accumulate.cc +++ b/bench/u32-filterbank-accumulate.cc @@ -29,10 +29,11 @@ void filterbank_accumulate( } const size_t rows = state.range(0); const size_t batch = state.range(1); + const size_t input_size = (rows + 1) * batch; - std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> input(batch); - std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> weight_widths(rows); - std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(batch * 2); + std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> input(input_size); + std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> weight_widths(rows + 1); + std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(input_size * 2); std::vector<uint64_t, AlignedAllocator<uint64_t, 64>> output(rows); std::iota(input.begin(), input.end(), 0); std::fill(weight_widths.begin(), weight_widths.end(), batch); diff --git a/src/u32-filterbank-accumulate/aarch32-arm-x1.S b/src/u32-filterbank-accumulate/aarch32-arm-x1.S index ba349bca0..2474082e2 100644 --- a/src/u32-filterbank-accumulate/aarch32-arm-x1.S +++ b/src/u32-filterbank-accumulate/aarch32-arm-x1.S @@ -31,13 +31,29 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1 #endif LDR r12, [sp] // output PUSH {r4,r5,r6,r7,r8,r9,r10,r11} // push 32 bytes + MOV r8, 0 // weight_accumulator MOV r9, 0 + + // Compute unweight as initial weight + LDRB r4, [r2], #1 // weight_widths 0: + LDR r5, [r3], #4 // weight+unweight + LDR r6, [r1], #4 // input + SUBS r4, r4, #1 + UXTH r5, r5, ror #16 // unweight + UMLAL r8, r9, r6, r5 // initial weight_accumulator + BHI 0b + + SUBS r0, r0, #1 + BLS 3f + +1: LDRB r4, [r2], #1 // weight_widths MOV r10, 0 // unweight_accumulator MOV r11, 0 -1: + +2: LDR r5, [r3], #4 // weight+unweight LDR r6, [r1], #4 // input SUBS r4, r4, #1 @@ -45,13 +61,25 @@ BEGIN_FUNCTION 
xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1 UXTH r5, r5, ror #16 // unweight UMLAL r8, r9, r6, r7 // weight_accumulator UMLAL r10, r11, r6, r5 // unweight_accumulator - BHI 1b + BHI 2b STMIA r12!, {r8, r9} SUBS r0, r0, #1 MOV r8, r10 // weight_accumulator = unweight_accumulator MOV r9, r11 - BNE 0b + BHI 1b + +3: + LDRB r4, [r2], #1 // weight_widths +4: + LDR r5, [r3], #4 // weight+unweight + LDR r6, [r1], #4 // input + SUBS r4, r4, #1 + UXTH r7, r5 // weight + UMLAL r8, r9, r6, r7 // weight_accumulator + BHI 4b + + STMIA r12!, {r8, r9} POP {r4,r5,r6,r7,r8,r9,r10,r11} BX lr diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S index 55011c80d..b3c4cd10e 100644 --- a/src/u32-filterbank-accumulate/aarch32-neon-x1.S +++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S @@ -31,22 +31,51 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 #endif LDR r12, [sp] // output PUSH {r4,lr} // push 8 bytes + VMOV.U8 d0, #0 // weight_accumulator -0: + + // Compute unweight as initial weight LDRB r4, [r2], #1 // weight_widths VMOV.U8 d1, #0 // unweight_accumulator +0: + VLD1.32 {d3[]}, [r3]! // weight+unweight + VLD1.32 {d2[]}, [r1]! // input + SUBS r4, r4, #1 + VMOVL.U16 q2, d3 + VMLAL.U32 q0, d2, d4[1] // unweight + BHI 0b + + SUBS r0, r0, #1 + BLS 3f + 1: - VLD1.32 {d3[]}, [r3]! // weights + LDRB r4, [r2], #1 // weight_widths + VMOV.U8 d1, #0 // unweight_accumulator +2: + VLD1.32 {d3[]}, [r3]! // weight+unweight VLD1.32 {d2[]}, [r1]! // input SUBS r4, r4, #1 VMOVL.U16 q2, d3 VMLAL.U32 q0, d4, d2 - BHI 1b + BHI 2b VST1.64 {d0}, [r12]! SUBS r0, r0, #1 VMOV d0, d1 - BNE 0b + BNE 1b + +3: + // Final row only compute weight + LDRB r4, [r2], #1 // weight_widths +4: + VLD1.32 {d3[]}, [r3]! // weight+unweight + VLD1.32 {d2[]}, [r1]! // input + SUBS r4, r4, #1 + VMOVL.U16 q2, d3 + VMLAL.U32 q0, d2, d4[0] // weight + BHI 4b + + VST1.64 {d0}, [r12]! 
POP {r4,pc} diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S index 840d7c9f3..0f2419897 100644 --- a/src/u32-filterbank-accumulate/aarch32-neon-x2.S +++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S @@ -32,33 +32,46 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 LDR r12, [sp] // output PUSH {r4,lr} // push 8 bytes VMOV.U8 d0, #0 // weight_accumulator + + // Compute unweight as initial weight + LDRB r4, [r2], #1 // weight_widths + VMOV.U8 d1, #0 // unweight_accumulator 0: + VLD1.32 {d3[]}, [r3]! // weight+unweight + VLD1.32 {d2[]}, [r1]! // input + SUBS r4, r4, #1 + VMOVL.U16 q2, d3 + VMLAL.U32 q0, d2, d4[1] // unweight + BHI 0b + +1: LDRB r4, [r2], #1 // weight_widths SUBS r4, r4, #1 VMOV.U8 d1, #0 // unweight_accumulator - BLS 2f // less than 2 weights? + BLS 3f // less than 2 weights? -1: +2: VLD1.16 {d3}, [r3]! // weights VLD1.32 {d2}, [r1]! // input SUBS r4, r4, #2 VMOVL.U16 q2, d3 VMLAL.U32 q0, d4, d2[0] VMLAL.U32 q0, d5, d2[1] - BHI 1b + BHI 2b - BLO 3f // is there a remainder? -2: + BLO 4f // is there a remainder? +3: VLD1.32 {d3[]}, [r3]! // weights VLD1.32 {d2[]}, [r1]! // input VMOVL.U16 q2, d3 VMLAL.U32 q0, d4, d2 -3: +4: VST1.64 {d0}, [r12]! 
+ SUBS r0, r0, #1 VMOV d0, d1 - BNE 0b + BNE 1b POP {r4,pc} diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c index 0a59096c4..1fa5ba7a1 100644 --- a/src/u32-filterbank-accumulate/gen/neon-x1.c +++ b/src/u32-filterbank-accumulate/gen/neon-x1.c @@ -32,8 +32,23 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1( uint64x2_t weight_accumulator = vdupq_n_u64(0); + + // Compute unweight as initial weight + size_t n = (size_t) *weight_widths++; + assert(n != 0); + + do { + const uint32x2_t vi = vld1_dup_u32(input); input += 1; + const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; + const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); + + weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); + } while (--n != 0); + + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); + do { - size_t n = (size_t) *weight_widths++; + n = (size_t) *weight_widths++; assert(n != 0); do { diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c index 743244a83..a88a1cdd4 100644 --- a/src/u32-filterbank-accumulate/gen/neon-x2.c +++ b/src/u32-filterbank-accumulate/gen/neon-x2.c @@ -32,8 +32,23 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2( uint64x2_t weight_accumulator = vdupq_n_u64(0); + + // Compute unweight as initial weight + size_t n = (size_t) *weight_widths++; + assert(n != 0); + + do { + const uint32x2_t vi = vld1_dup_u32(input); input += 1; + const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; + const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); + + weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); + } while (--n != 0); + + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); + do { - size_t n = (size_t) *weight_widths++; + n = (size_t) *weight_widths++; assert(n != 0); for (;n >= 2; n -= 2) { diff --git 
a/src/u32-filterbank-accumulate/gen/scalar-x1.c b/src/u32-filterbank-accumulate/gen/scalar-x1.c index 8ec5c0771..cdffb619c 100644 --- a/src/u32-filterbank-accumulate/gen/scalar-x1.c +++ b/src/u32-filterbank-accumulate/gen/scalar-x1.c @@ -31,6 +31,20 @@ void xnn_u32_filterbank_accumulate_ukernel__scalar_x1( uint64_t weight_accumulator = 0; uint64_t unweight_accumulator = 0; + // compute unweight as initial weight + size_t n = (size_t) *weight_widths++; + assert(n != 0); + do { + const uint32_t vi = *input++; + const uint32_t vu = (uint32_t) weights[1]; // unweight + weights += 2; + + const uint64_t vuacc = math_mulext_u32(vi, vu); + + weight_accumulator += vuacc; + + } while (--n != 0); + do { size_t n = (size_t) *weight_widths++; assert(n != 0); diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in index 77d88bea6..c45572f5a 100644 --- a/src/u32-filterbank-accumulate/neon.c.in +++ b/src/u32-filterbank-accumulate/neon.c.in @@ -28,6 +28,21 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}( uint64x2_t weight_accumulator = vdupq_n_u64(0); + + // Compute unweight as initial weight + size_t n = (size_t) *weight_widths++; + assert(n != 0); + + do { + const uint32x2_t vi = vld1_dup_u32(input); input += 1; + const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2; + const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw)); + + weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi); + } while (--n != 0); + + weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0)); + do { size_t n = (size_t) *weight_widths++; assert(n != 0); diff --git a/src/u32-filterbank-accumulate/scalar.c.in b/src/u32-filterbank-accumulate/scalar.c.in index b9dc80c1a..903351a6a 100644 --- a/src/u32-filterbank-accumulate/scalar.c.in +++ b/src/u32-filterbank-accumulate/scalar.c.in @@ -28,6 +28,20 @@ void xnn_u32_filterbank_accumulate_ukernel__scalar_x${BATCH_TILE}( uint64_t 
weight_accumulator = 0; uint64_t unweight_accumulator = 0; + // compute unweight as initial weight + size_t n = (size_t) *weight_widths++; + assert(n != 0); + do { + const uint32_t vi = *input++; + const uint32_t vu = (uint32_t) weights[1]; // unweight + weights += 2; + + const uint64_t vuacc = math_mulext_u32(vi, vu); + + weight_accumulator += vuacc; + + } while (--n != 0); + do { size_t n = (size_t) *weight_widths++; assert(n != 0); diff --git a/test/filterbank-accumulate-microkernel-tester.h b/test/filterbank-accumulate-microkernel-tester.h index 61e427893..68bbd4a82 100644 --- a/test/filterbank-accumulate-microkernel-tester.h +++ b/test/filterbank-accumulate-microkernel-tester.h @@ -48,7 +48,7 @@ class FilterbankAccumulateMicrokernelTester { auto u16rng = std::bind(std::uniform_int_distribution<uint16_t>(), std::ref(rng)); auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), std::ref(rng)); - std::vector<uint8_t> filterbank_widths(rows()); + std::vector<uint8_t> filterbank_widths(rows() + 1); std::vector<uint64_t> output(rows()); std::vector<uint64_t> output_ref(rows()); @@ -65,14 +65,16 @@ class FilterbankAccumulateMicrokernelTester { uint64_t weight_accumulator = 0; uint64_t unweight_accumulator = 0; size_t i = 0; - for (size_t m = 0; m < rows(); m++) { + for (size_t m = 0; m <= rows(); m++) { const size_t weight_width = filterbank_widths[m]; for (size_t n = 0; n < weight_width; n++) { weight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2]); unweight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2 + 1]); i += 1; } - output_ref[m] = weight_accumulator; + if (m != 0) { + output_ref[m - 1] = weight_accumulator; + } weight_accumulator = unweight_accumulator; unweight_accumulator = 0; } |