filterbank_accumulate: output one fewer value

- The initial accumulation value is no longer written to the output. - Callers pass one less for rows; rows is now the number of output values. PiperOrigin-RevId: 471363593
author: Frank Barchard <fbarchard@google.com> 2022-08-31 14:56:25 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-08-31 14:57:31 -0700
commit: d44af71e28bc26e2df6fb291977a1879a642f078 (patch)
tree: 548d2cf061388b7a4ceb36b7a248e83df5cda9dc
parent: b56380ce0a10dd528507198c3f720c4b85f020e3 (diff)
download: XNNPACK-d44af71e28bc26e2df6fb291977a1879a642f078.tar.gz
10 files changed, 168 insertions, 22 deletions
diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc
index 5f2818b7b..90d74c732 100644
--- a/bench/u32-filterbank-accumulate.cc
+++ b/bench/u32-filterbank-accumulate.cc
@@ -29,10 +29,11 @@ void filterbank_accumulate(
   }
   const size_t rows = state.range(0);
   const size_t batch = state.range(1);
+  const size_t input_size = (rows + 1) * batch;
 
-  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> input(batch);
-  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> weight_widths(rows);
-  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(batch * 2);
+  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> input(input_size);
+  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> weight_widths(rows + 1);
+  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(input_size * 2);
   std::vector<uint64_t, AlignedAllocator<uint64_t, 64>> output(rows);
   std::iota(input.begin(), input.end(), 0);
   std::fill(weight_widths.begin(), weight_widths.end(), batch);
diff --git a/src/u32-filterbank-accumulate/aarch32-arm-x1.S b/src/u32-filterbank-accumulate/aarch32-arm-x1.S
index ba349bca0..2474082e2 100644
--- a/src/u32-filterbank-accumulate/aarch32-arm-x1.S
+++ b/src/u32-filterbank-accumulate/aarch32-arm-x1.S
@@ -31,13 +31,29 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1
 #endif
         LDR     r12, [sp]               // output
         PUSH    {r4,r5,r6,r7,r8,r9,r10,r11}  // push 32 bytes
+
         MOV     r8, 0                   // weight_accumulator
         MOV     r9, 0
+
+        // Compute unweight as initial weight
+        LDRB    r4, [r2], #1            // weight_widths
 0:
+        LDR     r5, [r3], #4            // weight+unweight
+        LDR     r6, [r1], #4            // input
+        SUBS    r4, r4, #1
+        UXTH    r5, r5, ror #16         // unweight
+        UMLAL   r8, r9, r6, r5          // initial weight_accumulator
+        BHI     0b
+
+        SUBS    r0, r0, #1
+        BLS     3f
+
+1:
         LDRB    r4, [r2], #1            // weight_widths
         MOV     r10, 0                  // unweight_accumulator
         MOV     r11, 0
-1:
+
+2:
         LDR     r5, [r3], #4            // weight+unweight
         LDR     r6, [r1], #4            // input
         SUBS    r4, r4, #1
@@ -45,13 +61,25 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_arm_x1
         UXTH    r5, r5, ror #16         // unweight
         UMLAL   r8,  r9,  r6, r7        // weight_accumulator
         UMLAL   r10, r11, r6, r5        // unweight_accumulator
-        BHI     1b
+        BHI     2b
 
         STMIA   r12!, {r8, r9}
         SUBS    r0, r0, #1
         MOV     r8, r10                 // weight_accumulator = unweight_accumulator
         MOV     r9, r11
-        BNE     0b
+        BHI     1b
+
+3:
+        LDRB    r4, [r2], #1            // weight_widths
+4:
+        LDR     r5, [r3], #4            // weight+unweight
+        LDR     r6, [r1], #4            // input
+        SUBS    r4, r4, #1
+        UXTH    r7, r5                  // weight
+        UMLAL   r8,  r9,  r6, r7        // weight_accumulator
+        BHI     4b
+
+        STMIA   r12!, {r8, r9}
 
         POP     {r4,r5,r6,r7,r8,r9,r10,r11}
         BX      lr
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
index 55011c80d..b3c4cd10e 100644
--- a/src/u32-filterbank-accumulate/aarch32-neon-x1.S
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S
@@ -31,22 +31,51 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1
 #endif
         LDR     r12, [sp]               // output
         PUSH    {r4,lr}                 // push 8 bytes
+
         VMOV.U8 d0, #0                  // weight_accumulator
-0:
+
+        // Compute unweight as initial weight
         LDRB    r4, [r2], #1            // weight_widths
         VMOV.U8 d1, #0                  // unweight_accumulator
+0:
+        VLD1.32 {d3[]}, [r3]!           // weight+unweight
+        VLD1.32 {d2[]}, [r1]!           // input
+        SUBS    r4, r4, #1
+        VMOVL.U16 q2, d3
+        VMLAL.U32 q0, d2, d4[1]         // unweight
+        BHI     0b
+
+        SUBS    r0, r0, #1
+        BLS     3f
+
 1:
-        VLD1.32 {d3[]}, [r3]!           // weights
+        LDRB    r4, [r2], #1            // weight_widths
+        VMOV.U8 d1, #0                  // unweight_accumulator
+2:
+        VLD1.32 {d3[]}, [r3]!           // weight+unweight
         VLD1.32 {d2[]}, [r1]!           // input
         SUBS    r4, r4, #1
         VMOVL.U16 q2, d3
         VMLAL.U32 q0, d4, d2
-        BHI     1b
+        BHI     2b
 
         VST1.64 {d0}, [r12]!
         SUBS    r0, r0, #1
         VMOV    d0, d1
-        BNE     0b
+        BNE     1b
+
+3:
+        // Final row only compute weight
+        LDRB    r4, [r2], #1            // weight_widths
+4:
+        VLD1.32 {d3[]}, [r3]!           // weight+unweight
+        VLD1.32 {d2[]}, [r1]!           // input
+        SUBS    r4, r4, #1
+        VMOVL.U16 q2, d3
+        VMLAL.U32 q0, d2, d4[0]         // weight
+        BHI     4b
+
+        VST1.64 {d0}, [r12]!
 
         POP     {r4,pc}
 
diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
index 840d7c9f3..0f2419897 100644
--- a/src/u32-filterbank-accumulate/aarch32-neon-x2.S
+++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S
@@ -32,33 +32,46 @@ BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
         LDR     r12, [sp]               // output
         PUSH    {r4,lr}                 // push 8 bytes
         VMOV.U8 d0, #0                  // weight_accumulator
+
+        // Compute unweight as initial weight
+        LDRB    r4, [r2], #1            // weight_widths
+        VMOV.U8 d1, #0                  // unweight_accumulator
 0:
+        VLD1.32 {d3[]}, [r3]!           // weight+unweight
+        VLD1.32 {d2[]}, [r1]!           // input
+        SUBS    r4, r4, #1
+        VMOVL.U16 q2, d3
+        VMLAL.U32 q0, d2, d4[1]         // unweight
+        BHI     0b
+
+1:
         LDRB    r4, [r2], #1            // weight_widths
         SUBS    r4, r4, #1
         VMOV.U8 d1, #0                  // unweight_accumulator
-        BLS     2f                      // less than 2 weights?
+        BLS     3f                      // less than 2 weights?
 
-1:
+2:
         VLD1.16 {d3}, [r3]!             // weights
         VLD1.32 {d2}, [r1]!             // input
         SUBS    r4, r4, #2
         VMOVL.U16 q2, d3
         VMLAL.U32 q0, d4, d2[0]
         VMLAL.U32 q0, d5, d2[1]
-        BHI     1b
+        BHI     2b
 
-        BLO     3f                      // is there a remainder?
-2:
+        BLO     4f                      // is there a remainder?
+3:
         VLD1.32 {d3[]}, [r3]!           // weights
         VLD1.32 {d2[]}, [r1]!           // input
         VMOVL.U16 q2, d3
         VMLAL.U32 q0, d4, d2
 
-3:
+4:
         VST1.64 {d0}, [r12]!
+
         SUBS    r0, r0, #1
         VMOV    d0, d1
-        BNE     0b
+        BNE     1b
 
         POP     {r4,pc}
 
diff --git a/src/u32-filterbank-accumulate/gen/neon-x1.c b/src/u32-filterbank-accumulate/gen/neon-x1.c
index 0a59096c4..1fa5ba7a1 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x1.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x1.c
@@ -32,8 +32,23 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x1(
 
   uint64x2_t weight_accumulator = vdupq_n_u64(0);
 
+
+  // Compute unweight as initial weight
+  size_t n = (size_t) *weight_widths++;
+  assert(n != 0);
+
+  do {
+    const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+    const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+    const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+    weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+  } while (--n != 0);
+
+  weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
   do {
-    size_t n = (size_t) *weight_widths++;
+    n = (size_t) *weight_widths++;
     assert(n != 0);
 
     do {
diff --git a/src/u32-filterbank-accumulate/gen/neon-x2.c b/src/u32-filterbank-accumulate/gen/neon-x2.c
index 743244a83..a88a1cdd4 100644
--- a/src/u32-filterbank-accumulate/gen/neon-x2.c
+++ b/src/u32-filterbank-accumulate/gen/neon-x2.c
@@ -32,8 +32,23 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x2(
 
   uint64x2_t weight_accumulator = vdupq_n_u64(0);
 
+
+  // Compute unweight as initial weight
+  size_t n = (size_t) *weight_widths++;
+  assert(n != 0);
+
+  do {
+    const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+    const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+    const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+    weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+  } while (--n != 0);
+
+  weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
   do {
-    size_t n = (size_t) *weight_widths++;
+    n = (size_t) *weight_widths++;
     assert(n != 0);
 
     for (;n >= 2; n -= 2) {
diff --git a/src/u32-filterbank-accumulate/gen/scalar-x1.c b/src/u32-filterbank-accumulate/gen/scalar-x1.c
index 8ec5c0771..cdffb619c 100644
--- a/src/u32-filterbank-accumulate/gen/scalar-x1.c
+++ b/src/u32-filterbank-accumulate/gen/scalar-x1.c
@@ -31,6 +31,20 @@ void xnn_u32_filterbank_accumulate_ukernel__scalar_x1(
   uint64_t weight_accumulator = 0;
   uint64_t unweight_accumulator = 0;
 
+  // compute unweight as initial weight
+  size_t n = (size_t) *weight_widths++;
+  assert(n != 0);
+  do {
+    const uint32_t vi = *input++;
+    const uint32_t vu = (uint32_t) weights[1];  // unweight
+    weights += 2;
+
+    const uint64_t vuacc = math_mulext_u32(vi, vu);
+
+    weight_accumulator += vuacc;
+
+  } while (--n != 0);
+
   do {
     size_t n = (size_t) *weight_widths++;
     assert(n != 0);
diff --git a/src/u32-filterbank-accumulate/neon.c.in b/src/u32-filterbank-accumulate/neon.c.in
index 77d88bea6..c45572f5a 100644
--- a/src/u32-filterbank-accumulate/neon.c.in
+++ b/src/u32-filterbank-accumulate/neon.c.in
@@ -28,6 +28,21 @@ void xnn_u32_filterbank_accumulate_ukernel__neon_x${BATCH_TILE}(
 
   uint64x2_t weight_accumulator = vdupq_n_u64(0);
 
+
+  // Compute unweight as initial weight
+  size_t n = (size_t) *weight_widths++;
+  assert(n != 0);
+
+  do {
+    const uint32x2_t vi = vld1_dup_u32(input); input += 1;
+    const uint16x4_t vw = vreinterpret_u16_u32(vld1_dup_u32((const void*) weights)); weights += 2;
+    const uint32x2_t vw32 = vget_low_u32(vmovl_u16(vw));
+
+    weight_accumulator = vmlal_u32(weight_accumulator, vw32, vi);
+  } while (--n != 0);
+
+  weight_accumulator = vcombine_u64(vget_high_u64(weight_accumulator), vdup_n_u64(0));
+
   do {
     size_t n = (size_t) *weight_widths++;
     assert(n != 0);
diff --git a/src/u32-filterbank-accumulate/scalar.c.in b/src/u32-filterbank-accumulate/scalar.c.in
index b9dc80c1a..903351a6a 100644
--- a/src/u32-filterbank-accumulate/scalar.c.in
+++ b/src/u32-filterbank-accumulate/scalar.c.in
@@ -28,6 +28,20 @@ void xnn_u32_filterbank_accumulate_ukernel__scalar_x${BATCH_TILE}(
   uint64_t weight_accumulator = 0;
   uint64_t unweight_accumulator = 0;
 
+  // compute unweight as initial weight
+  size_t n = (size_t) *weight_widths++;
+  assert(n != 0);
+  do {
+    const uint32_t vi = *input++;
+    const uint32_t vu = (uint32_t) weights[1];  // unweight
+    weights += 2;
+
+    const uint64_t vuacc = math_mulext_u32(vi, vu);
+
+    weight_accumulator += vuacc;
+
+  } while (--n != 0);
+
   do {
     size_t n = (size_t) *weight_widths++;
     assert(n != 0);
diff --git a/test/filterbank-accumulate-microkernel-tester.h b/test/filterbank-accumulate-microkernel-tester.h
index 61e427893..68bbd4a82 100644
--- a/test/filterbank-accumulate-microkernel-tester.h
+++ b/test/filterbank-accumulate-microkernel-tester.h
@@ -48,7 +48,7 @@ class FilterbankAccumulateMicrokernelTester {
     auto u16rng = std::bind(std::uniform_int_distribution<uint16_t>(), std::ref(rng));
     auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), std::ref(rng));
 
-    std::vector<uint8_t> filterbank_widths(rows());
+    std::vector<uint8_t> filterbank_widths(rows() + 1);
     std::vector<uint64_t> output(rows());
     std::vector<uint64_t> output_ref(rows());
 
@@ -65,14 +65,16 @@ class FilterbankAccumulateMicrokernelTester {
       uint64_t weight_accumulator = 0;
       uint64_t unweight_accumulator = 0;
       size_t i = 0;
-      for (size_t m = 0; m < rows(); m++) {
+      for (size_t m = 0; m <= rows(); m++) {
         const size_t weight_width = filterbank_widths[m];
         for (size_t n = 0; n < weight_width; n++) {
           weight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2]);
           unweight_accumulator += uint64_t(input[i]) * uint64_t(weights[i * 2 + 1]);
           i += 1;
         }
-        output_ref[m] = weight_accumulator;
+        if (m != 0) {
+          output_ref[m - 1] = weight_accumulator;
+        }
         weight_accumulator = unweight_accumulator;
         unweight_accumulator = 0;
       }
author	Frank Barchard <fbarchard@google.com>	2022-08-31 14:56:25 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-08-31 14:57:31 -0700
commit	d44af71e28bc26e2df6fb291977a1879a642f078 (patch)
tree	548d2cf061388b7a4ceb36b7a248e83df5cda9dc
parent	b56380ce0a10dd528507198c3f720c4b85f020e3 (diff)
download	XNNPACK-d44af71e28bc26e2df6fb291977a1879a642f078.tar.gz