[AArch64] Use LD1/ST1 rather than LD4/ST4 in ARGBMultiplyRow_NEON

There is no need to de-interleave channels here since we are applying the same operation across all lanes. LD4 and ST4 are known to be significantly slower than LD1/ST1 on some micro-architectures, so we should prefer to avoid them where possible.

Reduction in runtimes observed for ARGBMultiplyRow_NEON:

    Cortex-A55:  -22.3%
    Cortex-A510: -56.6%
    Cortex-A76:  -45.5%
    Cortex-X2:   -54.6%

Change-Id: I9103111a109a4d87d358e06eb513746314aaf66a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5454832
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
author: George Steed <george.steed@arm.com> 2024-03-24 07:01:15 +0000
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2024-04-16 07:28:56 +0000
commit: ea5646030053a179071379a77d5f3833676f2a11 (patch)
tree: 75cede318c4f0fa552624995c52567239ad94bc5
parent: 7266cda79ce20da1d5ad8da81943c9c6b167456c (diff)
download: libyuv-ea5646030053a179071379a77d5f3833676f2a11.tar.gz
1 files changed, 3 insertions, 3 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index d3482799..8dd9af7e 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3779,8 +3779,8 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
   asm volatile(
       // 8 pixel loop.
       "1:                                        \n"
-      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
-      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "ld1         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld1         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
       "umull       v0.8h, v0.8b, v4.8b           \n"  // multiply B
       "prfm        pldl1keep, [%0, 448]          \n"
@@ -3792,7 +3792,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
       "rshrn       v1.8b, v1.8h, #8              \n"  // 16 bit to 8 bit G
       "rshrn       v2.8b, v2.8h, #8              \n"  // 16 bit to 8 bit R
       "rshrn       v3.8b, v3.8h, #8              \n"  // 16 bit to 8 bit A
-      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "st1         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
       "b.gt        1b                            \n"
       : "+r"(src_argb),   // %0
         "+r"(src_argb1),  // %1
author	George Steed <george.steed@arm.com>	2024-03-24 07:01:15 +0000
committer	libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>	2024-04-16 07:28:56 +0000
commit	ea5646030053a179071379a77d5f3833676f2a11 (patch)
tree	75cede318c4f0fa552624995c52567239ad94bc5
parent	7266cda79ce20da1d5ad8da81943c9c6b167456c (diff)
download	libyuv-ea5646030053a179071379a77d5f3833676f2a11.tar.gz