diff options
author | George Steed <george.steed@arm.com> | 2024-03-24 07:01:15 +0000 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2024-04-16 07:28:56 +0000 |
commit | ea5646030053a179071379a77d5f3833676f2a11 (patch) | |
tree | 75cede318c4f0fa552624995c52567239ad94bc5 | |
parent | 7266cda79ce20da1d5ad8da81943c9c6b167456c (diff) | |
download | libyuv-ea5646030053a179071379a77d5f3833676f2a11.tar.gz |
[AArch64] Use LD1/ST1 rather than LD4/ST4 in ARGBMultiplyRow_NEON
There is no need to de-interleave channels here since we are applying
the same operation across all lanes. LD4 and ST4 are known to be
significantly slower than LD1/ST1 on some micro-architectures so we
should prefer to avoid them where possible.
Reduction in runtimes observed for ARGBMultiplyRow_NEON:
Cortex-A55: -22.3%
Cortex-A510: -56.6%
Cortex-A76: -45.5%
Cortex-X2: -54.6%
Change-Id: I9103111a109a4d87d358e06eb513746314aaf66a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5454832
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | source/row_neon64.cc | 6 |
1 file changed, 3 insertions, 3 deletions
```diff
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index d3482799..8dd9af7e 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3779,8 +3779,8 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
   asm volatile(
       // 8 pixel loop.
       "1: \n"
-      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
-      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+      "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+      "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
       "subs %w3, %w3, #8 \n" // 8 processed per loop.
       "umull v0.8h, v0.8b, v4.8b \n" // multiply B
       "prfm pldl1keep, [%0, 448] \n"
@@ -3792,7 +3792,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
       "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
       "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
       "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
-      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+      "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
       "b.gt 1b \n"
       : "+r"(src_argb), // %0
         "+r"(src_argb1), // %1
```