aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: George Steed <george.steed@arm.com> 2024-03-12 22:41:52 +0000
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2024-03-14 20:04:46 +0000
commit: 5d694bec38c39b03f9eb4339fb0e21f32dbba9bd (patch)
tree: 06c6224e728ad02591bc2749bdafffccd91484d6
parent: 8d0d885c2f4bc2479949b3e182ca9f649806b15b (diff)
download: libyuv-5d694bec38c39b03f9eb4339fb0e21f32dbba9bd.tar.gz
[AArch64] Replace UQSHRN{,2} pair by UZP2 in YUVTORGB
The existing Neon code makes use of a pair of UQSHRN and UQSHRN2 instructions to extract the top half of a widened multiply result. These instructions would ordinarily saturate; however, saturation can never happen in this case: because we are shifting by 16 to get the top half of each element, the top bits remain as-is. We could move this to using a slightly simpler non-saturating shift, but in this case it is simpler and faster to just use UZP2 to extract the top half of each 32-bit lane directly.

Reduction in runtime for selected kernels:

Kernel                  | Cortex-A55 | Cortex-A76 | Cortex-X2
I400ToARGBRow_NEON      | -9.4%      | -14.9%     | -13.9%
I422AlphaToARGBRow_NEON | -7.9%      | -11.4%     | -11.5%
I422ToARGB1555Row_NEON  | -7.3%      | -17.2%     | -14.7%
I422ToARGB4444Row_NEON  | -7.6%      | -17.9%     | -13.7%
I422ToARGBRow_NEON      | -8.2%      | -9.8%      | -11.9%
I422ToRGB24Row_NEON     | -8.0%      | -13.3%     | -12.8%
I422ToRGB565Row_NEON    | -7.5%      | -15.1%     | -14.6%
I422ToRGBARow_NEON      | -8.3%      | -13.1%     | -12.2%
I444AlphaToARGBRow_NEON | -8.3%      | -7.6%      | -12.7%
I444ToARGBRow_NEON      | -8.6%      | -3.5%      | -13.5%
I444ToRGB24Row_NEON     | -8.5%      | -7.8%      | -13.4%
NV12ToARGBRow_NEON      | -8.8%      | -1.4%      | -12.0%
NV12ToRGB24Row_NEON     | -8.5%      | -11.5%     | -12.3%
NV12ToRGB565Row_NEON    | -7.9%      | -15.0%     | -15.7%
NV21ToARGBRow_NEON      | -8.7%      | -1.6%      | -12.3%
NV21ToRGB24Row_NEON     | -8.4%      | -11.5%     | -12.0%
UYVYToARGBRow_NEON      | -8.8%      | -8.9%      | -11.9%
YUY2ToARGBRow_NEON      | -8.7%      | -10.8%     | -13.3%

Bug: libyuv:976
Change-Id: I6c505fe722e5f91f93718b85fe881ad056d8602d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5366653
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- source/row_neon64.cc | 3
1 files changed, 1 insertions, 2 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index d37b8dad..ac22a198 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -101,8 +101,7 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
"umull v6.8h, v1.8b, v30.8b \n" \
"umull v0.4s, v0.4h, v24.4h \n" \
"umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
- "uqshrn v0.4h, v0.4s, #16 \n" \
- "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uzp2 v0.8h, v0.8h, v3.8h \n" /* Y */ \
"umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
"umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
"add v17.8h, v0.8h, v26.8h \n" /* G */ \