about summary refs log tree commit diff
diff options
context:
space:
mode:
authorJonathan Wright <jonathan.wright@arm.com>2020-07-01 15:23:50 +0100
committerJonathan Wright <jonathan.wright@arm.com>2020-08-10 15:58:28 +0100
commit2fa3dfece3033b3c00c613e39a0753936c4abecf (patch)
tree9b8f6f202eb7f2a02cab76aed0c8e1c51da6a5b9
parentc7fb4c5dcfb72be3edac5941ee4632c8a69cf8a4 (diff)
downloadlibjpeg-turbo-2fa3dfece3033b3c00c613e39a0753936c4abecf.tar.gz
Implement RGB->YCbCr using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of RGB -> YCbCr color conversion. Removes the NEON assembly implementation for both AArch32 and AArch64. Bug: 922430 Change-Id: I83f63fb12481f4d7f9bd84ba1430e841faaf9c75
-rw-r--r--  BUILD.gn                      |   2
-rw-r--r--  README.chromium               |   1
-rw-r--r--  simd/arm/arm/jccolext-neon.c  | 145
-rw-r--r--  simd/arm/arm/jsimd_neon.S     | 310
-rw-r--r--  simd/arm/arm64/jccolext-neon.c| 312
-rw-r--r--  simd/arm/arm64/jsimd.c        |  30
-rw-r--r--  simd/arm/arm64/jsimd_neon.S   | 319
-rw-r--r--  simd/arm/common/jccolor-neon.c| 156
-rw-r--r--  simd/jsimd.h                  |   7
9 files changed, 620 insertions(+), 662 deletions(-)
diff --git a/BUILD.gn b/BUILD.gn
index 940bf9e8..b3b64706 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -155,6 +155,7 @@ static_library("simd") {
sources = [
"simd/arm/arm/jsimd.c",
"simd/arm/arm/jsimd_neon.S",
+ "simd/arm/common/jccolor-neon.c",
"simd/arm/common/jcsample-neon.c",
"simd/arm/common/jdcolor-neon.c",
"simd/arm/common/jdmerge-neon.c",
@@ -169,6 +170,7 @@ static_library("simd") {
sources = [
"simd/arm/arm64/jsimd.c",
"simd/arm/arm64/jsimd_neon.S",
+ "simd/arm/common/jccolor-neon.c",
"simd/arm/common/jcsample-neon.c",
"simd/arm/common/jdcolor-neon.c",
"simd/arm/common/jdmerge-neon.c",
diff --git a/README.chromium b/README.chromium
index 3f1363d1..fa4d87ba 100644
--- a/README.chromium
+++ b/README.chromium
@@ -73,6 +73,7 @@ following changes which are not merged to upstream:
- Implement fast IDCT using Arm NEON intrinsics
- Add Arm NEON implementation of h2v1_downsample
- Add Arm NEON implementation of h2v2_downsample
+ - Implement RGB->YCbCr using Arm NEON intrinsics
* Patches to enable running the upstream unit tests through gtest.
The upstream unit tests are defined here under the section 'TESTS':
https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/CMakeLists.txt
diff --git a/simd/arm/arm/jccolext-neon.c b/simd/arm/arm/jccolext-neon.c
new file mode 100644
index 00000000..1e631b71
--- /dev/null
+++ b/simd/arm/arm/jccolext-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jccolext-neon.c - colorspace conversion (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+/*
+ * RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * To ensure rounding gives correct values, we add 0.5 to Cb and Cr.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
+ JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data. */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb and Cr output data. */
+ JSAMPROW outptr0, outptr1, outptr2;
+
+ /* Setup conversion constants. */
+#if defined(__clang__)
+ const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+ const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+ const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+ const uint16x4x2_t consts = { consts1, consts2 };
+#endif
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 8) {
+
+ /* To prevent buffer overread by the vector load instructions, the */
+ /* last (image_width % 8) columns of data are first memcopied to a */
+ /* temporary buffer large enough to accommodate the vector load. */
+ if (cols_remaining < 8) {
+ uint8_t __attribute__((aligned(8))) tmp_buf[8 * RGB_PIXELSIZE];
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+ uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_low = scaled_128_5;
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+ cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+ uint32x4_t cb_high = scaled_128_5;
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+ cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_low = scaled_128_5;
+ cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+ uint32x4_t cr_high = scaled_128_5;
+ cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+ vrshrn_n_u32(y_high, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+ vshrn_n_u32(cb_high, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+ vshrn_n_u32(cr_high, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+ /* Increment pointers. */
+ inptr += (8 * RGB_PIXELSIZE);
+ outptr0 += 8;
+ outptr1 += 8;
+ outptr2 += 8;
+ }
+ }
+}
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index c3797736..2aac28be 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -65,316 +65,6 @@ _\fname:
/*****************************************************************************/
/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro do_store size
- .if \size == 8
- vst1.8 {d20}, [Y]!
- vst1.8 {d21}, [U]!
- vst1.8 {d22}, [V]!
- .elseif \size == 4
- vst1.8 {d20[0]}, [Y]!
- vst1.8 {d20[1]}, [Y]!
- vst1.8 {d20[2]}, [Y]!
- vst1.8 {d20[3]}, [Y]!
- vst1.8 {d21[0]}, [U]!
- vst1.8 {d21[1]}, [U]!
- vst1.8 {d21[2]}, [U]!
- vst1.8 {d21[3]}, [U]!
- vst1.8 {d22[0]}, [V]!
- vst1.8 {d22[1]}, [V]!
- vst1.8 {d22[2]}, [V]!
- vst1.8 {d22[3]}, [V]!
- .elseif \size == 2
- vst1.8 {d20[4]}, [Y]!
- vst1.8 {d20[5]}, [Y]!
- vst1.8 {d21[4]}, [U]!
- vst1.8 {d21[5]}, [U]!
- vst1.8 {d22[4]}, [V]!
- vst1.8 {d22[5]}, [V]!
- .elseif \size == 1
- vst1.8 {d20[6]}, [Y]!
- vst1.8 {d21[6]}, [U]!
- vst1.8 {d22[6]}, [V]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_load bpp, size
- .if \bpp == 24
- .if \size == 8
- vld3.8 {d10, d11, d12}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
- vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
- vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
- vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
- .elseif \size == 2
- vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
- vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
- .elseif \size == 1
- vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- vld4.8 {d10, d11, d12, d13}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
- .elseif \size == 2
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
- .elseif \size == 1
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vrev64.32 q9, q1
- vrev64.32 q13, q1
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.macro do_rgb_to_yuv_stage2
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vshrn.u32 d23, q13, #16
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- vmovn.u16 d20, q10 /* d20 = y */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovn.u16 d22, q12 /* d22 = v */
-.endm
-
-.macro do_rgb_to_yuv
- do_rgb_to_yuv_stage1
- do_rgb_to_yuv_stage2
-.endm
-
-.macro do_rgb_to_yuv_stage2_store_load_stage1
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vrev64.32 q9, q1
- vshrn.u32 d23, q13, #16
- vrev64.32 q13, q1
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- do_load \bpp, 8
- vmovn.u16 d20, q10 /* d20 = y */
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovn.u16 d22, q12 /* d22 = v */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vst1.8 {d20}, [Y]!
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vst1.8 {d21}, [U]!
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vst1.8 {d22}, [V]!
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.balign 16
-jsimd_\colorid\()_ycc_neon_consts:
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
-asm_function jsimd_\colorid\()_ycc_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- OUTPUT_BUF .req r2
- OUTPUT_ROW .req r3
- NUM_ROWS .req r4
-
- OUTPUT_BUF0 .req r5
- OUTPUT_BUF1 .req r6
- OUTPUT_BUF2 .req OUTPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d0, d1, d2, d3 */
- adr ip, jsimd_\colorid\()_ycc_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr OUTPUT_BUF0, [OUTPUT_BUF]
- ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
- ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
- .unreq OUTPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
- ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
- add OUTPUT_ROW, OUTPUT_ROW, #1
- ldr RGB, [INPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load \bpp, 8
- do_rgb_to_yuv_stage1
- subs N, N, #8
- blt 2f
-1:
- do_rgb_to_yuv_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_rgb_to_yuv_stage2
- do_store 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load \bpp, 4
-3:
- tst N, #2
- beq 4f
- do_load \bpp, 2
-4:
- tst N, #1
- beq 5f
- do_load \bpp, 1
-5:
- do_rgb_to_yuv
- tst N, #4
- beq 6f
- do_store 4
-6:
- tst N, #2
- beq 7f
- do_store 2
-7:
- tst N, #1
- beq 8f
- do_store 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq OUTPUT_ROW
- .unreq INPUT_BUF
- .unreq NUM_ROWS
- .unreq OUTPUT_BUF0
- .unreq OUTPUT_BUF1
- .unreq OUTPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
* Load data into workspace, applying unsigned->signed conversion
*
* TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
diff --git a/simd/arm/arm64/jccolext-neon.c b/simd/arm/arm64/jccolext-neon.c
new file mode 100644
index 00000000..5c642fbc
--- /dev/null
+++ b/simd/arm/arm64/jccolext-neon.c
@@ -0,0 +1,312 @@
+/*
+ * jccolext-neon.c - colorspace conversion (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+/*
+ * RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * To ensure rounding gives correct values, we add 0.5 to Cb and Cr.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
+ JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data. */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb and Cr output data. */
+ JSAMPROW outptr0, outptr1, outptr2;
+
+ /* Setup conversion constants. */
+ const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0);
+ y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1);
+ y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0);
+ y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1);
+ y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3);
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4);
+ cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3);
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4);
+ cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr0 += 16;
+ outptr1 += 16;
+ outptr2 += 16;
+ }
+
+ if (cols_remaining > 8) {
+ /* To prevent buffer overread by the vector load instructions, the */
+ /* last (image_width % 16) columns of data are first memcopied to a */
+ /* temporary buffer large enough to accommodate the vector load. */
+ uint8_t __attribute__((aligned(16))) tmp_buf[16 * RGB_PIXELSIZE];
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0);
+ y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1);
+ y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0);
+ y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1);
+ y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3);
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4);
+ cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3);
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4);
+ cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ } else if (cols_remaining > 0) {
+ /* To prevent buffer overread by the vector load instructions, the */
+ /* last (image_width % 8) columns of data are first memcopied to a */
+ /* temporary buffer large enough to accommodate the vector load. */
+ uint8_t __attribute__((aligned(8))) tmp_buf[8 * RGB_PIXELSIZE];
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+ uint32x4_t y_h = vmull_high_laneq_u16(r, consts, 0);
+ y_h = vmlal_high_laneq_u16(y_h, g, consts, 1);
+ y_h = vmlal_high_laneq_u16(y_h, b, consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_l = scaled_128_5;
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+ cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+ uint32x4_t cb_h = scaled_128_5;
+ cb_h = vmlsl_high_laneq_u16(cb_h, r, consts, 3);
+ cb_h = vmlsl_high_laneq_u16(cb_h, g, consts, 4);
+ cb_h = vmlal_high_laneq_u16(cb_h, b, consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_l = scaled_128_5;
+ cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+ uint32x4_t cr_h = scaled_128_5;
+ cr_h = vmlal_high_laneq_u16(cr_h, r, consts, 5);
+ cr_h = vmlsl_high_laneq_u16(cr_h, g, consts, 6);
+ cr_h = vmlsl_high_laneq_u16(cr_h, b, consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+ vrshrn_n_u32(y_h, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+ vshrn_n_u32(cb_h, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+ vshrn_n_u32(cr_h, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+ }
+ }
+}
diff --git a/simd/arm/arm64/jsimd.c b/simd/arm/arm64/jsimd.c
index c9da49f3..8cc8f699 100644
--- a/simd/arm/arm64/jsimd.c
+++ b/simd/arm/arm64/jsimd.c
@@ -27,14 +27,11 @@
#include <string.h>
#include <ctype.h>
-#define JSIMD_FASTLD3 1
-#define JSIMD_FASTST3 2
#define JSIMD_FASTTBL 4
static unsigned int simd_support = ~0;
static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
- JSIMD_FASTTBL;
+static unsigned int simd_features = JSIMD_FASTTBL;
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
@@ -154,16 +151,6 @@ init_simd(void)
env = getenv("JSIMD_NOHUFFENC");
if ((env != NULL) && (strcmp(env, "1") == 0))
simd_huffman = 0;
- env = getenv("JSIMD_FASTLD3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTLD3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTLD3;
- env = getenv("JSIMD_FASTST3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTST3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTST3;
#endif
}
@@ -237,20 +224,14 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
- if (simd_features & JSIMD_FASTLD3)
- neonfct = jsimd_extrgb_ycc_convert_neon;
- else
- neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+ neonfct = jsimd_extrgb_ycc_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_extrgbx_ycc_convert_neon;
break;
case JCS_EXT_BGR:
- if (simd_features & JSIMD_FASTLD3)
- neonfct = jsimd_extbgr_ycc_convert_neon;
- else
- neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+ neonfct = jsimd_extbgr_ycc_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
@@ -265,10 +246,7 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
neonfct = jsimd_extxrgb_ycc_convert_neon;
break;
default:
- if (simd_features & JSIMD_FASTLD3)
- neonfct = jsimd_extrgb_ycc_convert_neon;
- else
- neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+ neonfct = jsimd_extrgb_ycc_convert_neon;
break;
}
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 16614d15..7c134457 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -39,15 +39,6 @@
.section .rodata, "a", %progbits
#endif
-/* Constants for jsimd_*_ycc_neon() */
-
-.balign 16
-Ljsimd_rgb_ycc_neon_consts:
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
/* Constants for jsimd_fdct_islow_neon() */
#define F_0_298 2446 /* FIX(0.298631336) */
@@ -205,316 +196,6 @@ _\fname:
/*****************************************************************************/
/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro do_store size
- .if \size == 8
- st1 {v20.8b}, [Y], #8
- st1 {v21.8b}, [U], #8
- st1 {v22.8b}, [V], #8
- .elseif \size == 4
- st1 {v20.b}[0], [Y], #1
- st1 {v20.b}[1], [Y], #1
- st1 {v20.b}[2], [Y], #1
- st1 {v20.b}[3], [Y], #1
- st1 {v21.b}[0], [U], #1
- st1 {v21.b}[1], [U], #1
- st1 {v21.b}[2], [U], #1
- st1 {v21.b}[3], [U], #1
- st1 {v22.b}[0], [V], #1
- st1 {v22.b}[1], [V], #1
- st1 {v22.b}[2], [V], #1
- st1 {v22.b}[3], [V], #1
- .elseif \size == 2
- st1 {v20.b}[4], [Y], #1
- st1 {v20.b}[5], [Y], #1
- st1 {v21.b}[4], [U], #1
- st1 {v21.b}[5], [U], #1
- st1 {v22.b}[4], [V], #1
- st1 {v22.b}[5], [V], #1
- .elseif \size == 1
- st1 {v20.b}[6], [Y], #1
- st1 {v21.b}[6], [U], #1
- st1 {v22.b}[6], [V], #1
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_load bpp, size, fast_ld3
- .if \bpp == 24
- .if \size == 8
- .if \fast_ld3 == 1
- ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
- .else
- ld1 {v10.b}[0], [RGB], #1
- ld1 {v11.b}[0], [RGB], #1
- ld1 {v12.b}[0], [RGB], #1
-
- ld1 {v10.b}[1], [RGB], #1
- ld1 {v11.b}[1], [RGB], #1
- ld1 {v12.b}[1], [RGB], #1
-
- ld1 {v10.b}[2], [RGB], #1
- ld1 {v11.b}[2], [RGB], #1
- ld1 {v12.b}[2], [RGB], #1
-
- ld1 {v10.b}[3], [RGB], #1
- ld1 {v11.b}[3], [RGB], #1
- ld1 {v12.b}[3], [RGB], #1
-
- ld1 {v10.b}[4], [RGB], #1
- ld1 {v11.b}[4], [RGB], #1
- ld1 {v12.b}[4], [RGB], #1
-
- ld1 {v10.b}[5], [RGB], #1
- ld1 {v11.b}[5], [RGB], #1
- ld1 {v12.b}[5], [RGB], #1
-
- ld1 {v10.b}[6], [RGB], #1
- ld1 {v11.b}[6], [RGB], #1
- ld1 {v12.b}[6], [RGB], #1
-
- ld1 {v10.b}[7], [RGB], #1
- ld1 {v11.b}[7], [RGB], #1
- ld1 {v12.b}[7], [RGB], #1
- .endif
- prfm pldl1keep, [RGB, #128]
- .elseif \size == 4
- ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
- .elseif \size == 2
- ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
- .elseif \size == 1
- ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
- prfm pldl1keep, [RGB, #128]
- .elseif \size == 4
- ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
- .elseif \size == 2
- ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
- .elseif \size == 1
- ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
- b_offs, fast_ld3
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
- ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */
- ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */
- ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */
- rev64 v18.4s, v1.4s
- rev64 v26.4s, v1.4s
- rev64 v28.4s, v1.4s
- rev64 v30.4s, v1.4s
- umull v14.4s, v4.4h, v0.h[0]
- umull2 v16.4s, v4.8h, v0.h[0]
- umlsl v18.4s, v4.4h, v0.h[3]
- umlsl2 v26.4s, v4.8h, v0.h[3]
- umlal v28.4s, v4.4h, v0.h[5]
- umlal2 v30.4s, v4.8h, v0.h[5]
- umlal v14.4s, v6.4h, v0.h[1]
- umlal2 v16.4s, v6.8h, v0.h[1]
- umlsl v18.4s, v6.4h, v0.h[4]
- umlsl2 v26.4s, v6.8h, v0.h[4]
- umlsl v28.4s, v6.4h, v0.h[6]
- umlsl2 v30.4s, v6.8h, v0.h[6]
- umlal v14.4s, v8.4h, v0.h[2]
- umlal2 v16.4s, v8.8h, v0.h[2]
- umlal v18.4s, v8.4h, v0.h[5]
- umlal2 v26.4s, v8.8h, v0.h[5]
- umlsl v28.4s, v8.4h, v0.h[7]
- umlsl2 v30.4s, v8.8h, v0.h[7]
-.endm
-
-.macro do_rgb_to_yuv_stage2
- rshrn v20.4h, v14.4s, #16
- shrn v22.4h, v18.4s, #16
- shrn v24.4h, v28.4s, #16
- rshrn2 v20.8h, v16.4s, #16
- shrn2 v22.8h, v26.4s, #16
- shrn2 v24.8h, v30.4s, #16
- xtn v20.8b, v20.8h /* v20 = y */
- xtn v21.8b, v22.8h /* v21 = u */
- xtn v22.8b, v24.8h /* v22 = v */
-.endm
-
-.macro do_rgb_to_yuv
- do_rgb_to_yuv_stage1
- do_rgb_to_yuv_stage2
-.endm
-
-/* TODO: expand macros and interleave instructions if some in-order
- * ARM64 processor actually can dual-issue LOAD/STORE with ALU */
-.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
- do_rgb_to_yuv_stage2
- do_load \bpp, 8, \fast_ld3
- st1 {v20.8b}, [Y], #8
- st1 {v21.8b}, [U], #8
- st1 {v22.8b}, [V], #8
- do_rgb_to_yuv_stage1
-.endm
-
-.if \fast_ld3 == 1
-asm_function jsimd_\colorid\()_ycc_convert_neon
-.else
-asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
-.endif
- OUTPUT_WIDTH .req w0
- INPUT_BUF .req x1
- OUTPUT_BUF .req x2
- OUTPUT_ROW .req w3
- NUM_ROWS .req w4
-
- OUTPUT_BUF0 .req x5
- OUTPUT_BUF1 .req x6
- OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */
-
- RGB .req x7
- Y .req x9
- U .req x10
- V .req x11
- N .req w12
-
- /* Load constants to d0, d1, d2, d3 */
- get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
- ld1 {v0.8h, v1.8h}, [x13]
-
- ldr OUTPUT_BUF0, [OUTPUT_BUF]
- ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]
- ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
- .unreq OUTPUT_BUF
-
- /* Save NEON registers */
- sub sp, sp, #64
- mov x9, sp
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- b.lt 9f
-0:
- ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
- ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
- mov N, OUTPUT_WIDTH
- ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
- add OUTPUT_ROW, OUTPUT_ROW, #1
- ldr RGB, [INPUT_BUF], #8
-
- /* Inner loop over pixels */
- subs N, N, #8
- b.lt 3f
- do_load \bpp, 8, \fast_ld3
- do_rgb_to_yuv_stage1
- subs N, N, #8
- b.lt 2f
-1:
- do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
- subs N, N, #8
- b.ge 1b
-2:
- do_rgb_to_yuv_stage2
- do_store 8
- tst N, #7
- b.eq 8f
-3:
- tbz N, #2, 3f
- do_load \bpp, 4, \fast_ld3
-3:
- tbz N, #1, 4f
- do_load \bpp, 2, \fast_ld3
-4:
- tbz N, #0, 5f
- do_load \bpp, 1, \fast_ld3
-5:
- do_rgb_to_yuv
- tbz N, #2, 6f
- do_store 4
-6:
- tbz N, #1, 7f
- do_store 2
-7:
- tbz N, #0, 8f
- do_store 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- b.gt 0b
-9:
- /* Restore all registers and return */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- br x30
-
- .unreq OUTPUT_WIDTH
- .unreq OUTPUT_ROW
- .unreq INPUT_BUF
- .unreq NUM_ROWS
- .unreq OUTPUT_BUF0
- .unreq OUTPUT_BUF1
- .unreq OUTPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B Fast LD3 */
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
-
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
* Load data into workspace, applying unsigned->signed conversion
*
* TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
diff --git a/simd/arm/common/jccolor-neon.c b/simd/arm/common/jccolor-neon.c
new file mode 100644
index 00000000..2ec1636c
--- /dev/null
+++ b/simd/arm/common/jccolor-neon.c
@@ -0,0 +1,156 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/* RGB -> YCbCr conversion constants. */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+#define F_0_168 11059
+#define F_0_331 21709
+#define F_0_500 32768
+#define F_0_418 27439
+#define F_0_081 5329
+
+const static uint16_t jsimd_rgb_ycc_neon_consts[] = { F_0_298, F_0_587,
+ F_0_113, F_0_168,
+ F_0_331, F_0_500,
+ F_0_418, F_0_081
+ };
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 2bbd0642..50a3c5d8 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -121,13 +121,6 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_neon
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
EXTERN(void) jsimd_rgb_ycc_convert_dspr2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);