diff options
author | Android Chromium Automerger <chromium-automerger@android> | 2014-07-02 09:52:58 +0000 |
---|---|---|
committer | Android Chromium Automerger <chromium-automerger@android> | 2014-07-02 09:52:58 +0000 |
commit | 448d48d2537f3b269c15656acb1ad4a35d4891f2 (patch) | |
tree | 30d3d08306c1c333688946a83f1b7d43b09a4a2a | |
parent | ee5df9a9b13628a72d99e479dd762c7317c6fc56 (diff) | |
parent | 1347fdea0715f8903ab37e07976861f56de17210 (diff) | |
download | libyuv-448d48d2537f3b269c15656acb1ad4a35d4891f2.tar.gz |
Merge third_party/libyuv from https://chromium.googlesource.com/external/libyuv.git at 1347fdea0715f8903ab37e07976861f56de17210
This commit was generated by merge_from_chromium.py.
Change-Id: Id37bf4ca99264d10ede61d172cf0662464723d8e
-rw-r--r-- | DEPS | 2 | ||||
-rw-r--r-- | README.chromium | 2 | ||||
-rw-r--r-- | include/libyuv/convert_from_argb.h | 18 | ||||
-rw-r--r-- | include/libyuv/row.h | 33 | ||||
-rw-r--r-- | include/libyuv/version.h | 2 | ||||
-rw-r--r-- | libyuv_test.gyp | 14 | ||||
-rw-r--r-- | source/compare_neon.cc | 10 | ||||
-rw-r--r-- | source/convert_argb.cc | 22 | ||||
-rw-r--r-- | source/planar_functions.cc | 4 | ||||
-rw-r--r-- | source/rotate_mips.cc | 9 | ||||
-rw-r--r-- | source/rotate_neon.cc | 129 | ||||
-rw-r--r-- | source/row_any.cc | 8 | ||||
-rw-r--r-- | source/row_mips.cc | 7 | ||||
-rw-r--r-- | source/row_neon.cc | 308 | ||||
-rw-r--r-- | source/row_win.cc | 224 | ||||
-rw-r--r-- | source/scale_mips.cc | 3 | ||||
-rw-r--r-- | source/scale_neon.cc | 129 | ||||
-rw-r--r-- | unit_test/convert_test.cc | 57 | ||||
-rw-r--r-- | util/Makefile | 12 | ||||
-rw-r--r-- | util/psnr.cc | 21 | ||||
-rw-r--r-- | util/psnr.h | 14 | ||||
-rw-r--r-- | util/psnr_main.cc | 96 | ||||
-rw-r--r-- | util/ssim.cc | 1 | ||||
-rw-r--r-- | util/ssim.h | 3 |
24 files changed, 983 insertions, 145 deletions
@@ -14,7 +14,7 @@ vars = { "chromium_trunk" : "http://src.chromium.org/svn/trunk", # chrome://version/ for revision of canary Chrome. # http://chromium-status.appspot.com/lkgr is a last known good revision. - "chromium_revision": "262938", + "chromium_revision": "274825", } # NOTE: Prefer revision numbers to tags for svn deps. Use http rather than diff --git a/README.chromium b/README.chromium index 78d39a6..9657262 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1007 +Version: 1025 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index f0343a7..90f43af 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -25,24 +25,22 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Convert ARGB To BGRA. (alias) -#define ARGBToBGRA BGRAToARGB +// Convert ARGB To BGRA. LIBYUV_API -int BGRAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, +int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, int width, int height); -// Convert ARGB To ABGR. (alias) -#define ARGBToABGR ABGRToARGB +// Convert ARGB To ABGR. LIBYUV_API -int ABGRToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, +int ARGBToABGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, int width, int height); // Convert ARGB To RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, int width, int height); // Convert ARGB To RGB24. 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7a54e4a..e99c441 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -15,6 +15,10 @@ #include "libyuv/basic_types.h" +#if defined(__native_client__) +#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE +#endif + #ifdef __cplusplus namespace libyuv { extern "C" { @@ -47,7 +51,12 @@ extern "C" { #endif // Enable for NaCL pepper 33 for bundle and AVX2 support. -// #define NEW_BINUTILS +#if defined(__native_client__) && PPAPI_RELEASE >= 33 +#define NEW_BINUTILS +#endif +#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37 +#define LIBYUV_DISABLE_NEON +#endif // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ @@ -152,6 +161,11 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #endif +// The following are available on x64 Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) +#define HAS_I422TOARGBROW_SSSE3 +#endif + // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) @@ -237,8 +251,7 @@ extern "C" { // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \ - !defined(__native_client__) + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_ARGB1555TOARGBROW_NEON @@ -331,7 +344,8 @@ extern "C" { #endif // The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) #define HAS_I422TOABGRROW_MIPS_DSPR2 @@ -427,7 +441,7 @@ typedef uint8 uvec8[16]; "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ #opcode " (%%r15,%%r14),%" #arg "\n" \ BUNDLEUNLOCK -#else 
+#else // defined(__native_client__) && defined(__x86_64__) #define BUNDLEALIGN "\n" #define MEMACCESS(base) "(%" #base ")" #define MEMACCESS2(offset, base) #offset "(%" #base ")" @@ -444,6 +458,15 @@ typedef uint8 uvec8[16]; #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" #define MEMOPARG(opcode, offset, base, index, scale, arg) \ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#endif // defined(__native_client__) && defined(__x86_64__) + +#if defined(__arm__) +#undef MEMACCESS +#if defined(__native_client__) +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" +#else +#define MEMACCESS(base) "\n" +#endif #endif void I444ToARGBRow_NEON(const uint8* src_y, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 16faa9b..1663926 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1007 +#define LIBYUV_VERSION 1025 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/libyuv_test.gyp b/libyuv_test.gyp index e7e9e76..ab04cde 100644 --- a/libyuv_test.gyp +++ b/libyuv_test.gyp @@ -56,6 +56,12 @@ 'LIBYUV_DISABLE_NEON' ], }], + [ 'OS == "ios"', { + 'xcode_settings': { + 'DEBUGGING_SYMBOLS': 'YES', + 'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym', + }, + }], [ 'OS != "ios"', { 'defines': [ 'HAVE_JPEG', @@ -116,7 +122,15 @@ 'LIBYUV_DISABLE_NEON' ], }], + [ 'OS != "ios"', { + 'defines': [ + 'HAVE_JPEG', + ], + }], ], # conditions + 'dependencies': [ + 'libyuv.gyp:libyuv', + ], }, { 'target_name': 'cpuid', diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 77f42f4..5e7b8e4 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -9,6 +9,7 @@ */ #include "libyuv/basic_types.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -27,14 +28,9 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { 
".p2align 2 \n" "1: \n" - // TODO(fbarchard): Define a macro for clearing address bits for NaCL. -#if defined(__native_client__) - "bic %0, #0xc0000000 \n" -#endif + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" -#if defined(__native_client__) - "bic %1, #0xc0000000 \n" -#endif + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" "subs %2, %2, #16 \n" "vsubl.u8 q2, d0, d2 \n" diff --git a/source/convert_argb.cc b/source/convert_argb.cc index a8aab91..5f97f64 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -372,6 +372,17 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, width, height); } +// Convert ARGB to BGRA (same as BGRAToARGB). +LIBYUV_API +int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskBGRAToARGB), + width, height); +} + // Convert ABGR to ARGB. LIBYUV_API int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, @@ -383,6 +394,17 @@ int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, width, height); } +// Convert ARGB to ABGR (same as ABGRToARGB). +LIBYUV_API +int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskABGRToARGB), + width, height); +} + // Convert RGBA to ARGB. LIBYUV_API int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index f1297ca..3857008 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -37,6 +37,10 @@ void CopyPlane(const uint8* src_y, int src_stride_y, height = 1; src_stride_y = dst_stride_y = 0; } + // Nothing to do. 
+ if (src_y == dst_y && src_stride_y == dst_stride_y) { + return; + } #if defined(HAS_COPYROW_X86) if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { CopyRow = CopyRow_X86; diff --git a/source/rotate_mips.cc b/source/rotate_mips.cc index 04d5a66..70770fd 100644 --- a/source/rotate_mips.cc +++ b/source/rotate_mips.cc @@ -18,7 +18,8 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, uint8* dst, int dst_stride, @@ -303,10 +304,8 @@ void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, [width] "+r" (width) :[src_stride] "r" (src_stride), [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", "s4", - "s5", "s6", "s7" + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" ); } diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 7375bab..d354e11 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -17,8 +17,8 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__native_client__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + static uvec8 kVTbl4x4Transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; @@ -37,13 +37,21 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "1: \n" "mov %0, %1 \n" + MEMACCESS(0) "vld1.8 {d0}, [%0], %2 \n" + MEMACCESS(0) "vld1.8 {d1}, [%0], %2 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0], %2 \n" + MEMACCESS(0) "vld1.8 {d3}, [%0], %2 \n" + MEMACCESS(0) "vld1.8 {d4}, [%0], %2 \n" + MEMACCESS(0) "vld1.8 {d5}, [%0], %2 \n" + MEMACCESS(0) "vld1.8 {d6}, [%0], %2 \n" + MEMACCESS(0) "vld1.8 {d7}, [%0] \n" "vtrn.8 d1, d0 \n" @@ -68,13 +76,21 @@ void TransposeWx8_NEON(const uint8* src, int 
src_stride, "mov %0, %3 \n" + MEMACCESS(0) "vst1.8 {d1}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d3}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d5}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d7}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d6}, [%0] \n" "add %1, #8 \n" // src += 8 @@ -96,17 +112,26 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, // 4x8 block "mov %0, %1 \n" + MEMACCESS(0) "vld1.32 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) "vld1.32 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) "vld1.32 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) "vld1.32 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) "vld1.32 {d2[0]}, [%0], %2 \n" + MEMACCESS(0) "vld1.32 {d2[1]}, [%0], %2 \n" + MEMACCESS(0) "vld1.32 {d3[0]}, [%0], %2 \n" + MEMACCESS(0) "vld1.32 {d3[1]}, [%0] \n" "mov %0, %3 \n" + MEMACCESS(6) "vld1.8 {q3}, [%6] \n" "vtbl.8 d4, {d0, d1}, d6 \n" @@ -116,15 +141,23 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, // TODO(frkoenig): Rework shuffle above to // write out with 4 instead of 8 writes. 
+ MEMACCESS(0) "vst1.32 {d4[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d4[1]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d5[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d5[1]}, [%0] \n" "add %0, %3, #4 \n" + MEMACCESS(0) "vst1.32 {d0[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d0[1]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d1[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d1[1]}, [%0] \n" "add %1, #4 \n" // src += 4 @@ -140,20 +173,30 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, // 2x8 block "2: \n" "mov %0, %1 \n" + MEMACCESS(0) "vld1.16 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) "vld1.16 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) "vld1.16 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) "vld1.16 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) "vld1.16 {d0[2]}, [%0], %2 \n" + MEMACCESS(0) "vld1.16 {d1[2]}, [%0], %2 \n" + MEMACCESS(0) "vld1.16 {d0[3]}, [%0], %2 \n" + MEMACCESS(0) "vld1.16 {d1[3]}, [%0] \n" "vtrn.8 d0, d1 \n" "mov %0, %3 \n" + MEMACCESS(0) "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) "vst1.64 {d1}, [%0] \n" "add %1, #2 \n" // src += 2 @@ -163,15 +206,24 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, // 1x8 block "3: \n" + MEMACCESS(1) "vld1.8 {d0[0]}, [%1], %2 \n" + MEMACCESS(1) "vld1.8 {d0[1]}, [%1], %2 \n" + MEMACCESS(1) "vld1.8 {d0[2]}, [%1], %2 \n" + MEMACCESS(1) "vld1.8 {d0[3]}, [%1], %2 \n" + MEMACCESS(1) "vld1.8 {d0[4]}, [%1], %2 \n" + MEMACCESS(1) "vld1.8 {d0[5]}, [%1], %2 \n" + MEMACCESS(1) "vld1.8 {d0[6]}, [%1], %2 \n" + MEMACCESS(1) "vld1.8 {d0[7]}, [%1] \n" + MEMACCESS(3) "vst1.64 {d0}, [%3] \n" "4: \n" @@ -206,13 +258,21 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "1: \n" "mov %0, %1 \n" + MEMACCESS(0) "vld2.8 {d0, d1}, [%0], %2 \n" + MEMACCESS(0) "vld2.8 {d2, d3}, [%0], %2 \n" + MEMACCESS(0) "vld2.8 {d4, d5}, [%0], %2 \n" + MEMACCESS(0) "vld2.8 {d6, d7}, [%0], %2 \n" + MEMACCESS(0) "vld2.8 {d16, d17}, [%0], %2 \n" + MEMACCESS(0) "vld2.8 {d18, d19}, [%0], %2 \n" + MEMACCESS(0) "vld2.8 {d20, d21}, [%0], %2 \n" + MEMACCESS(0) "vld2.8 {d22, 
d23}, [%0] \n" "vtrn.8 q1, q0 \n" @@ -241,24 +301,40 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" + MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d6}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d18}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d16}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d22}, [%0], %4 \n" + MEMACCESS(0) "vst1.8 {d20}, [%0] \n" "mov %0, %5 \n" + MEMACCESS(0) "vst1.8 {d3}, [%0], %6 \n" + MEMACCESS(0) "vst1.8 {d1}, [%0], %6 \n" + MEMACCESS(0) "vst1.8 {d7}, [%0], %6 \n" + MEMACCESS(0) "vst1.8 {d5}, [%0], %6 \n" + MEMACCESS(0) "vst1.8 {d19}, [%0], %6 \n" + MEMACCESS(0) "vst1.8 {d17}, [%0], %6 \n" + MEMACCESS(0) "vst1.8 {d23}, [%0], %6 \n" + MEMACCESS(0) "vst1.8 {d21}, [%0] \n" "add %1, #8*2 \n" // src += 8*2 @@ -279,18 +355,27 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "cmp %7, #4 \n" "blt 2f \n" - //TODO(frkoenig): Clean this up + // TODO(frkoenig): Clean this up // 4x8 block "mov %0, %1 \n" + MEMACCESS(0) "vld1.64 {d0}, [%0], %2 \n" + MEMACCESS(0) "vld1.64 {d1}, [%0], %2 \n" + MEMACCESS(0) "vld1.64 {d2}, [%0], %2 \n" + MEMACCESS(0) "vld1.64 {d3}, [%0], %2 \n" + MEMACCESS(0) "vld1.64 {d4}, [%0], %2 \n" + MEMACCESS(0) "vld1.64 {d5}, [%0], %2 \n" + MEMACCESS(0) "vld1.64 {d6}, [%0], %2 \n" + MEMACCESS(0) "vld1.64 {d7}, [%0] \n" + MEMACCESS(8) "vld1.8 {q15}, [%8] \n" "vtrn.8 q0, q1 \n" @@ -307,28 +392,44 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" + MEMACCESS(0) "vst1.32 {d16[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d16[1]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d17[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d17[1]}, [%0], %4 \n" "add %0, %3, #4 \n" + MEMACCESS(0) "vst1.32 {d20[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d20[1]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d21[0]}, [%0], %4 \n" + MEMACCESS(0) "vst1.32 {d21[1]}, [%0] \n" "mov %0, %5 \n" + MEMACCESS(0) "vst1.32 {d18[0]}, 
[%0], %6 \n" + MEMACCESS(0) "vst1.32 {d18[1]}, [%0], %6 \n" + MEMACCESS(0) "vst1.32 {d19[0]}, [%0], %6 \n" + MEMACCESS(0) "vst1.32 {d19[1]}, [%0], %6 \n" "add %0, %5, #4 \n" + MEMACCESS(0) "vst1.32 {d22[0]}, [%0], %6 \n" + MEMACCESS(0) "vst1.32 {d22[1]}, [%0], %6 \n" + MEMACCESS(0) "vst1.32 {d23[0]}, [%0], %6 \n" + MEMACCESS(0) "vst1.32 {d23[1]}, [%0] \n" "add %1, #4*2 \n" // src += 4 * 2 @@ -345,13 +446,21 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, // 2x8 block "2: \n" "mov %0, %1 \n" + MEMACCESS(0) "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + MEMACCESS(0) "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + MEMACCESS(0) "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + MEMACCESS(0) "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + MEMACCESS(0) "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + MEMACCESS(0) "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + MEMACCESS(0) "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + MEMACCESS(0) "vld2.16 {d1[3], d3[3]}, [%0] \n" "vtrn.8 d0, d1 \n" @@ -359,12 +468,16 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" + MEMACCESS(0) "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) "vst1.64 {d2}, [%0] \n" "mov %0, %5 \n" + MEMACCESS(0) "vst1.64 {d1}, [%0], %6 \n" + MEMACCESS(0) "vst1.64 {d3}, [%0] \n" "add %1, #2*2 \n" // src += 2 * 2 @@ -375,16 +488,26 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, // 1x8 block "3: \n" + MEMACCESS(1) "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + MEMACCESS(1) "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + MEMACCESS(1) "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + MEMACCESS(1) "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + MEMACCESS(1) "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + MEMACCESS(1) "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + MEMACCESS(1) "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + MEMACCESS(1) "vld2.8 {d0[7], d1[7]}, [%1] \n" + MEMACCESS(3) "vst1.64 {d0}, [%3] \n" + MEMACCESS(5) "vst1.64 {d1}, [%5] \n" "4: \n" diff --git a/source/row_any.cc b/source/row_any.cc index 90c6a3f..97ef844 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -35,10 
+35,12 @@ extern "C" { } #ifdef HAS_I422TOARGBROW_SSSE3 -YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, - 0, 4, 7) YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1, 4, 7) +#endif // HAS_I422TOARGBROW_SSSE3 +#ifdef HAS_I444TOARGBROW_SSSE3 +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, + 0, 4, 7) YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2, 4, 7) YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, @@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7) YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7) YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15) YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) -#endif // HAS_I422TOARGBROW_SSSE3 +#endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_AVX2 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) #endif // HAS_I422TOARGBROW_AVX2 diff --git a/source/row_mips.cc b/source/row_mips.cc index 4435c55..ae9370c 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -16,7 +16,8 @@ extern "C" { #endif // The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) #ifdef HAS_COPYROW_MIPS void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { @@ -376,7 +377,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { // MIPS DSPR2 functions #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) + (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { __asm__ __volatile__ ( diff --git a/source/row_neon.cc b/source/row_neon.cc index 4f5158f..a84e3e4 100644 --- 
a/source/row_neon.cc +++ b/source/row_neon.cc @@ -16,39 +16,50 @@ extern "C" { #endif // This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__native_client__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 2 U and 2 V from 411 #define READYUV411 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.16 {d2[1]}, [%2]! \n" \ "vmov.u8 d3, d2 \n" \ "vzip.u8 d2, d3 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.8 {d3}, [%2]! \n" \ "vpaddl.u8 q1, q1 \n" \ "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d2, d3 \n" \ @@ -56,7 +67,9 @@ extern "C" { // Read 8 Y and 4 VU from NV21 #define READNV21 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d3, d2 \n" \ @@ -64,6 +77,7 @@ extern "C" { // Read 8 YUY2 #define READYUY2 \ + MEMACCESS(0) \ "vld2.8 {d0, d2}, [%0]! \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ @@ -71,6 +85,7 @@ extern "C" { // Read 8 UYVY #define READUYVY \ + MEMACCESS(0) \ "vld2.8 {d2, d3}, [%0]! 
\n" \ "vmov.u8 d0, d3 \n" \ "vmov.u8 d3, d2 \n" \ @@ -114,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -125,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -145,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -156,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -176,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -187,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -207,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y, uint8* dst_bgra, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -219,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -239,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y, uint8* dst_abgr, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -251,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -271,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y, uint8* dst_rgba, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -282,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -302,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y, uint8* dst_rgb24, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -312,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUV422TORGB "subs %4, %4, #8 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -332,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y, uint8* dst_raw, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -343,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! 
\n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -375,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -386,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" ARGBTORGB565 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_y), // %0 @@ -421,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, uint8* dst_argb1555, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -433,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_y), // %0 @@ -462,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, uint8* dst_argb4444, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -475,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_y), // %0 @@ -493,7 +541,10 @@ void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -504,6 +555,7 @@ void YToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -523,10 +575,12 @@ void I400ToARGBRow_NEON(const uint8* src_y, ".p2align 2 \n" "vmov.u8 d23, #255 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d20}, [%0]! \n" "vmov d21, d20 \n" "vmov d22, d20 \n" "subs %2, %2, #8 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -542,7 +596,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -553,6 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -571,7 +628,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -582,6 +641,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -600,7 +660,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -611,6 +673,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" : "+r"(src_y), // %0 @@ -629,7 +692,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -640,6 +705,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_y), // %0 @@ -657,7 +723,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -668,6 +736,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -684,7 +753,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -695,6 +766,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -713,9 +785,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) "vst1.8 {q1}, [%2]! \n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 @@ -733,9 +808,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) "vld1.8 {q1}, [%1]! 
\n" // load V "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "bgt 1b \n" : @@ -753,8 +831,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 @@ -771,6 +851,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" : "+r"(dst), // %0 @@ -799,10 +880,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #16 \n" // 16 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -823,10 +907,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -847,10 +934,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. "vrev64.32 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -866,8 +956,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. 
"subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -883,9 +975,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -913,9 +1007,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -959,9 +1055,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -988,9 +1086,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -1005,8 +1105,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. 
"subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1021,9 +1123,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1038,8 +1142,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1054,8 +1160,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1071,9 +1179,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 U. + MEMACCESS(2) "vst1.8 {d3}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1090,9 +1201,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 U. 
+ MEMACCESS(2) "vst1.8 {d2}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1110,12 +1224,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "add %1, %0, %1 \n" // stride + src_yuy2 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d3}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1134,12 +1252,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "add %1, %0, %1 \n" // stride + src_uyvy ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d2}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1158,10 +1280,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -1179,11 +1304,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "vmov.u32 d6[0], %3 \n" // selector "1: \n" + MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. 
"subs %2, %2, #8 \n" // 8 processed per loop "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "vtrn.u32 d4, d5 \n" // combine 8 pixels + MEMACCESS(1) "vst1.8 {d4}, [%1]! \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1199,8 +1326,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 /*selector*/, int pix) { asm volatile ( "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 G's. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1215,12 +1344,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 4. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1238,10 +1370,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d3}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 @@ -1261,10 +1397,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d2}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. 
"bgt 1b \n" : "+r"(src_y), // %0 @@ -1281,9 +1421,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1299,9 +1441,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1318,9 +1462,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1339,6 +1485,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1346,6 +1493,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" : "+r"(src_argb), // %0 @@ -1363,12 +1511,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1391,6 +1541,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1406,7 +1557,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1430,7 +1583,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. @@ -1451,7 +1606,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_argb), // %0 @@ -1476,12 +1633,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. @@ -1509,7 +1670,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1548,12 +1711,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
@@ -1565,7 +1732,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1592,12 +1761,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1609,7 +1782,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1635,12 +1810,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. 
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1652,7 +1831,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1678,12 +1859,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1695,7 +1880,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1721,12 +1908,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. 
+ MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. @@ -1738,7 +1929,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1764,12 +1957,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1781,7 +1978,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1807,12 +2006,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! 
\n" // load 8 more RAW pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1824,7 +2027,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1851,22 +2056,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1888,7 +2097,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -1915,22 +2126,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1952,7 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -1979,22 +2196,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -2016,7 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2038,6 +2261,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -2046,6 +2270,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -2064,6 +2289,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -2072,6 +2298,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -2090,6 +2317,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -2098,6 +2326,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2116,6 +2345,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // R @@ -2123,6 +2353,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -2141,6 +2372,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // R @@ -2148,6 +2380,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" : "+r"(src_abgr), // %0 @@ -2166,6 +2399,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // B @@ -2173,6 +2407,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -2191,6 +2426,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2198,6 +2434,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -2216,6 +2453,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2223,6 +2461,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -2253,7 +2492,9 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! 
\n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" @@ -2262,46 +2503,58 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" @@ -2324,7 +2577,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "blt 89f \n" // Blend 8 pixels. "8: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2338,6 +2593,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "bge 8b \n" @@ -2347,7 +2603,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. 
+ MEMACCESS(1) "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2361,6 +2619,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. "bge 1b \n" @@ -2380,6 +2639,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( // Attenuate 8 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q10, d0, d3 \n" // b * a @@ -2388,6 +2648,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2411,6 +2672,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "vmovl.u8 q0, d0 \n" // b (0 .. 255) @@ -2428,6 +2690,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "vqmovn.u16 d0, q0 \n" "vqmovn.u16 d2, q1 \n" "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2452,6 +2715,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q10, d20 \n" // b (0 .. 
255) @@ -2466,6 +2730,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "vqmovn.u16 d22, q11 \n" "vqmovn.u16 d24, q12 \n" "vqmovn.u16 d26, q13 \n" + MEMACCESS(1) "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2486,6 +2751,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -2494,6 +2760,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "vmov d1, d0 \n" // G "vmov d2, d0 \n" // R + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2521,6 +2788,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d30, #50 \n" // BR coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d20 \n" // B to Sepia B @@ -2535,6 +2803,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + MEMACCESS(0) "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2551,12 +2820,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d16, d18, d20, d22}, [%0]! 
\n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit @@ -2595,6 +2866,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2615,7 +2887,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q0, d0, d1 \n" // multiply B @@ -2626,6 +2900,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2646,11 +2921,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2670,11 +2948,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. 
"subs %3, %3, #8 \n" // 8 processed per loop. "vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2699,12 +2980,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d0, d0, d1 \n" // add "vmov.u8 d1, d0 \n" "vmov.u8 d2, d0 \n" + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2723,10 +3007,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 16 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2750,10 +3037,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
"bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2774,21 +3064,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) "vld1.8 {d1}, [%0],%6 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%6 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) "vld1.8 {d3}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) "vst1.8 {d0}, [%3]! \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 @@ -2811,21 +3108,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) "vld1.8 {d1}, [%1],%4 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%4 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) "vld1.8 {d3}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 diff --git a/source/row_win.cc b/source/row_win.cc index f13e4d7..8eb8889 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -10,13 +10,177 @@ #include "libyuv/row.h" +#if defined (_M_X64) +#include <emmintrin.h> +#include <tmmintrin.h> // For _mm_maddubs_epi16 +#endif + #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// This module is for Visual C x86. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for Visual C. 
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) + +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ + +#define UB 127 /* min(127,(int8)(2.018 * 64)) */ +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +static const vec8 kUVToB = { + UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB +}; + +static const vec8 kUVToR = { + UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR +}; + +static const vec8 kUVToG = { + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG +}; + +static const vec8 kVUToB = { + VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, +}; + +static const vec8 kVUToR = { + VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, +}; + +static const vec8 kVUToG = { + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, +}; + +static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; +static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; +static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; +static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; +static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; + +// 64 bit +#if defined(_M_X64) + +// Aligned destination version. 
+__declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_store_si128((__m128i *)dst_argb, xmm0); + _mm_store_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} + +// Unaligned destination version. 
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_storeu_si128((__m128i *)dst_argb, xmm0); + _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} +// 32 bit +#else // defined(_M_X64) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOYROW_SSSE3 -#define YG 74 /* 
(int8)(1.164 * 64 + 0.5) */ - -#define UB 127 /* min(63,(int8)(2.018 * 64)) */ -#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ -#define UR 0 - -#define VB 0 -#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ -#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ - -// Bias -#define BB UB * 128 + VB * 128 -#define BG UG * 128 + VG * 128 -#define BR UR * 128 + VR * 128 - #ifdef HAS_I422TOARGBROW_AVX2 static const lvec8 kUVToB_AVX = { @@ -2079,10 +2228,10 @@ static const lvec16 kUVBiasR_AVX = { // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) __declspec(align(16)) void I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { __asm { push esi push edi @@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I422TOARGBROW_SSSE3 -static const vec8 kUVToB = { - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB -}; - -static const vec8 kUVToR = { - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR -}; - -static const vec8 kUVToG = { - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG -}; - -static const vec8 kVUToB = { - VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, -}; - -static const vec8 kVUToR = { - VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, -}; - -static const vec8 kVUToG = { - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, -}; - -static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; -static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; -static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; -static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; -static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; - // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. // Read 8 UV from 444. 
@@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#endif // defined(_M_X64) +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) #ifdef __cplusplus } // extern "C" diff --git a/source/scale_mips.cc b/source/scale_mips.cc index 4572f45..3eb4f27 100644 --- a/source/scale_mips.cc +++ b/source/scale_mips.cc @@ -18,7 +18,8 @@ extern "C" { // This module is for GCC MIPS DSPR2 #if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 410364a..1b8a5ba 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -16,8 +16,7 @@ extern "C" { #endif // This module is for GCC Neon. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__native_client__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) // NEON downscalers with interpolation. // Provided by Fritz Koenig @@ -29,8 +28,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store odd pixels "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -49,7 +50,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) "vld1.8 {q2, q3}, [%1]! 
\n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop "vpaddl.u8 q0, q0 \n" // row 1 add adjacent @@ -58,6 +61,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpadal.u8 q1, q3 \n" "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -74,8 +78,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -88,16 +94,20 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile ( - "add r4, %0, %3 \n" - "add r5, r4, %3 \n" - "add %3, r5, %3 \n" + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [r4]! \n" - "vld1.8 {q2}, [r5]! \n" - "vld1.8 {q3}, [%3]! \n" + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" "subs %2, %2, #4 \n" "vpaddl.u8 q0, q0 \n" "vpadal.u8 q0, q1 \n" @@ -106,13 +116,17 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpaddl.u16 q0, q0 \n" "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "vmovn.u16 d0, q0 \n" + MEMACCESS(1) "vst1.32 {d0[0]}, [%1]! 
\n" "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" ); } @@ -125,9 +139,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -146,7 +162,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "add %3, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" @@ -183,6 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "vmlal.u8 q8, d3, d24 \n" "vqrshrn.u16 d2, q8, #2 \n" + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" @@ -203,7 +222,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "add %3, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" // average src line 0 with src line 1 @@ -223,6 +244,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "vmlal.u8 q3, d3, d24 \n" "vqrshrn.u16 d2, q3, #2 \n" + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -251,14 +273,18 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( + MEMACCESS(3) "vld1.8 {q3}, [%3] \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! 
\n" "subs %2, %2, #12 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d5[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -273,11 +299,15 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "vld1.8 {q15}, [%6] \n" - "add r4, %0, %3, lsl #1 \n" + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" "add %3, %0 \n" ".p2align 2 \n" "1: \n" @@ -286,9 +316,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [r4]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data @@ -365,18 +398,20 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + MEMACCESS(1) "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d4[0]}, [%1]! 
\n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2), // %5 - "r"(&kMult38_Div9) // %6 - : "r4", "q0", "q1", "q2", "q3", "q8", "q9", - "q13", "q14", "q15", "memory", "cc" + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" ); } @@ -385,7 +420,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( + MEMACCESS(4) "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" ".p2align 2 \n" @@ -395,7 +432,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" "subs %2, %2, #12 \n" @@ -462,7 +501,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + MEMACCESS(1) "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d4[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -495,7 +536,9 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" @@ -504,50 +547,63 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. 
"50: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" "99: \n" + MEMACCESS(0) "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -565,10 +621,14 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 + MEMACCESS(0) "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) "vld2.32 {q2, q3}, [%0]! \n" "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) "vst1.8 {q3}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -586,14 +646,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. 
@@ -603,6 +667,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -622,11 +687,16 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, "mov r12, %3, lsl #2 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d1[1]}, [%0], r12 \n" "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 @@ -647,13 +717,21 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, "add %1, %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d7}, [%1], r12 \n" "vaddl.u8 q0, d0, d1 \n" "vaddl.u8 q1, d2, d3 \n" @@ -666,6 +744,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) "vst1.8 {q0}, [%2]! 
\n" "bgt 1b \n" : "+r"(src_argb), // %0 diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 0a9d95f..0697b9f 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -963,6 +963,63 @@ TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) +#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ + W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + align_buffer_64(src_argb, kStrideA * kHeightA + OFF); \ + align_buffer_64(dst_argb_c, kStrideA * kHeightA); \ + align_buffer_64(dst_argb_opt, kStrideA * kHeightA); \ + srandom(time(NULL)); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i + OFF] = (random() & 0xff); \ + } \ + memset(dst_argb_c, 1, kStrideA * kHeightA); \ + memset(dst_argb_opt, 101, kStrideA * kHeightA); \ + MaskCpuFlags(0); \ + FMT_ATOB(src_argb + OFF, kStrideA, \ + dst_argb_c, kStrideA, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_ATOB(src_argb + OFF, kStrideA, \ + dst_argb_opt, kStrideA, \ + kWidth, NEG kHeight); \ + } \ + MaskCpuFlags(0); \ + FMT_ATOB(dst_argb_c, kStrideA, \ + dst_argb_c, kStrideA, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + FMT_ATOB(dst_argb_opt, kStrideA, \ + dst_argb_opt, kStrideA, \ + kWidth, NEG kHeight); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_64(src_argb); \ + free_aligned_buffer_64(dst_argb_c); \ + free_aligned_buffer_64(dst_argb_opt); \ +} + +#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \ + TESTSYMI(FMT_ATOB, 
BPP_A, STRIDE_A, HEIGHT_A, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ + benchmark_width_, _Opt, +, 0) + +TESTSYM(ARGBToARGB, 4, 4, 1) +TESTSYM(ARGBToBGRA, 4, 4, 1) +TESTSYM(ARGBToABGR, 4, 4, 1) +TESTSYM(BGRAToARGB, 4, 4, 1) +TESTSYM(ABGRToARGB, 4, 4, 1) + TEST_F(libyuvTest, Test565) { SIMD_ALIGNED(uint8 orig_pixels[256][4]); SIMD_ALIGNED(uint8 pixels565[256][2]); diff --git a/util/Makefile b/util/Makefile index be6de35..6044d2a 100644 --- a/util/Makefile +++ b/util/Makefile @@ -1,6 +1,6 @@ -psnr: psnr.cc ssim.cc psnr_main.cc
-ifeq ($(CXX),icl)
- $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
-else
- $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
-endif
+psnr: psnr.cc ssim.cc psnr_main.cc +ifeq ($(CXX),icl) + $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc +else + $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all +endif diff --git a/util/psnr.cc b/util/psnr.cc index e8fd16a..f3cc0cf 100644 --- a/util/psnr.cc +++ b/util/psnr.cc @@ -10,8 +10,6 @@ #include "./psnr.h" // NOLINT -#include <math.h> - #ifdef _OPENMP #include <omp.h> #endif @@ -34,13 +32,8 @@ typedef unsigned long long uint64; // NOLINT #endif // __LP64__ #endif // _MSC_VER -// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) -double ComputePSNR(double sse, double size) { - const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.); - if (sse <= kMINSSE) - sse = kMINSSE; // Produces max PSNR of 128 - return 10.0 * log10(65025.0 * size / sse); -} +// libyuv provides this function when linking library for jpeg support. +#if !defined(HAVE_JPEG) #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) #define HAS_SUMSQUAREERROR_NEON @@ -241,6 +234,16 @@ double ComputeSumSquareError(const uint8* src_a, } return static_cast<double>(sse); } +#endif + +// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) +// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). +double ComputePSNR(double sse, double size) { + const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0); + if (sse <= kMINSSE) + sse = kMINSSE; // Produces max PSNR of 128 + return 10.0 * log10(255.0 * 255.0 * size / sse); +} #ifdef __cplusplus } // extern "C" diff --git a/util/psnr.h b/util/psnr.h index 370337a..0816b97 100644 --- a/util/psnr.h +++ b/util/psnr.h @@ -13,6 +13,8 @@ #ifndef UTIL_PSNR_H_ // NOLINT #define UTIL_PSNR_H_ +#include <math.h> // For log10() + #ifdef __cplusplus extern "C" { #endif @@ -24,13 +26,17 @@ typedef unsigned char uint8; static const double kMaxPSNR = 128.0; -// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse). 
-// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). -double ComputePSNR(double sse, double size); - +// libyuv provides this function when linking library for jpeg support. +// TODO(fbarchard): make psnr lib compatible subset of libyuv. +#if !defined(HAVE_JPEG) // Computer Sum of Squared Error (SSE). // Pass this to ComputePSNR for final result. double ComputeSumSquareError(const uint8* org, const uint8* rec, int size); +#endif + +// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) +// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). +double ComputePSNR(double sse, double size); #ifdef __cplusplus } // extern "C" diff --git a/util/psnr_main.cc b/util/psnr_main.cc index 9cee5f8..a26bc92 100644 --- a/util/psnr_main.cc +++ b/util/psnr_main.cc @@ -32,6 +32,10 @@ #include "./psnr.h" #include "./ssim.h" +#ifdef HAVE_JPEG +#include "libyuv/compare.h" +#include "libyuv/convert.h" +#endif struct metric { double y, u, v, all; @@ -75,6 +79,29 @@ bool ExtractResolutionFromFilename(const char* name, } } } + +#ifdef HAVE_JPEG + // Try parsing file as a jpeg. 
+ FILE* const file_org = fopen(name, "rb"); + if (file_org == NULL) { + fprintf(stderr, "Cannot open %s\n", name); + return false; + } + fseek(file_org, 0, SEEK_END); + size_t total_size = ftell(file_org); + fseek(file_org, 0, SEEK_SET); + uint8* const ch_org = new uint8[total_size]; + memset(ch_org, 0, total_size); + size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org); + fclose(file_org); + if (bytes_org == total_size) { + if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) { + delete[] ch_org; + return true; + } + } + delete[] ch_org; +#endif // HAVE_JPEG return false; } @@ -215,9 +242,18 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec, const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset); const uint8* const v_rec = ch_rec + y_size + uv_size; if (do_psnr) { +#ifdef HAVE_JPEG + double y_err = static_cast<double>( + libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size)); + double u_err = static_cast<double>( + libyuv::ComputeSumSquareError(u_org, u_rec, uv_size)); + double v_err = static_cast<double>( + libyuv::ComputeSumSquareError(v_org, v_rec, uv_size)); +#else double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size); double u_err = ComputeSumSquareError(u_org, u_rec, uv_size); double v_err = ComputeSumSquareError(v_org, v_rec, uv_size); +#endif const double total_err = y_err + u_err + v_err; cur_distortion_psnr->global_y += y_err; cur_distortion_psnr->global_u += u_err; @@ -230,10 +266,10 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec, static_cast<double>(total_size)); } else { distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height); - distorted_frame->u = CalcSSIM(u_org, u_rec, image_width / 2, - image_height / 2); - distorted_frame->v = CalcSSIM(v_org, v_rec, image_width / 2, - image_height / 2); + distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2, + (image_height + 1) / 2); + distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2, + (image_height + 1) / 
2); distorted_frame->all = (distorted_frame->y + distorted_frame->u + distorted_frame->v) / total_size; @@ -386,14 +422,62 @@ int main(int argc, const char* argv[]) { break; size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org); - if (bytes_org < total_size) + if (bytes_org < total_size) { +#ifdef HAVE_JPEG + // Try parsing file as a jpeg. + uint8* const ch_jpeg = new uint8[bytes_org]; + memcpy(ch_jpeg, ch_org, bytes_org); + memset(ch_org, 0, total_size); + + if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, + ch_org, + image_width, + ch_org + y_size, + (image_width + 1) / 2, + ch_org + y_size + uv_size, + (image_width + 1) / 2, + image_width, + image_height, + image_width, + image_height)) { + delete[] ch_jpeg; + break; + } + delete[] ch_jpeg; +#else break; +#endif // HAVE_JPEG + } for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { size_t bytes_rec = fread(ch_rec, sizeof(uint8), total_size, file_rec[cur_rec]); - if (bytes_rec < total_size) + if (bytes_rec < total_size) { +#ifdef HAVE_JPEG + // Try parsing file as a jpeg. 
+ uint8* const ch_jpeg = new uint8[bytes_rec]; + memcpy(ch_jpeg, ch_rec, bytes_rec); + memset(ch_rec, 0, total_size); + + if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, + ch_rec, + image_width, + ch_rec + y_size, + (image_width + 1) / 2, + ch_rec + y_size + uv_size, + (image_width + 1) / 2, + image_width, + image_height, + image_width, + image_height)) { + delete[] ch_jpeg; + break; + } + delete[] ch_jpeg; +#else break; +#endif // HAVE_JPEG + } if (verbose) { printf("%5d", number_of_frames); diff --git a/util/ssim.cc b/util/ssim.cc index d07889a..5a6399b 100644 --- a/util/ssim.cc +++ b/util/ssim.cc @@ -10,7 +10,6 @@ #include "../util/ssim.h" // NOLINT -#include <math.h> #include <string.h> #ifdef __cplusplus diff --git a/util/ssim.h b/util/ssim.h index 40120b4..430eb71 100644 --- a/util/ssim.h +++ b/util/ssim.h @@ -13,6 +13,8 @@ #ifndef UTIL_SSIM_H_ // NOLINT #define UTIL_SSIM_H_ +#include <math.h> // For log10() + #ifdef __cplusplus extern "C" { #endif @@ -25,7 +27,6 @@ typedef unsigned char uint8; double CalcSSIM(const uint8* org, const uint8* rec, const int image_width, const int image_height); -// does -10.0 * log10(1.0 - ssim) double CalcLSSIM(double ssim); #ifdef __cplusplus |