author     Android Chromium Automerger <chromium-automerger@android>  2014-07-02 09:52:58 +0000
committer  Android Chromium Automerger <chromium-automerger@android>  2014-07-02 09:52:58 +0000
commit     448d48d2537f3b269c15656acb1ad4a35d4891f2 (patch)
tree       30d3d08306c1c333688946a83f1b7d43b09a4a2a
parent     ee5df9a9b13628a72d99e479dd762c7317c6fc56 (diff)
parent     1347fdea0715f8903ab37e07976861f56de17210 (diff)
Merge third_party/libyuv from https://chromium.googlesource.com/external/libyuv.git at 1347fdea0715f8903ab37e07976861f56de17210
This commit was generated by merge_from_chromium.py.

Change-Id: Id37bf4ca99264d10ede61d172cf0662464723d8e
-rw-r--r--  DEPS                                |    2
-rw-r--r--  README.chromium                     |    2
-rw-r--r--  include/libyuv/convert_from_argb.h  |   18
-rw-r--r--  include/libyuv/row.h                |   33
-rw-r--r--  include/libyuv/version.h            |    2
-rw-r--r--  libyuv_test.gyp                     |   14
-rw-r--r--  source/compare_neon.cc              |   10
-rw-r--r--  source/convert_argb.cc              |   22
-rw-r--r--  source/planar_functions.cc          |    4
-rw-r--r--  source/rotate_mips.cc               |    9
-rw-r--r--  source/rotate_neon.cc               |  129
-rw-r--r--  source/row_any.cc                   |    8
-rw-r--r--  source/row_mips.cc                  |    7
-rw-r--r--  source/row_neon.cc                  |  308
-rw-r--r--  source/row_win.cc                   |  224
-rw-r--r--  source/scale_mips.cc                |    3
-rw-r--r--  source/scale_neon.cc                |  129
-rw-r--r--  unit_test/convert_test.cc           |   57
-rw-r--r--  util/Makefile                       |   12
-rw-r--r--  util/psnr.cc                        |   21
-rw-r--r--  util/psnr.h                         |   14
-rw-r--r--  util/psnr_main.cc                   |   96
-rw-r--r--  util/ssim.cc                        |    1
-rw-r--r--  util/ssim.h                         |    3
24 files changed, 983 insertions(+), 145 deletions(-)
diff --git a/DEPS b/DEPS
index e4da873..fd71cb0 100644
--- a/DEPS
+++ b/DEPS
@@ -14,7 +14,7 @@ vars = {
"chromium_trunk" : "http://src.chromium.org/svn/trunk",
# chrome://version/ for revision of canary Chrome.
# http://chromium-status.appspot.com/lkgr is a last known good revision.
- "chromium_revision": "262938",
+ "chromium_revision": "274825",
}
# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
diff --git a/README.chromium b/README.chromium
index 78d39a6..9657262 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1007
+Version: 1025
License: BSD
License File: LICENSE
diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h
index f0343a7..90f43af 100644
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -25,24 +25,22 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Convert ARGB To BGRA. (alias)
-#define ARGBToBGRA BGRAToARGB
+// Convert ARGB To BGRA.
LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
-// Convert ARGB To ABGR. (alias)
-#define ARGBToABGR ABGRToARGB
+// Convert ARGB To ABGR.
LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert ARGB To RGBA.
LIBYUV_API
-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
// Convert ARGB To RGB24.
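
Note: with the aliases removed, ARGBToBGRA and ARGBToABGR become real exported entry points with correctly named parameters. A minimal usage sketch (hypothetical 2x2 buffers; strides are bytes per row, so width * 4 for packed 32-bit formats):

    #include "libyuv/convert_from_argb.h"

    void ConvertSample() {
      uint8 src_argb[2 * 2 * 4] = {0};  // 2x2 ARGB input
      uint8 dst_bgra[2 * 2 * 4];        // 2x2 BGRA output
      libyuv::ARGBToBGRA(src_argb, 2 * 4, dst_bgra, 2 * 4, 2, 2);
    }
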
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7a54e4a..e99c441 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -15,6 +15,10 @@
#include "libyuv/basic_types.h"
+#if defined(__native_client__)
+#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE
+#endif
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -47,7 +51,12 @@ extern "C" {
#endif
// Enable for NaCL pepper 33 for bundle and AVX2 support.
-// #define NEW_BINUTILS
+#if defined(__native_client__) && PPAPI_RELEASE >= 33
+#define NEW_BINUTILS
+#endif
+#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+#define LIBYUV_DISABLE_NEON
+#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
@@ -152,6 +161,11 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
#endif
+// The following are available on x64 Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
@@ -237,8 +251,7 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \
- !defined(__native_client__)
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_ABGRTOUVROW_NEON
#define HAS_ABGRTOYROW_NEON
#define HAS_ARGB1555TOARGBROW_NEON
@@ -331,7 +344,8 @@ extern "C" {
#endif
// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_I422TOABGRROW_MIPS_DSPR2
@@ -427,7 +441,7 @@ typedef uint8 uvec8[16];
"lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
#opcode " (%%r15,%%r14),%" #arg "\n" \
BUNDLEUNLOCK
-#else
+#else // defined(__native_client__) && defined(__x86_64__)
#define BUNDLEALIGN "\n"
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
@@ -444,6 +458,15 @@ typedef uint8 uvec8[16];
#opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
#define MEMOPARG(opcode, offset, base, index, scale, arg) \
#opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#endif // defined(__native_client__) && defined(__x86_64__)
+
+#if defined(__arm__)
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#else
+#define MEMACCESS(base) "\n"
+#endif
#endif
void I444ToARGBRow_NEON(const uint8* src_y,
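
Note: the new ARM MEMACCESS variant exists because the NaCl sandbox restricts data accesses to the low address range, so every pointer must have its top bits cleared with bic immediately before the load or store (the .p2align keeps the mask and the access in the same instruction bundle). A condensed sketch of how the macro composes with a NEON kernel, illustrative rather than a real row function:

    #include "libyuv/row.h"  // for MEMACCESS

    // On NaCl/ARM, MEMACCESS(0) expands to ".p2align 3\nbic %0, #0xc0000000\n",
    // masking operand 0 into the sandboxed range; elsewhere it is a no-op.
    void LoadGuarded(const uint8* src) {
      asm volatile (
        MEMACCESS(0)
        "vld1.8 {q0}, [%0] \n"  // 16-byte NEON load through the masked pointer
        : "+r"(src)
        :
        : "q0", "memory");
    }
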
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 16faa9b..1663926 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1007
+#define LIBYUV_VERSION 1025
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/libyuv_test.gyp b/libyuv_test.gyp
index e7e9e76..ab04cde 100644
--- a/libyuv_test.gyp
+++ b/libyuv_test.gyp
@@ -56,6 +56,12 @@
'LIBYUV_DISABLE_NEON'
],
}],
+ [ 'OS == "ios"', {
+ 'xcode_settings': {
+ 'DEBUGGING_SYMBOLS': 'YES',
+ 'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
+ },
+ }],
[ 'OS != "ios"', {
'defines': [
'HAVE_JPEG',
@@ -116,7 +122,15 @@
'LIBYUV_DISABLE_NEON'
],
}],
+ [ 'OS != "ios"', {
+ 'defines': [
+ 'HAVE_JPEG',
+ ],
+ }],
], # conditions
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
},
{
'target_name': 'cpuid',
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
index 77f42f4..5e7b8e4 100644
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc
@@ -9,6 +9,7 @@
*/
#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -27,14 +28,9 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
".p2align 2 \n"
"1: \n"
- // TODO(fbarchard): Define a macro for clearing address bits for NaCL.
-#if defined(__native_client__)
- "bic %0, #0xc0000000 \n"
-#endif
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n"
-#if defined(__native_client__)
- "bic %1, #0xc0000000 \n"
-#endif
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index a8aab91..5f97f64 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -372,6 +372,17 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
width, height);
}
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB),
+ width, height);
+}
+
// Convert ABGR to ARGB.
LIBYUV_API
int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
@@ -383,6 +394,17 @@ int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
width, height);
}
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB),
+ width, height);
+}
+
// Convert RGBA to ARGB.
LIBYUV_API
int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
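
Note: ARGBToBGRA can reuse kShuffleMaskBGRAToARGB because the BGRA<->ARGB permutation reverses the four bytes of each pixel and is therefore its own inverse; likewise the ABGR mask, which swaps bytes 0 and 2, works in both directions. A scalar sketch of what the shuffle performs per pixel for the BGRA case (hypothetical helper, illustration only):

    // Reverse the 4 bytes of each 32-bit pixel: BGRA <-> ARGB (self-inverse).
    void ReversePixelBytes(const uint8* src, uint8* dst, int pixels) {
      for (int i = 0; i < pixels; ++i) {
        dst[4 * i + 0] = src[4 * i + 3];
        dst[4 * i + 1] = src[4 * i + 2];
        dst[4 * i + 2] = src[4 * i + 1];
        dst[4 * i + 3] = src[4 * i + 0];
      }
    }
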
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index f1297ca..3857008 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -37,6 +37,10 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
+ // Nothing to do.
+ if (src_y == dst_y && src_stride_y == dst_stride_y) {
+ return;
+ }
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
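
Note: the new guard turns an exact in-place CopyPlane into a no-op instead of a redundant row-by-row copy. A usage sketch of the case it catches (hypothetical buffer):

    #include "libyuv/planar_functions.h"

    uint8 plane[64 * 64] = {0};
    // Same pointer and stride on both sides: returns immediately after this patch.
    libyuv::CopyPlane(plane, 64, plane, 64, 64, 64);
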
diff --git a/source/rotate_mips.cc b/source/rotate_mips.cc
index 04d5a66..70770fd 100644
--- a/source/rotate_mips.cc
+++ b/source/rotate_mips.cc
@@ -18,7 +18,8 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
@@ -303,10 +304,8 @@ void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3", "s4",
- "s5", "s6", "s7"
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
);
}
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 7375bab..d354e11 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -17,8 +17,8 @@ namespace libyuv {
extern "C" {
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__native_client__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
@@ -37,13 +37,21 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"1: \n"
"mov %0, %1 \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n"
@@ -68,13 +76,21 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"mov %0, %3 \n"
+ MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d6}, [%0] \n"
"add %1, #8 \n" // src += 8
@@ -96,17 +112,26 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// 4x8 block
"mov %0, %1 \n"
+ MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.32 {d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.32 {d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.32 {d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.32 {d3[1]}, [%0] \n"
"mov %0, %3 \n"
+ MEMACCESS(6)
"vld1.8 {q3}, [%6] \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
@@ -116,15 +141,23 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
+ MEMACCESS(0)
"vst1.32 {d4[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d4[1]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d5[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d5[1]}, [%0] \n"
"add %0, %3, #4 \n"
+ MEMACCESS(0)
"vst1.32 {d0[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d0[1]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d1[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d1[1]}, [%0] \n"
"add %1, #4 \n" // src += 4
@@ -140,20 +173,30 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// 2x8 block
"2: \n"
"mov %0, %1 \n"
+ MEMACCESS(0)
"vld1.16 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.16 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.16 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.16 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.16 {d0[2]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.16 {d1[2]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.16 {d0[3]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.16 {d1[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
"mov %0, %3 \n"
+ MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.64 {d1}, [%0] \n"
"add %1, #2 \n" // src += 2
@@ -163,15 +206,24 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// 1x8 block
"3: \n"
+ MEMACCESS(1)
"vld1.8 {d0[0]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld1.8 {d0[1]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld1.8 {d0[2]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld1.8 {d0[3]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld1.8 {d0[4]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld1.8 {d0[5]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld1.8 {d0[6]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld1.8 {d0[7]}, [%1] \n"
+ MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
"4: \n"
@@ -206,13 +258,21 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"1: \n"
"mov %0, %1 \n"
+ MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n"
@@ -241,24 +301,40 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"mov %0, %3 \n"
+ MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.8 {d20}, [%0] \n"
"mov %0, %5 \n"
+ MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.8 {d21}, [%0] \n"
"add %1, #8*2 \n" // src += 8*2
@@ -279,18 +355,27 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"cmp %7, #4 \n"
"blt 2f \n"
- //TODO(frkoenig): Clean this up
+ // TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
+ MEMACCESS(0)
"vld1.64 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.64 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.64 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.64 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.64 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.64 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.64 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
"vld1.64 {d7}, [%0] \n"
+ MEMACCESS(8)
"vld1.8 {q15}, [%8] \n"
"vtrn.8 q0, q1 \n"
@@ -307,28 +392,44 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"mov %0, %3 \n"
+ MEMACCESS(0)
"vst1.32 {d16[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d16[1]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d17[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d17[1]}, [%0], %4 \n"
"add %0, %3, #4 \n"
+ MEMACCESS(0)
"vst1.32 {d20[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d20[1]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d21[0]}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.32 {d21[1]}, [%0] \n"
"mov %0, %5 \n"
+ MEMACCESS(0)
"vst1.32 {d18[0]}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.32 {d18[1]}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.32 {d19[0]}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.32 {d19[1]}, [%0], %6 \n"
"add %0, %5, #4 \n"
+ MEMACCESS(0)
"vst1.32 {d22[0]}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.32 {d22[1]}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.32 {d23[0]}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.32 {d23[1]}, [%0] \n"
"add %1, #4*2 \n" // src += 4 * 2
@@ -345,13 +446,21 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
// 2x8 block
"2: \n"
"mov %0, %1 \n"
+ MEMACCESS(0)
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
+ MEMACCESS(0)
"vld2.16 {d1[3], d3[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
@@ -359,12 +468,16 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"mov %0, %3 \n"
+ MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
"vst1.64 {d2}, [%0] \n"
"mov %0, %5 \n"
+ MEMACCESS(0)
"vst1.64 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
"vst1.64 {d3}, [%0] \n"
"add %1, #2*2 \n" // src += 2 * 2
@@ -375,16 +488,26 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
// 1x8 block
"3: \n"
+ MEMACCESS(1)
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
+ MEMACCESS(1)
"vld2.8 {d0[7], d1[7]}, [%1] \n"
+ MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
+ MEMACCESS(5)
"vst1.64 {d1}, [%5] \n"
"4: \n"
diff --git a/source/row_any.cc b/source/row_any.cc
index 90c6a3f..97ef844 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -35,10 +35,12 @@ extern "C" {
}
#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
- 0, 4, 7)
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
1, 4, 7)
+#endif // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I444TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+ 0, 4, 7)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
2, 4, 7)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
@@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
-#endif // HAS_I422TOARGBROW_SSSE3
+#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
#endif // HAS_I422TOARGBROW_AVX2
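
Note: the YANY wrappers run the SIMD kernel over the largest MASK-aligned prefix of the row and finish the tail with the C kernel; this guard shuffle moves the I444 wrapper (and friends) under their own HAS_I444TOARGBROW_SSSE3 symbol, presumably so I422 can be enabled alone (as in the new x64 Visual C path in row.h). A simplified sketch of the pattern the macro generates, not the exact expansion:

    void I422ToARGBRow_Any(const uint8* y_buf, const uint8* u_buf,
                           const uint8* v_buf, uint8* rgb_buf, int width) {
      int n = width & ~7;  // MASK == 7: SIMD kernel handles 8 pixels per step
      I422ToARGBRow_Unaligned_SSSE3(y_buf, u_buf, v_buf, rgb_buf, n);
      // UV advance at half rate (the '1' shift argument); ARGB is 4 bpp.
      I422ToARGBRow_C(y_buf + n, u_buf + (n >> 1), v_buf + (n >> 1),
                      rgb_buf + n * 4, width & 7);
    }
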
diff --git a/source/row_mips.cc b/source/row_mips.cc
index 4435c55..ae9370c 100644
--- a/source/row_mips.cc
+++ b/source/row_mips.cc
@@ -16,7 +16,8 @@ extern "C" {
#endif
// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
@@ -376,7 +377,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2)
+ (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 4f5158f..a84e3e4 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -16,39 +16,50 @@ extern "C" {
#endif
// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__native_client__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.32 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 2 U and 2 V from 422
#define READYUV411 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.16 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.16 {d2[1]}, [%2]! \n" \
"vmov.u8 d3, d2 \n" \
"vzip.u8 d2, d3 \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.8 {d3}, [%2]! \n" \
"vpaddl.u8 q1, q1 \n" \
"vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d2, d3 \n" \
@@ -56,7 +67,9 @@ extern "C" {
// Read 8 Y and 4 VU from NV21
#define READNV21 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d3, d2 \n" \
@@ -64,6 +77,7 @@ extern "C" {
// Read 8 YUY2
#define READYUY2 \
+ MEMACCESS(0) \
"vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
@@ -71,6 +85,7 @@ extern "C" {
// Read 8 UYVY
#define READUYVY \
+ MEMACCESS(0) \
"vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \
@@ -114,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -125,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -145,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -156,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -176,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -187,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -207,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -219,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -239,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -251,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -271,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -282,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -302,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -312,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
+ MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -332,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -343,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
+ MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -375,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -386,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
ARGBTORGB565
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -421,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -433,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB1555
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -462,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -475,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB4444
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -493,7 +541,10 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -504,6 +555,7 @@ void YToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -523,10 +575,12 @@ void I400ToARGBRow_NEON(const uint8* src_y,
".p2align 2 \n"
"vmov.u8 d23, #255 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
"subs %2, %2, #8 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -542,7 +596,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -553,6 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -571,7 +628,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -582,6 +641,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -600,7 +660,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -611,6 +673,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
ARGBTORGB565
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -629,7 +692,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -640,6 +705,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
ARGBTORGB565
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -657,7 +723,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -668,6 +736,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -684,7 +753,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -695,6 +766,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -713,9 +785,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store U
+ MEMACCESS(2)
"vst1.8 {q1}, [%2]! \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -733,9 +808,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n"
:
@@ -753,8 +831,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
@@ -771,6 +851,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) {
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
@@ -799,10 +880,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
@@ -823,10 +907,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // dst += 8
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -847,10 +934,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
@@ -866,8 +956,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -883,9 +975,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -913,9 +1007,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -959,9 +1055,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -988,9 +1086,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -1005,8 +1105,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1021,9 +1123,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1038,8 +1142,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1054,8 +1160,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1071,9 +1179,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
"vst1.8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1090,9 +1201,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
"vst1.8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1110,12 +1224,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"add %1, %0, %1 \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
"vst1.8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1134,12 +1252,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"add %1, %0, %1 \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
"vst1.8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1158,10 +1280,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -1179,11 +1304,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"vmov.u32 d6[0], %3 \n" // selector
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels
+ MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1199,8 +1326,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1215,12 +1344,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 4.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1238,10 +1370,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -1261,10 +1397,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -1281,9 +1421,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1299,9 +1441,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1318,9 +1462,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1339,6 +1485,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -1346,6 +1493,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1363,12 +1511,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1391,6 +1541,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -1406,7 +1557,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1430,7 +1583,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
@@ -1451,7 +1606,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1476,12 +1633,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
"vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
"vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1509,7 +1670,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1548,12 +1711,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1565,7 +1732,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1592,12 +1761,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1609,7 +1782,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1635,12 +1810,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1652,7 +1831,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q3, q2, q1)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_bgra), // %0
@@ -1678,12 +1859,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1695,7 +1880,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_abgr), // %0
@@ -1721,12 +1908,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1738,7 +1929,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgba), // %0
@@ -1764,12 +1957,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1781,7 +1978,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -1807,12 +2006,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1824,7 +2027,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -1851,22 +2056,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -1888,7 +2097,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -1915,22 +2126,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -1952,7 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -1979,22 +2196,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -2016,7 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -2038,6 +2261,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
@@ -2046,6 +2270,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -2064,6 +2289,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
@@ -2072,6 +2298,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -2090,6 +2317,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
@@ -2098,6 +2326,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -2116,6 +2345,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R
@@ -2123,6 +2353,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmlal.u8 q8, d3, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_bgra), // %0
@@ -2141,6 +2372,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R
@@ -2148,6 +2380,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_abgr), // %0
@@ -2166,6 +2399,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B
@@ -2173,6 +2407,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmlal.u8 q8, d3, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgba), // %0
@@ -2191,6 +2426,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@@ -2198,6 +2434,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -2216,6 +2453,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@@ -2223,6 +2461,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -2253,7 +2492,9 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
@@ -2262,46 +2503,58 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
@@ -2324,7 +2577,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"blt 89f \n"
// Blend 8 pixels.
"8: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
@@ -2338,6 +2593,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"bge 8b \n"
@@ -2347,7 +2603,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 1 pixels.
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
@@ -2361,6 +2619,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
"vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
"bge 1b \n"
@@ -2380,6 +2639,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
// Attenuate 8 pixels.
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a
@@ -2388,6 +2648,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2411,6 +2672,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
@@ -2428,6 +2690,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d2, q1 \n"
"vqmovn.u16 d4, q2 \n"
+ MEMACCESS(0)
"vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(dst_argb), // %0
@@ -2452,6 +2715,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
@@ -2466,6 +2730,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"vqmovn.u16 d22, q11 \n"
"vqmovn.u16 d24, q12 \n"
"vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2486,6 +2751,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -2494,6 +2760,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
"vmov d1, d0 \n" // G
"vmov d2, d0 \n" // R
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2521,6 +2788,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d30, #50 \n" // BR coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
@@ -2535,6 +2803,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
"vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
"vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
"vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(dst_argb), // %0
@@ -2551,12 +2820,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
@@ -2595,6 +2866,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
"vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2615,7 +2887,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B
@@ -2626,6 +2900,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
"vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2646,11 +2921,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2670,11 +2948,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2699,12 +2980,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d0, d0, d1 \n" // add
"vmov.u8 d1, d0 \n"
"vmov.u8 d2, d0 \n"
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2723,10 +3007,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 16 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
"vqadd.u8 q0, q0, q1 \n" // add
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 16 pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2750,10 +3037,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d1, d0, d2 \n" // add
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2774,21 +3064,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top
+ MEMACCESS(0)
"vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(1)
"vld1.8 {d2}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(2)
"vld1.8 {d2}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
"vld1.8 {d3}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
+ MEMACCESS(3)
"vst1.8 {d0}, [%3]! \n" // store 8 sobelx
"bgt 1b \n"
: "+r"(src_y0), // %0
@@ -2811,21 +3108,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left
+ MEMACCESS(1)
"vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0],%5 \n" // right
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 sobely
"bgt 1b \n"
: "+r"(src_y0), // %0
diff --git a/source/row_win.cc b/source/row_win.cc
index f13e4d7..8eb8889 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -10,13 +10,177 @@
#include "libyuv/row.h"
+#if defined(_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h> // For _mm_maddubs_epi16
+#endif
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(127,(int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB (UB * 128 + VB * 128)
+#define BG (UG * 128 + VG * 128)
+#define BR (UR * 128 + VR * 128)
+
+static const vec8 kUVToB = {
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+ VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+ VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
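These constants move to file scope so the new 64-bit intrinsics path below and the existing 32-bit path can share one copy. Folding the 128 chroma bias into BB/BG/BR buys the inner loop a single subtract per channel; per pixel the blue channel, for example, computes (scalar sketch derived from the comments above):

    static uint8 YUVToB(uint8 y, uint8 u, uint8 v) {
      int b = ((u - 128) * UB + (v - 128) * VB + (y - 16) * YG) >> 6;
      return (uint8)(b < 0 ? 0 : (b > 255 ? 255 : b));
    }
    // e.g. mid-grey y = 128, u = v = 128: (0 + 0 + 112 * 74) >> 6 = 129.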
+// 64 bit
+#if defined(_M_X64)
+
+// Aligned destination version.
+__declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const __m128i xmm4 = _mm_setzero_si128();
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+ while (width > 0) {
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm2 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+ xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+ xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+ xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+ xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+ xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+ xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+ xmm0 = _mm_adds_epi16(xmm0, xmm3);
+ xmm1 = _mm_adds_epi16(xmm1, xmm3);
+ xmm2 = _mm_adds_epi16(xmm2, xmm3);
+ xmm0 = _mm_srai_epi16(xmm0, 6);
+ xmm1 = _mm_srai_epi16(xmm1, 6);
+ xmm2 = _mm_srai_epi16(xmm2, 6);
+ xmm0 = _mm_packus_epi16(xmm0, xmm0);
+ xmm1 = _mm_packus_epi16(xmm1, xmm1);
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+ _mm_store_si128((__m128i *)dst_argb, xmm0);
+ _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
+
+ y_buf += 8;
+ u_buf += 4;
+ dst_argb += 32;
+ width -= 8;
+ }
+}
+
+// Unaligned destination version.
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const __m128i xmm4 = _mm_setzero_si128();
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+ while (width > 0) {
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm2 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+ xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+ xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+ xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+ xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+ xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+ xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+ xmm0 = _mm_adds_epi16(xmm0, xmm3);
+ xmm1 = _mm_adds_epi16(xmm1, xmm3);
+ xmm2 = _mm_adds_epi16(xmm2, xmm3);
+ xmm0 = _mm_srai_epi16(xmm0, 6);
+ xmm1 = _mm_srai_epi16(xmm1, 6);
+ xmm2 = _mm_srai_epi16(xmm2, 6);
+ xmm0 = _mm_packus_epi16(xmm0, xmm0);
+ xmm1 = _mm_packus_epi16(xmm1, xmm1);
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+ _mm_storeu_si128((__m128i *)dst_argb, xmm0);
+ _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
+
+ y_buf += 8;
+ u_buf += 4;
+ dst_argb += 32;
+ width -= 8;
+ }
+}
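The aligned and unaligned x64 variants above are identical except for the final stores (_mm_store_si128 vs _mm_storeu_si128). A caller-side sketch of how such pairs are typically dispatched; the wrapper name here is hypothetical, not part of libyuv:

    static void I422ToARGBRow_Any64(const uint8* y, const uint8* u,
                                    const uint8* v, uint8* dst, int width) {
      if (((uintptr_t)dst & 15) == 0) {
        I422ToARGBRow_SSSE3(y, u, v, dst, width);            // aligned stores
      } else {
        I422ToARGBRow_Unaligned_SSSE3(y, u, v, dst, width);  // unaligned stores
      }
    }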
+// 32 bit
+#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
#endif // HAS_ARGBTOYROW_SSSE3
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
#ifdef HAS_I422TOARGBROW_AVX2
static const lvec8 kUVToB_AVX = {
@@ -2079,10 +2228,10 @@ static const lvec16 kUVBiasR_AVX = {
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
@@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_I422TOARGBROW_SSSE3
-static const vec8 kUVToB = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-static const vec8 kVUToB = {
- VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
-
-static const vec8 kVUToR = {
- VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
-
-static const vec8 kVUToG = {
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Read 8 UV from 444.
@@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif // defined(_M_X64)
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#ifdef __cplusplus
} // extern "C"
diff --git a/source/scale_mips.cc b/source/scale_mips.cc
index 4572f45..3eb4f27 100644
--- a/source/scale_mips.cc
+++ b/source/scale_mips.cc
@@ -18,7 +18,8 @@ extern "C" {
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 410364a..1b8a5ba 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -16,8 +16,7 @@ extern "C" {
#endif
// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__native_client__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// NEON downscalers with interpolation.
// Provided by Fritz Koenig
@@ -29,8 +28,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -49,7 +50,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ MEMACCESS(1)
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
@@ -58,6 +61,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -74,8 +78,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
"vst1.8 {d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -88,16 +94,20 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile (
- "add r4, %0, %3 \n"
- "add r5, r4, %3 \n"
- "add %3, r5, %3 \n"
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [r4]! \n"
- "vld1.8 {q2}, [r5]! \n"
- "vld1.8 {q3}, [%3]! \n"
+ MEMACCESS(3)
+ "vld1.8 {q1}, [%3]! \n"
+ MEMACCESS(4)
+ "vld1.8 {q2}, [%4]! \n"
+ MEMACCESS(5)
+ "vld1.8 {q3}, [%5]! \n"
"subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
@@ -106,13 +116,17 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
+ MEMACCESS(1)
"vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(src_stride) // %3
- : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc"
);
}
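The ScaleRowDown4Box_NEON rework replaces hard-coded r4/r5 scratch registers, which had to appear in the clobber list, with row pointers computed in C and passed as read-write operands, so the compiler allocates registers and every load gets an operand for MEMACCESS to annotate. The prologue it replaces, in effect:

    // Old asm setup, now plain C ahead of the asm block:
    //   add r4, %0, %3   ->  src_ptr1 = src_ptr + src_stride;
    //   add r5, r4, %3   ->  src_ptr2 = src_ptr + src_stride * 2;
    //   add %3, r5, %3   ->  src_ptr3 = src_ptr + src_stride * 3;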
@@ -125,9 +139,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
+ MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -146,7 +162,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@@ -183,6 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
+ MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
@@ -203,7 +222,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
@@ -223,6 +244,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
+ MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -251,14 +273,18 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n"
+ MEMACCESS(1)
"vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -273,11 +299,15 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
asm volatile (
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "vld1.8 {q15}, [%6] \n"
- "add r4, %0, %3, lsl #1 \n"
+ MEMACCESS(5)
+ "vld1.16 {q13}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {q14}, [%6] \n"
+ MEMACCESS(7)
+ "vld1.8 {q15}, [%7] \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
@@ -286,9 +316,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [r4]! \n"
+ MEMACCESS(4)
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
@@ -365,18 +398,20 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2), // %5
- "r"(&kMult38_Div9) // %6
- : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
- "q13", "q14", "q15", "memory", "cc"
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
);
}
@@ -385,7 +420,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ MEMACCESS(4)
"vld1.16 {q13}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
".p2align 2 \n"
@@ -395,7 +432,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n"
@@ -462,7 +501,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -495,7 +536,9 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
@@ -504,50 +547,63 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
+ MEMACCESS(0)
"vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -565,10 +621,14 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
+ MEMACCESS(0)
"vld2.32 {q0, q1}, [%0]! \n"
+ MEMACCESS(0)
"vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
+ MEMACCESS(1)
"vst1.8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -586,14 +646,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
@@ -603,6 +667,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -622,11 +687,16 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n"
+ MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], r12 \n"
+ MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], r12 \n"
+ MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -647,13 +717,21 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
"vld1.8 {d1}, [%1], r12 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0], r12 \n"
+ MEMACCESS(1)
"vld1.8 {d3}, [%1], r12 \n"
+ MEMACCESS(0)
"vld1.8 {d4}, [%0], r12 \n"
+ MEMACCESS(1)
"vld1.8 {d5}, [%1], r12 \n"
+ MEMACCESS(0)
"vld1.8 {d6}, [%0], r12 \n"
+ MEMACCESS(1)
"vld1.8 {d7}, [%1], r12 \n"
"vaddl.u8 q0, d0, d1 \n"
"vaddl.u8 q1, d2, d3 \n"
@@ -666,6 +744,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 0a9d95f..0697b9f 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -963,6 +963,63 @@ TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
+ W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_ATOB##_Symmetric##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ align_buffer_64(src_argb, kStrideA * kHeightA + OFF); \
+ align_buffer_64(dst_argb_c, kStrideA * kHeightA); \
+ align_buffer_64(dst_argb_opt, kStrideA * kHeightA); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (random() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideA * kHeightA); \
+ memset(dst_argb_opt, 101, kStrideA * kHeightA); \
+ MaskCpuFlags(0); \
+ FMT_ATOB(src_argb + OFF, kStrideA, \
+ dst_argb_c, kStrideA, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_ATOB(src_argb + OFF, kStrideA, \
+ dst_argb_opt, kStrideA, \
+ kWidth, NEG kHeight); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_ATOB(dst_argb_c, kStrideA, \
+ dst_argb_c, kStrideA, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ FMT_ATOB(dst_argb_opt, kStrideA, \
+ dst_argb_opt, kStrideA, \
+ kWidth, NEG kHeight); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_64(src_argb); \
+ free_aligned_buffer_64(dst_argb_c); \
+ free_aligned_buffer_64(dst_argb_opt); \
+}
+
+#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTSYM(ARGBToARGB, 4, 4, 1)
+TESTSYM(ARGBToBGRA, 4, 4, 1)
+TESTSYM(ARGBToABGR, 4, 4, 1)
+TESTSYM(BGRAToARGB, 4, 4, 1)
+TESTSYM(ABGRToARGB, 4, 4, 1)
+
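TESTSYM exercises the involution property of these conversions: BGRA is a full byte reversal of ARGB and ABGR is an R/B swap, so applying the same function twice must reproduce the input exactly, which the EXPECT_EQ against src_argb checks. The property in scalar form (a sketch, not test code):

    // ARGBToABGR swaps bytes 0 and 2 of each 4-byte pixel; two swaps
    // are the identity.
    static void SwapRB(uint8* px) {
      uint8 t = px[0]; px[0] = px[2]; px[2] = t;
    }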
TEST_F(libyuvTest, Test565) {
SIMD_ALIGNED(uint8 orig_pixels[256][4]);
SIMD_ALIGNED(uint8 pixels565[256][2]);
diff --git a/util/Makefile b/util/Makefile
index be6de35..6044d2a 100644
--- a/util/Makefile
+++ b/util/Makefile
@@ -1,6 +1,6 @@
-psnr: psnr.cc ssim.cc psnr_main.cc
-ifeq ($(CXX),icl)
- $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
-else
- $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
-endif
+psnr: psnr.cc ssim.cc psnr_main.cc
+ifeq ($(CXX),icl)
+ $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+else
+ $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
+endif
diff --git a/util/psnr.cc b/util/psnr.cc
index e8fd16a..f3cc0cf 100644
--- a/util/psnr.cc
+++ b/util/psnr.cc
@@ -10,8 +10,6 @@
#include "./psnr.h" // NOLINT
-#include <math.h>
-
#ifdef _OPENMP
#include <omp.h>
#endif
@@ -34,13 +32,8 @@ typedef unsigned long long uint64; // NOLINT
#endif // __LP64__
#endif // _MSC_VER
-// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
-double ComputePSNR(double sse, double size) {
- const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.);
- if (sse <= kMINSSE)
- sse = kMINSSE; // Produces max PSNR of 128
- return 10.0 * log10(65025.0 * size / sse);
-}
+// libyuv provides this function when the library is linked with JPEG support.
+#if !defined(HAVE_JPEG)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON
@@ -241,6 +234,16 @@ double ComputeSumSquareError(const uint8* src_a,
}
return static_cast<double>(sse);
}
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size) {
+ const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
+ if (sse <= kMINSSE)
+ sse = kMINSSE; // Produces max PSNR of 128
+ return 10.0 * log10(255.0 * 255.0 * size / sse);
+}
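A quick numeric check of the re-added ComputePSNR (values chosen for illustration):

    // sse = 650 over a 100-sample plane:
    //   10 * log10(255 * 255 * 100 / 650) ~= 40.0 dB
    double psnr = ComputePSNR(650.0, 100.0);
    // sse = 0 is raised to kMINSSE first, so the result caps at kMaxPSNR (128.0).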
#ifdef __cplusplus
} // extern "C"
diff --git a/util/psnr.h b/util/psnr.h
index 370337a..0816b97 100644
--- a/util/psnr.h
+++ b/util/psnr.h
@@ -13,6 +13,8 @@
#ifndef UTIL_PSNR_H_ // NOLINT
#define UTIL_PSNR_H_
+#include <math.h> // For log10()
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -24,13 +26,17 @@ typedef unsigned char uint8;
static const double kMaxPSNR = 128.0;
-// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse).
-// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
-double ComputePSNR(double sse, double size);
-
+// libyuv provides this function when the library is linked with JPEG support.
+// TODO(fbarchard): make the psnr lib a compatible subset of libyuv.
+#if !defined(HAVE_JPEG)
// Compute Sum of Squared Error (SSE).
// Pass this to ComputePSNR for final result.
double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size);
#ifdef __cplusplus
} // extern "C"
diff --git a/util/psnr_main.cc b/util/psnr_main.cc
index 9cee5f8..a26bc92 100644
--- a/util/psnr_main.cc
+++ b/util/psnr_main.cc
@@ -32,6 +32,10 @@
#include "./psnr.h"
#include "./ssim.h"
+#ifdef HAVE_JPEG
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#endif
struct metric {
double y, u, v, all;
@@ -75,6 +79,29 @@ bool ExtractResolutionFromFilename(const char* name,
}
}
}
+
+#ifdef HAVE_JPEG
+ // Try parsing file as a jpeg.
+ FILE* const file_org = fopen(name, "rb");
+ if (file_org == NULL) {
+ fprintf(stderr, "Cannot open %s\n", name);
+ return false;
+ }
+ fseek(file_org, 0, SEEK_END);
+ size_t total_size = ftell(file_org);
+ fseek(file_org, 0, SEEK_SET);
+ uint8* const ch_org = new uint8[total_size];
+ memset(ch_org, 0, total_size);
+ size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+ fclose(file_org);
+ if (bytes_org == total_size) {
+ if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
+ delete[] ch_org;
+ return true;
+ }
+ }
+ delete[] ch_org;
+#endif // HAVE_JPEG
return false;
}
@@ -215,9 +242,18 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
const uint8* const v_rec = ch_rec + y_size + uv_size;
if (do_psnr) {
+#ifdef HAVE_JPEG
+ double y_err = static_cast<double>(
+ libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
+ double u_err = static_cast<double>(
+ libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
+ double v_err = static_cast<double>(
+ libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
+#else
double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
+#endif
const double total_err = y_err + u_err + v_err;
cur_distortion_psnr->global_y += y_err;
cur_distortion_psnr->global_u += u_err;
@@ -230,10 +266,10 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
static_cast<double>(total_size));
} else {
distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
- distorted_frame->u = CalcSSIM(u_org, u_rec, image_width / 2,
- image_height / 2);
- distorted_frame->v = CalcSSIM(v_org, v_rec, image_width / 2,
- image_height / 2);
+ distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
+ (image_height + 1) / 2);
+ distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
+ (image_height + 1) / 2);
distorted_frame->all =
(distorted_frame->y + distorted_frame->u + distorted_frame->v)
/ total_size;
@@ -386,14 +422,62 @@ int main(int argc, const char* argv[]) {
break;
size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
- if (bytes_org < total_size)
+ if (bytes_org < total_size) {
+#ifdef HAVE_JPEG
+ // Try parsing file as a jpeg.
+ uint8* const ch_jpeg = new uint8[bytes_org];
+ memcpy(ch_jpeg, ch_org, bytes_org);
+ memset(ch_org, 0, total_size);
+
+ if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
+ ch_org,
+ image_width,
+ ch_org + y_size,
+ (image_width + 1) / 2,
+ ch_org + y_size + uv_size,
+ (image_width + 1) / 2,
+ image_width,
+ image_height,
+ image_width,
+ image_height)) {
+ delete[] ch_jpeg;
+ break;
+ }
+ delete[] ch_jpeg;
+#else
break;
+#endif // HAVE_JPEG
+ }
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
size_t bytes_rec = fread(ch_rec, sizeof(uint8),
total_size, file_rec[cur_rec]);
- if (bytes_rec < total_size)
+ if (bytes_rec < total_size) {
+#ifdef HAVE_JPEG
+ // Try parsing file as a jpeg.
+ uint8* const ch_jpeg = new uint8[bytes_rec];
+ memcpy(ch_jpeg, ch_rec, bytes_rec);
+ memset(ch_rec, 0, total_size);
+
+ if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
+ ch_rec,
+ image_width,
+ ch_rec + y_size,
+ (image_width + 1) / 2,
+ ch_rec + y_size + uv_size,
+ (image_width + 1) / 2,
+ image_width,
+ image_height,
+ image_width,
+ image_height)) {
+ delete[] ch_jpeg;
+ break;
+ }
+ delete[] ch_jpeg;
+#else
break;
+#endif // HAVE_JPEG
+ }
if (verbose) {
printf("%5d", number_of_frames);
diff --git a/util/ssim.cc b/util/ssim.cc
index d07889a..5a6399b 100644
--- a/util/ssim.cc
+++ b/util/ssim.cc
@@ -10,7 +10,6 @@
#include "../util/ssim.h" // NOLINT
-#include <math.h>
#include <string.h>
#ifdef __cplusplus
diff --git a/util/ssim.h b/util/ssim.h
index 40120b4..430eb71 100644
--- a/util/ssim.h
+++ b/util/ssim.h
@@ -13,6 +13,8 @@
#ifndef UTIL_SSIM_H_ // NOLINT
#define UTIL_SSIM_H_
+#include <math.h> // For log10()
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -25,7 +27,6 @@ typedef unsigned char uint8;
double CalcSSIM(const uint8* org, const uint8* rec,
const int image_width, const int image_height);
-// does -10.0 * log10(1.0 - ssim)
double CalcLSSIM(double ssim);
#ifdef __cplusplus