75 files changed, 7533 insertions(+), 5811 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9928fea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,22 @@
+.gn
+build
+buildtools
+chromium/.gclient_entries
+chromium/.last_sync_chromium
+chromium/src/
+google_apis
+links
+net
+out/
+testing
+third_party/
+tools/android
+tools/clang
+tools/find_depot_tools.py
+tools/gn
+tools/gyp
+tools/memory
+tools/python
+tools/valgrind
+tools/win
+
@@ -69,7 +69,7 @@ source_set("libyuv") {
     "source/video_common.cc",
   ]
 
-  direct_dependent_configs = [ ":libyuv_config" ]
+  public_configs = [ ":libyuv_config" ]
 
   defines = []
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..6f0fccf
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,76 @@
+cmake_minimum_required(VERSION 2.8)
+
+# Basic CMakeLists.txt for libyuv; compiles without the JPEG library.
+# Created for the "roxlu build system" to compile libyuv on Windows.
+
+set(ly_base_dir ${CMAKE_CURRENT_LIST_DIR})
+set(ly_src_dir ${ly_base_dir}/source)
+set(ly_inc_dir ${ly_base_dir}/include)
+set(ly_lib_name "yuv")
+
+set(ly_source_files
+ ${ly_src_dir}/compare.cc
+ ${ly_src_dir}/compare_common.cc
+ ${ly_src_dir}/compare_neon.cc
+ ${ly_src_dir}/compare_posix.cc
+ ${ly_src_dir}/compare_win.cc
+ ${ly_src_dir}/convert.cc
+ ${ly_src_dir}/convert_argb.cc
+ ${ly_src_dir}/convert_from.cc
+ ${ly_src_dir}/convert_from_argb.cc
+ ${ly_src_dir}/convert_to_argb.cc
+ ${ly_src_dir}/convert_to_i420.cc
+ ${ly_src_dir}/cpu_id.cc
+ ${ly_src_dir}/format_conversion.cc
+ ${ly_src_dir}/mjpeg_decoder.cc
+ ${ly_src_dir}/mjpeg_validate.cc
+ ${ly_src_dir}/planar_functions.cc
+ ${ly_src_dir}/rotate.cc
+ ${ly_src_dir}/rotate_argb.cc
+ ${ly_src_dir}/rotate_mips.cc
+ ${ly_src_dir}/rotate_neon.cc
+ ${ly_src_dir}/row_any.cc
+ ${ly_src_dir}/row_common.cc
+ ${ly_src_dir}/row_mips.cc
+ ${ly_src_dir}/row_neon.cc
+ ${ly_src_dir}/row_posix.cc
+ ${ly_src_dir}/row_win.cc
+ ${ly_src_dir}/scale.cc
+ ${ly_src_dir}/scale_argb.cc
+ ${ly_src_dir}/scale_common.cc
+ ${ly_src_dir}/scale_mips.cc
+ ${ly_src_dir}/scale_neon.cc
+ ${ly_src_dir}/scale_win.cc
+ ${ly_src_dir}/video_common.cc
+# ${ly_src_dir}/convert_jpeg.cc
+)
+
+set(ly_header_files
+ ${ly_inc_dir}/libyuv/basic_types.h
+ ${ly_inc_dir}/libyuv/compare.h
+ ${ly_inc_dir}/libyuv/convert.h
+ ${ly_inc_dir}/libyuv/convert_argb.h
+ ${ly_inc_dir}/libyuv/convert_from.h
+ ${ly_inc_dir}/libyuv/convert_from_argb.h
+ ${ly_inc_dir}/libyuv/cpu_id.h
+ ${ly_inc_dir}/libyuv/format_conversion.h
+ ${ly_inc_dir}/libyuv/planar_functions.h
+ ${ly_inc_dir}/libyuv/rotate.h
+ ${ly_inc_dir}/libyuv/rotate_argb.h
+ ${ly_inc_dir}/libyuv/row.h
+ ${ly_inc_dir}/libyuv/scale.h
+ ${ly_inc_dir}/libyuv/scale_argb.h
+ ${ly_inc_dir}/libyuv/scale_row.h
+ ${ly_inc_dir}/libyuv/version.h
+ ${ly_inc_dir}/libyuv/video_common.h
+ ${ly_inc_dir}/libyuv/mjpeg_decoder.h
+)
+
+include_directories(${ly_inc_dir})
+
+add_library(${ly_lib_name} STATIC ${ly_source_files})
+
+install(TARGETS ${ly_lib_name} DESTINATION lib)
+install(FILES ${ly_header_files} DESTINATION include/libyuv)
+install(FILES ${ly_inc_dir}/libyuv.h DESTINATION include/)
+
@@ -1,5 +1,3 @@
-use_relative_paths = True
-
 vars = {
   "libyuv_trunk" : "https://libyuv.googlecode.com/svn/trunk",
 
@@ -8,214 +6,24 @@ vars = {
   "root_dir": "trunk",
   "extra_gyp_flag": "-Dextra_gyp_flag=0",
 
-  # Use this googlecode_url variable only if there is an internal mirror for it.
-  # If you do not know, use the full path while defining your new deps entry.
-  "googlecode_url": "http://%s.googlecode.com/svn",
-  "chromium_trunk" : "http://src.chromium.org/svn/trunk",
-  # chrome://version/ for revision of canary Chrome.
-  # http://chromium-status.appspot.com/lkgr is a last known good revision.
-  "chromium_revision": "280149",
-}
-
-# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
-# https; the latter can cause problems for users behind proxies.
-deps = {
-  "../chromium_deps":
-    File(Var("chromium_trunk") + "/src/DEPS@" + Var("chromium_revision")),
-
-  "../chromium_gn":
-    File(Var("chromium_trunk") + "/src/.gn@" + Var("chromium_revision")),
-
-  "build":
-    Var("chromium_trunk") + "/src/build@" + Var("chromium_revision"),
-
-  "buildtools":
-    From("chromium_deps", "src/buildtools"),
-
-  # Needed by common.gypi.
-  "google_apis/build":
-    Var("chromium_trunk") + "/src/google_apis/build@" + Var("chromium_revision"),
-
-  "testing":
-    Var("chromium_trunk") + "/src/testing@" + Var("chromium_revision"),
-
-  "testing/gtest":
-    From("chromium_deps", "src/testing/gtest"),
-
-  "tools/clang":
-    Var("chromium_trunk") + "/src/tools/clang@" + Var("chromium_revision"),
-
-  "tools/gn":
-    Var("chromium_trunk") + "/src/tools/gn@" + Var("chromium_revision"),
-
-  "tools/gyp":
-    From("chromium_deps", "src/tools/gyp"),
-
-  "tools/memory":
-    Var("chromium_trunk") + "/src/tools/memory@" + Var("chromium_revision"),
-
-  "tools/python":
-    Var("chromium_trunk") + "/src/tools/python@" + Var("chromium_revision"),
-
-  "tools/sanitizer_options":
-    File(Var("chromium_trunk") + "/src/base/debug/sanitizer_options.cc@" + Var("chromium_revision")),
-
-  "tools/tsan_suppressions":
-    File(Var("chromium_trunk") + "/src/base/debug/tsan_suppressions.cc@" + Var("chromium_revision")),
-
-  "tools/valgrind":
-    Var("chromium_trunk") + "/src/tools/valgrind@" + Var("chromium_revision"),
-
-  # Needed by build/common.gypi.
-  "tools/win/supalink":
-    Var("chromium_trunk") + "/src/tools/win/supalink@" + Var("chromium_revision"),
-
-  "third_party/binutils":
-    Var("chromium_trunk") + "/src/third_party/binutils@" + Var("chromium_revision"),
-
-  "third_party/libc++":
-    Var("chromium_trunk") + "/src/third_party/libc++@" + Var("chromium_revision"),
-
-  "third_party/libc++/trunk":
-    From("chromium_deps", "src/third_party/libc++/trunk"),
-
-  "third_party/libc++abi":
-    Var("chromium_trunk") + "/src/third_party/libc++abi@" + Var("chromium_revision"),
-
-  "third_party/libc++abi/trunk":
-    From("chromium_deps", "src/third_party/libc++abi/trunk"),
-
-  "third_party/libjpeg_turbo":
-    From("chromium_deps", "src/third_party/libjpeg_turbo"),
-
-  # Yasm assember required for libjpeg_turbo
-  "third_party/yasm":
-    Var("chromium_trunk") + "/src/third_party/yasm@" + Var("chromium_revision"),
-
-  "third_party/yasm/source/patched-yasm":
-    Var("chromium_trunk") + "/deps/third_party/yasm/patched-yasm@" + Var("chromium_revision"),
-}
-
-deps_os = {
-  "win": {
-    # Use WebRTC's, stripped down, version of Cygwin (required by GYP).
-    "third_party/cygwin":
-      (Var("googlecode_url") % "webrtc") + "/deps/third_party/cygwin@2672",
-
-    # Used by libjpeg-turbo.
-    # TODO(fbarchard): Remove binaries and run yasm from build folder.
-    "third_party/yasm/binaries":
-      Var("chromium_trunk") + "/deps/third_party/yasm/binaries@" + Var("chromium_revision"),
-    "third_party/yasm": None,
-
-    "tools/find_depot_tools":
-      File(Var("chromium_trunk") + "/src/tools/find_depot_tools.py@" + Var("chromium_revision")),
-  },
-  "android": {
-    "third_party/android_tools":
-      From("chromium_deps", "src/third_party/android_tools"),
-
-    "third_party/libjpeg":
-      Var("chromium_trunk") + "/src/third_party/libjpeg@" + Var("chromium_revision"),
-  },
-  "ios": {
-    # NSS, for SSLClientSocketNSS.
-    "third_party/nss":
-      From("chromium_deps", "src/third_party/nss"),
-
-    "net/third_party/nss":
-      Var("chromium_trunk") + "/src/net/third_party/nss@" + Var("chromium_revision"),
-
-    # class-dump utility to generate header files for undocumented SDKs.
-    "testing/iossim/third_party/class-dump":
-      From("chromium_deps", "src/testing/iossim/third_party/class-dump"),
-
-    # Helper for running under the simulator.
-    "testing/iossim":
-      Var("chromium_trunk") + "/src/testing/iossim@" + Var("chromium_revision"),
-  },
+  # Roll the Chromium Git hash to pick up newer versions of all the
+  # dependencies and tools linked to in setup_links.py.
+  "chromium_revision": "2d714fae183152299b3cbf0056eab5fe8bb75e87",
 }
 
 hooks = [
   {
-    # Copy .gn from temporary place (../chromium_gn) to root_dir.
-    "name": "copy .gn",
-    "pattern": ".",
-    "action": ["python", Var("root_dir") + "/build/cp.py",
-               Var("root_dir") + "/../chromium_gn/.gn",
-               Var("root_dir")],
-  },
-  # Pull GN binaries. This needs to be before running GYP below.
-  {
-    "name": "gn_win",
-    "pattern": ".",
-    "action": [ "download_from_google_storage",
-                "--no_resume",
-                "--platform=win32",
-                "--no_auth",
-                "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/buildtools/win/gn.exe.sha1",
-    ],
-  },
-  {
-    "name": "gn_mac",
-    "pattern": ".",
-    "action": [ "download_from_google_storage",
-                "--no_resume",
-                "--platform=darwin",
-                "--no_auth",
-                "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/buildtools/mac/gn.sha1",
-    ],
-  },
-  {
-    "name": "gn_linux",
-    "pattern": ".",
-    "action": [ "download_from_google_storage",
-                "--no_resume",
-                "--platform=linux*",
-                "--no_auth",
-                "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/buildtools/linux64/gn.sha1",
-    ],
-  },
-  {
-    "name": "gn_linux32",
-    "pattern": ".",
-    "action": [ "download_from_google_storage",
-                "--no_resume",
-                "--platform=linux*",
-                "--no_auth",
-                "--bucket", "chromium-gn",
-                "-s", Var("root_dir") + "/buildtools/linux32/gn.sha1",
-    ],
-  },
-  {
-    # Remove GN binaries from tools/gn/bin that aren't used anymore.
-    # TODO(kjellander) remove after the end of July, 2014.
-    "name": "remove_old_gn_binaries",
-    "pattern": ".",
-    "action": ["python", Var("root_dir") + "/tools/gn/bin/rm_binaries.py"],
-  },
-  {
-    # Pull clang on mac. If nothing changed, or on non-mac platforms, this takes
-    # zero seconds to run. If something changed, it downloads a prebuilt clang.
-    "pattern": ".",
-    "action": ["python", Var("root_dir") + "/tools/clang/scripts/update.py",
-               "--if-needed"],
-  },
-  {
-    # Update the Windows toolchain if necessary.
-    "name": "win_toolchain",
+    # Clone chromium and its deps.
+    "name": "sync chromium",
     "pattern": ".",
-    "action": ["python", Var("root_dir") + "/download_vs_toolchain.py",
-               "update"],
+    "action": ["python", "-u", Var("root_dir") + "/sync_chromium.py",
+               "--target-revision", Var("chromium_revision")],
   },
   {
-    # Pull binutils for gold.
-    "name": "binutils",
+    # Create links to shared dependencies in Chromium.
+    "name": "setup_links",
     "pattern": ".",
-    "action": ["python", Var("root_dir") + "/third_party/binutils/download.py"],
+    "action": ["python", Var("root_dir") + "/setup_links.py"],
   },
   {
     # A change to a .gyp, .gypi, or to GYP itself should run the generator.
diff --git a/PRESUBMIT.py b/PRESUBMIT.py
new file mode 100755
index 0000000..80ec300
--- /dev/null
+++ b/PRESUBMIT.py
@@ -0,0 +1,45 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import re
+import sys
+
+
+def GetDefaultTryConfigs(bots=None):
+  """Returns a dict mapping the try server to a {bot: [tests]} dict,
+  optionally filtered by [bots].
+
+  For WebRTC purposes, we always return an empty list of tests, since we want
+  to run all tests by default on all our trybots.
+  """
+  return { 'tryserver.libyuv': dict((bot, []) for bot in bots)}
+
+
+# pylint: disable=W0613
+def GetPreferredTryMasters(project, change):
+  files = change.LocalPaths()
+  bots = [
+    'win',
+    'win_rel',
+    'win_x64_rel',
+    'mac',
+    'mac_rel',
+    'mac_x64_rel',
+    'ios',
+    'ios_rel',
+    'mac_asan',
+    'linux',
+    'linux_rel',
+    'linux_memcheck',
+    'linux_tsan2',
+    'linux_asan',
+    'android',
+    'android_rel',
+  ]
+  if not files or all(re.search(r'[\\/]OWNERS$', f) for f in files):
+    return {}
+  return GetDefaultTryConfigs(bots)
diff --git a/README.chromium b/README.chromium
index e58eb6f..90ff5c2 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1038
+Version: 1130
 License: BSD
 License File: LICENSE
 
diff --git a/chromium/.gclient b/chromium/.gclient
new file mode 100644
index 0000000..1ff06aa
--- /dev/null
+++ b/chromium/.gclient
@@ -0,0 +1,19 @@
+solutions = [{
+  'name': 'src',
+  'url': 'https://chromium.googlesource.com/chromium/src.git',
+  'deps_file': '.DEPS.git',
+  'managed': False,
+  'custom_deps': {
+    # Skip syncing some large dependencies Libyuv will never need.
+    'src/chrome/tools/test/reference_build/chrome_linux': None,
+    'src/chrome/tools/test/reference_build/chrome_mac': None,
+    'src/chrome/tools/test/reference_build/chrome_win': None,
+    'src/native_client': None,
+    'src/third_party/ffmpeg': None,
+    'src/third_party/WebKit': None,
+    'src/v8': None,
+  },
+  'safesync_url': ''
+}]
+
+cache_dir = None
diff --git a/chromium/README b/chromium/README
new file mode 100644
index 0000000..127f4b5
--- /dev/null
+++ b/chromium/README
@@ -0,0 +1,5 @@
+This .gclient file is used to download a copy of Chromium.
+Libyuv uses the Chromium build toolchain and a number of shared
+dependencies by creating symlinks to folders in this checkout,
+using the ../setup_links.py script.
+
@@ -19,7 +19,6 @@ import sys
 
 checkout_root = os.path.dirname(os.path.realpath(__file__))
 sys.path.insert(0, os.path.join(checkout_root, 'build'))
-sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools'))
 import gyp_chromium
 import gyp_helper
 import vs_toolchain
@@ -27,6 +26,13 @@ import vs_toolchain
 sys.path.insert(0, os.path.join(checkout_root, 'tools', 'gyp', 'pylib'))
 import gyp
 
+def GetSupplementalFiles():
+  """Returns a list of the supplemental files that are included in all GYP
+  sources."""
+  # Can't use the one in gyp_chromium since the directory location of the root
+  # is different.
+  return glob.glob(os.path.join(checkout_root, '*', 'supplement.gypi'))
+
 
 if __name__ == '__main__':
   args = sys.argv[1:]
@@ -41,7 +47,10 @@ if __name__ == '__main__':
 
   # If we didn't get a file, assume 'all.gyp' in the root of the checkout.
   if not gyp_file_specified:
-    args.append(os.path.join(checkout_root, 'all.gyp'))
+    # Because of a bug in gyp, simply adding the abspath to all.gyp doesn't
+    # work, but chdir'ing and adding the relative path does. Spooky :/
+    os.chdir(checkout_root)
+    args.append('all.gyp')
 
   # There shouldn't be a circular dependency relationship between .gyp files,
   args.append('--no-circular-check')
@@ -50,7 +59,9 @@ if __name__ == '__main__':
   if not os.environ.get('GYP_GENERATORS'):
     os.environ['GYP_GENERATORS'] = 'ninja'
 
-  vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
+  vs2013_runtime_dll_dirs = None
+  if int(os.environ.get('DEPOT_TOOLS_WIN_TOOLCHAIN', '1')):
+    vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
 
   # Enforce gyp syntax checking. This adds about 20% execution time.
   args.append('--check')
diff --git a/include/libyuv/mjpeg_decoder.h b/include/libyuv/mjpeg_decoder.h
index 82fd95d..8423121 100644
--- a/include/libyuv/mjpeg_decoder.h
+++ b/include/libyuv/mjpeg_decoder.h
@@ -153,7 +153,6 @@ class LIBYUV_API MJpegDecoder {
                             int* subsample_x, int* subsample_y,
                             int number_of_components);
 
 private:
-  void AllocOutputBuffers(int num_outbufs);
  void DestroyOutputBuffers();
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index cdf6cec..058122b 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -15,10 +15,6 @@
 
 #include "libyuv/basic_types.h"
 
-#if defined(__native_client__)
-#include "ppapi/c/pp_macros.h"  // For PPAPI_RELEASE
-#endif
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -51,13 +47,12 @@ extern "C" {
 #define LIBYUV_SSSE3_ONLY
 #endif
 
-// Enable for NaCL pepper 33 for bundle and AVX2 support.
-#if defined(__native_client__) && PPAPI_RELEASE >= 33
-#define NEW_BINUTILS
-#endif
-#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
 #define LIBYUV_DISABLE_NEON
-#endif
+#endif  // clang >= 3.5
+#endif  // __clang__
 
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
@@ -80,7 +75,6 @@ extern "C" {
 #define HAS_ARGBSEPIAROW_SSSE3
 #define HAS_ARGBSHADEROW_SSE2
 #define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -94,6 +88,7 @@ extern "C" {
 #define HAS_SOBELYROW_SSE2
 
 // Conversions:
+#define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
@@ -117,7 +112,6 @@ extern "C" {
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
 #define HAS_COPYROW_X86
-#define HAS_HALFROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
 #define HAS_I422TOARGB1555ROW_SSSE3
@@ -192,6 +186,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
     defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
 // Effects:
+#define HAS_COPYROW_AVX
 #define HAS_ARGBPOLYNOMIALROW_AVX2
 #define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBCOPYALPHAROW_AVX2
@@ -204,8 +199,8 @@ extern "C" {
 #define HAS_ARGBTOUVROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
-#define HAS_HALFROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
@@ -250,8 +245,99 @@ extern "C" {
 #define HAS_MIRRORROW_SSE2
 #endif
 
+// The following are available on arm64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_I444TOARGBROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_YTOARGBROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROWS_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUV422ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
+#endif
+
 // The following are available on Neon platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && \
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__aarch64__) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOUVROW_NEON
 #define HAS_ABGRTOYROW_NEON
@@ -278,7 +364,6 @@ extern "C" {
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
-#define HAS_HALFROW_NEON
 #define HAS_I400TOARGBROW_NEON
 #define HAS_I411TOARGBROW_NEON
 #define HAS_I422TOABGRROW_NEON
@@ -346,7 +431,7 @@ extern "C" {
 
 // The following are available on Mips platforms:
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
 #define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 #define HAS_I422TOABGRROW_MIPS_DSPR2
@@ -400,24 +485,15 @@ typedef uint8 uvec8[16];
 #endif
 
 // NaCL macros for GCC x86 and x64.
-
-// TODO(nfullagar): When pepper_33 toolchain is distributed, default to
-// NEW_BINUTILS and remove all BUNDLEALIGN occurances.
 #if defined(__native_client__)
 #define LABELALIGN ".p2align 5\n"
 #else
 #define LABELALIGN ".p2align 2\n"
 #endif
 #if defined(__native_client__) && defined(__x86_64__)
-#if defined(NEW_BINUTILS)
 #define BUNDLELOCK ".bundle_lock\n"
 #define BUNDLEUNLOCK ".bundle_unlock\n"
 #define BUNDLEALIGN "\n"
-#else
-#define BUNDLELOCK "\n"
-#define BUNDLEUNLOCK "\n"
-#define BUNDLEALIGN ".p2align 5\n"
-#endif
 #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
 #define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
 #define MEMLEA(offset, base) #offset "(%q" #base ")"
@@ -461,7 +537,7 @@ typedef uint8 uvec8[16];
     #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
 #endif  // defined(__native_client__) && defined(__x86_64__)
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 #undef MEMACCESS
 #if defined(__native_client__)
 #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
@@ -559,13 +635,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
 void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
 void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
@@ -644,16 +713,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
                        uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
                        uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
-                                  uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                                 uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -715,15 +774,11 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
 
 void ARGBToUV444Row_SSSE3(const uint8* src_argb,
                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
-                                    uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
                               uint8* dst_u, uint8* dst_v, int width);
 
 void ARGBToUV422Row_SSSE3(const uint8* src_argb,
                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
-                                    uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
                               uint8* dst_u, uint8* dst_v, int width);
 
@@ -761,10 +816,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix);
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix);
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
-                                     uint8* dst_v, int pix);
 void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                          int pix);
 void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -782,8 +833,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width);
 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width);
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width);
 void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                          int width);
 void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -792,6 +841,7 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                          int width);
 
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
 void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
@@ -829,8 +879,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix);
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix);
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                    const uint8* shuffler, int pix);
 void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
                              const uint8* shuffler, int pix);
 void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
@@ -897,7 +945,6 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
@@ -988,6 +1035,11 @@ void I422ToARGBRow_AVX2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_argb,
                         int width);
+void I422ToBGRARow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I444ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1055,7 +1107,6 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb,
                            int width);
-// RGB24/RAW are unaligned.
 void I422ToRGB24Row_SSSE3(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
@@ -1066,56 +1117,16 @@ void I422ToRAWRow_SSSE3(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_raw,
                         int width);
-
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_uv,
-                                   uint8* dst_argb,
-                                   int width);
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_vu,
-                                   uint8* dst_argb,
-                                   int width);
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
-                                   uint8* dst_argb,
-                                   int width);
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
-                                   uint8* dst_argb,
-                                   int width);
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_bgra,
-                                   int width);
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_abgr,
-                                   int width);
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_rgba,
-                                   int width);
 void I422ToARGBRow_Any_AVX2(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb,
                             int width);
+void I422ToBGRARow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1183,7 +1194,6 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
                                const uint8* src_u,
                                const uint8* src_v,
                                uint8* dst_rgba,
                                int width);
-// RGB24/RAW are unaligned.
 void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
                               const uint8* src_u,
                               const uint8* src_v,
@@ -1397,12 +1407,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix);
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix);
@@ -1438,12 +1442,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix);
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix);
@@ -1476,18 +1474,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
                              uint8* dst_u, uint8* dst_v, int pix);
 
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-               uint8* dst_uv, int pix);
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix);
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix);
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix);
-
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
-                  uint16* dst_uv, int pix);
-
 void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
                       uint32 selector, int pix);
 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
@@ -1647,12 +1633,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
 void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride_ptr, int width,
                                 int source_y_fraction);
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                                   ptrdiff_t src_stride_ptr, int width,
-                                   int source_y_fraction);
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                    ptrdiff_t src_stride_ptr, int width,
-                                    int source_y_fraction);
 void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride_ptr, int width,
                              int source_y_fraction);
diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h
index a3bc07e..102158d 100644
--- a/include/libyuv/scale.h
+++ b/include/libyuv/scale.h
@@ -34,6 +34,7 @@ void ScalePlane(const uint8* src, int src_stride,
                 int dst_width, int dst_height,
                 enum FilterMode filtering);
 
+LIBYUV_API
 void ScalePlane_16(const uint16* src, int src_stride,
                    int src_width, int src_height,
                    uint16* dst, int dst_stride,
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 8dc0762..27aa04b 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -44,7 +44,7 @@ extern "C" {
 
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SCALEROWDOWN2_NEON
 #define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEROWDOWN34_NEON
@@ -200,15 +200,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
-                                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width);
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -259,10 +250,10 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx);
 
 // Row functions.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb, int dst_width);
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 1ab8c5c..2e34fc8 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1038
+#define LIBYUV_VERSION 1130
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -10,18 +10,29 @@
   'includes': [
     'libyuv.gypi',
   ],
+  # Make sure that if we are being compiled to an xcodeproj, nothing tries to
+  # include a .pch.
+  'xcode_settings': {
+    'GCC_PREFIX_HEADER': '',
+    'GCC_PRECOMPILE_PREFIX_HEADER': 'NO',
+  },
   'variables': {
     'use_system_libjpeg%': 0,
     'libyuv_disable_jpeg%': 0,
+    # Link-Time Optimizations.
+    'use_lto%': 0,
     'build_neon': 0,
     'conditions': [
-      [ '(target_arch == "armv7" or target_arch == "armv7s" or (target_arch == "arm" and arm_version >= 7)) and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)', {
+      ['(target_arch == "armv7" or target_arch == "armv7s" or \
+        (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
+        and (arm_neon == 1 or arm_neon_optional == 1)',
+      {
        'build_neon': 1,
      }],
    ],
  },
  'conditions': [
-    [ 'build_neon != 0', {
+    ['build_neon != 0', {
      'targets': [
        # The NEON-specific components.
        {
@@ -35,8 +46,20 @@
            '-mfpu=vfpv3',
            '-mfpu=vfpv3-d16',
          ],
-          'cflags': [
-            '-mfpu=neon',
+          'conditions': [
+            # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
+            ['use_lto == 1', {
+              'cflags!': [
+                '-flto',
+                '-ffat-lto-objects',
+              ],
+            }],
+            # arm64 does not need -mfpu=neon option as neon is not optional
+            ['target_arch != "arm64"', {
+              'cflags': [
+                '-mfpu=neon',
+              ],
+            }],
          ],
          'include_dirs': [
            'include',
@@ -51,9 +74,13 @@
          'sources': [
            # sources.
'source/compare_neon.cc', + 'source/compare_neon64.cc', 'source/rotate_neon.cc', + 'source/rotate_neon64.cc', 'source/row_neon.cc', + 'source/row_neon64.cc', 'source/scale_neon.cc', + 'source/scale_neon64.cc', ], }, ], @@ -67,12 +94,7 @@ # Allows libyuv.a redistributable library without external dependencies. 'standalone_static_library': 1, 'conditions': [ - [ 'OS == "ios" and target_subarch == 64', { - 'defines': [ - 'LIBYUV_DISABLE_NEON' - ], - }], - [ 'OS != "ios" and libyuv_disable_jpeg != 1', { + ['OS != "ios" and libyuv_disable_jpeg != 1', { 'defines': [ 'HAVE_JPEG' ], @@ -96,13 +118,10 @@ }], ], }], - [ 'build_neon != 0', { + ['build_neon != 0', { 'dependencies': [ 'libyuv_neon', ], - 'defines': [ - 'LIBYUV_NEON', - ] }], # MemorySanitizer does not support assembly code yet. # http://crbug.com/344505 @@ -111,7 +130,7 @@ 'LIBYUV_DISABLE_X86', ], }], - ], + ], #conditions 'defines': [ # Enable the following 3 macros to turn off assembly for specified CPU. # 'LIBYUV_DISABLE_X86', @@ -119,6 +138,7 @@ # 'LIBYUV_DISABLE_MIPS', # Enable the following macro to build libyuv as a shared library (dll). # 'LIBYUV_USING_SHARED_LIBRARY', + # TODO(fbarchard): Make these into gyp defines. ], 'include_dirs': [ 'include', @@ -129,6 +149,18 @@ 'include', '.', ], + 'conditions': [ + ['OS == "android" and target_arch == "arm64"', { + 'ldflags': [ + '-Wl,--dynamic-linker,/system/bin/linker64', + ], + }], + ['OS == "android" and target_arch != "arm64"', { + 'ldflags': [ + '-Wl,--dynamic-linker,/system/bin/linker', + ], + }], + ], #conditions }, 'sources': [ '<@(libyuv_sources)', diff --git a/libyuv.target.darwin-arm.mk b/libyuv.target.darwin-arm.mk index c04afd0..890d730 100644 --- a/libyuv.target.darwin-arm.mk +++ b/libyuv.target.darwin-arm.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -117,16 +118,9 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ - '-DLIBYUV_NEON' \ '-DUSE_OPENSSL=1' \ '-DUSE_OPENSSL_CERTS=1' \ '-DANDROID' \ @@ -144,10 +138,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -226,16 +217,9 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ - '-DLIBYUV_NEON' \ '-DUSE_OPENSSL=1' \ '-DUSE_OPENSSL_CERTS=1' \ '-DANDROID' \ @@ -253,10 +237,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -280,10 +261,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.darwin-arm64.mk b/libyuv.target.darwin-arm64.mk index 8e7a656..8ecb8e0 100644 --- a/libyuv.target.darwin-arm64.mk +++ b/libyuv.target.darwin-arm64.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -106,12 +107,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -132,10 +127,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -202,12 +194,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -228,10 +214,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -254,10 +237,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.darwin-mips.mk b/libyuv.target.darwin-mips.mk index e446e24..932cf3e 100644 --- a/libyuv.target.darwin-mips.mk +++ b/libyuv.target.darwin-mips.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -110,12 +111,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -136,10 +131,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -211,12 +203,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -237,10 +223,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -264,10 +247,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.darwin-x86.mk b/libyuv.target.darwin-x86.mk index cd82ea9..aad7180 100644 --- a/libyuv.target.darwin-x86.mk +++ b/libyuv.target.darwin-x86.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -112,12 +113,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -138,10 +133,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -214,12 +206,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -240,10 +226,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -266,10 +249,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.darwin-x86_64.mk b/libyuv.target.darwin-x86_64.mk index 7beeb32..1b44d91 100644 --- a/libyuv.target.darwin-x86_64.mk +++ b/libyuv.target.darwin-x86_64.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -111,12 +112,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -137,10 +132,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -212,12 +204,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -238,10 +224,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -264,10 +247,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.linux-arm.mk b/libyuv.target.linux-arm.mk index c04afd0..890d730 100644 --- a/libyuv.target.linux-arm.mk +++ b/libyuv.target.linux-arm.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -117,16 +118,9 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ - '-DLIBYUV_NEON' \ '-DUSE_OPENSSL=1' \ '-DUSE_OPENSSL_CERTS=1' \ '-DANDROID' \ @@ -144,10 +138,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -226,16 +217,9 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ - '-DLIBYUV_NEON' \ '-DUSE_OPENSSL=1' \ '-DUSE_OPENSSL_CERTS=1' \ '-DANDROID' \ @@ -253,10 +237,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -280,10 +261,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.linux-arm64.mk b/libyuv.target.linux-arm64.mk index 8e7a656..8ecb8e0 100644 --- a/libyuv.target.linux-arm64.mk +++ b/libyuv.target.linux-arm64.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -106,12 +107,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -132,10 +127,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -202,12 +194,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -228,10 +214,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -254,10 +237,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.linux-mips.mk b/libyuv.target.linux-mips.mk index e446e24..932cf3e 100644 --- a/libyuv.target.linux-mips.mk +++ b/libyuv.target.linux-mips.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -110,12 +111,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -136,10 +131,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -211,12 +203,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -237,10 +223,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -264,10 +247,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.linux-x86.mk b/libyuv.target.linux-x86.mk index cd82ea9..aad7180 100644 --- a/libyuv.target.linux-x86.mk +++ b/libyuv.target.linux-x86.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -112,12 +113,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -138,10 +133,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
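Besides the include-path cleanup, every target swaps its explicit STL linkage (LOCAL_SHARED_LIBRARIES := libstlport libdl) for settings stamped in by aosp_build_settings: LOCAL_SDK_VERSION := 21 builds the module against NDK API level 21, LOCAL_NDK_STL_VARIANT := stlport_static links STLport statically out of the NDK, and LOCAL_CLANG := false keeps the GCC toolchain. A hypothetical minimal Android.mk using the same settings (module and source names invented) would read:

    LOCAL_PATH := $(call my-dir)
    include $(CLEAR_VARS)
    # yuv_example / example.cc are placeholders for illustration only.
    LOCAL_MODULE := yuv_example
    LOCAL_SRC_FILES := example.cc
    LOCAL_SDK_VERSION := 21
    LOCAL_NDK_STL_VARIANT := stlport_static
    LOCAL_CLANG := false
    include $(BUILD_STATIC_LIBRARY)

Static linking avoids a runtime dependency on the platform's libstlport.so, which is what the removed $(PWD)/external/stlport include path used to reach into.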
@@ -214,12 +206,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -240,10 +226,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -266,10 +249,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv.target.linux-x86_64.mk b/libyuv.target.linux-x86_64.mk index 7beeb32..1b44d91 100644 --- a/libyuv.target.linux-x86_64.mk +++ b/libyuv.target.linux-x86_64.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -111,12 +112,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -137,10 +132,7 @@ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. 
@@ -212,12 +204,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DHAVE_JPEG' \ @@ -238,10 +224,7 @@ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ $(LOCAL_PATH)/third_party/libyuv \ - $(LOCAL_PATH)/third_party/libjpeg_turbo \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libjpeg_turbo # Flags passed to only C++ (and not C) files. @@ -264,10 +247,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv_neon.target.darwin-arm.mk b/libyuv_neon.target.darwin-arm.mk index f4f3e35..014a843 100644 --- a/libyuv_neon.target.darwin-arm.mk +++ b/libyuv_neon.target.darwin-arm.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_neon_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -24,9 +25,13 @@ GYP_COPIED_SOURCE_ORIGIN_DIRS := LOCAL_SRC_FILES := \ third_party/libyuv/source/compare_neon.cc \ + third_party/libyuv/source/compare_neon64.cc \ third_party/libyuv/source/rotate_neon.cc \ + third_party/libyuv/source/rotate_neon64.cc \ third_party/libyuv/source/row_neon.cc \ - third_party/libyuv/source/scale_neon.cc + third_party/libyuv/source/row_neon64.cc \ + third_party/libyuv/source/scale_neon.cc \ + third_party/libyuv/source/scale_neon64.cc # Flags passed to both C and C++ files. @@ -91,12 +96,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DUSE_OPENSSL=1' \ @@ -115,10 +114,7 @@ MY_DEFS_Debug := \ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ - $(LOCAL_PATH)/third_party/libyuv \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libyuv # Flags passed to only C++ (and not C) files. 
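The libyuv_neon targets additionally pick up the new AArch64 kernels (compare_neon64.cc, rotate_neon64.cc, row_neon64.cc, scale_neon64.cc), while -DLIBYUV_NEON disappears from the plain libyuv targets. Listing both kernel generations unconditionally is safe because each source file guards itself: the 32-bit files compile only under __ARM_NEON__ and !__aarch64__, the 64-bit files only under __aarch64__ (see the compare_neon.cc and compare_neon64.cc hunks later in this diff). Had the sources not been self-guarding, a makefile would have to scope them per ABI instead; a hypothetical ndk-build-style sketch:

    # Hypothetical per-ABI source selection; not needed here because the
    # real libyuv sources carry their own __aarch64__ compile guards.
    ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
    LOCAL_SRC_FILES += third_party/libyuv/source/row_neon64.cc
    else
    LOCAL_SRC_FILES += third_party/libyuv/source/row_neon.cc
    endif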
@@ -198,12 +194,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DUSE_OPENSSL=1' \ @@ -222,10 +212,7 @@ MY_DEFS_Release := \ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ - $(LOCAL_PATH)/third_party/libyuv \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libyuv # Flags passed to only C++ (and not C) files. @@ -249,10 +236,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv_neon.target.darwin-arm64.mk b/libyuv_neon.target.darwin-arm64.mk new file mode 100644 index 0000000..63c8d4f --- /dev/null +++ b/libyuv_neon.target.darwin-arm64.mk @@ -0,0 +1,225 @@ +# This file is generated by gyp; do not edit. + +include $(CLEAR_VARS) + +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +LOCAL_MODULE := third_party_libyuv_libyuv_neon_gyp +LOCAL_MODULE_SUFFIX := .a +LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 +gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) +gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) + +# Make sure our deps are built first. +GYP_TARGET_DEPENDENCIES := + +GYP_GENERATED_OUTPUTS := + +# Make sure our deps and generated files are built first. +LOCAL_ADDITIONAL_DEPENDENCIES := $(GYP_TARGET_DEPENDENCIES) $(GYP_GENERATED_OUTPUTS) + +LOCAL_CPP_EXTENSION := .cc +LOCAL_GENERATED_SOURCES := + +GYP_COPIED_SOURCE_ORIGIN_DIRS := + +LOCAL_SRC_FILES := \ + third_party/libyuv/source/compare_neon.cc \ + third_party/libyuv/source/compare_neon64.cc \ + third_party/libyuv/source/rotate_neon.cc \ + third_party/libyuv/source/rotate_neon64.cc \ + third_party/libyuv/source/row_neon.cc \ + third_party/libyuv/source/row_neon64.cc \ + third_party/libyuv/source/scale_neon.cc \ + third_party/libyuv/source/scale_neon64.cc + + +# Flags passed to both C and C++ files. 
+MY_CFLAGS_Debug := \ + --param=ssp-buffer-size=4 \ + -fno-strict-aliasing \ + -Wno-unused-parameter \ + -Wno-missing-field-initializers \ + -fvisibility=hidden \ + -pipe \ + -fPIC \ + -Wno-unused-local-typedefs \ + -Wno-format \ + -ffunction-sections \ + -funwind-tables \ + -g \ + -fno-short-enums \ + -finline-limit=64 \ + -Wa,--noexecstack \ + -U_FORTIFY_SOURCE \ + -Wno-extra \ + -Wno-ignored-qualifiers \ + -Wno-type-limits \ + -Wno-unused-but-set-variable \ + -Wno-address \ + -Wno-format-security \ + -Wno-return-type \ + -Wno-sequence-point \ + -Os \ + -g \ + -gdwarf-4 \ + -fdata-sections \ + -ffunction-sections \ + -funwind-tables + +MY_DEFS_Debug := \ + '-DV8_DEPRECATION_WARNINGS' \ + '-D_FILE_OFFSET_BITS=64' \ + '-DNO_TCMALLOC' \ + '-DDISABLE_NACL' \ + '-DCHROMIUM_BUILD' \ + '-DUSE_LIBJPEG_TURBO=1' \ + '-DENABLE_WEBRTC=1' \ + '-DUSE_PROPRIETARY_CODECS' \ + '-DENABLE_BROWSER_CDMS' \ + '-DENABLE_CONFIGURATION_POLICY' \ + '-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \ + '-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \ + '-DENABLE_EGLIMAGE=1' \ + '-DCLD_VERSION=1' \ + '-DENABLE_PRINTING=1' \ + '-DENABLE_MANAGED_USERS=1' \ + '-DVIDEO_HOLE=1' \ + '-DENABLE_LOAD_COMPLETION_HACKS=1' \ + '-DUSE_OPENSSL=1' \ + '-DUSE_OPENSSL_CERTS=1' \ + '-DANDROID' \ + '-D__GNU_SOURCE=1' \ + '-DUSE_STLPORT=1' \ + '-D_STLP_USE_PTR_SPECIALIZATIONS=1' \ + '-DCHROME_BUILD_ID=""' \ + '-DDYNAMIC_ANNOTATIONS_ENABLED=1' \ + '-DWTF_USE_DYNAMIC_ANNOTATIONS=1' \ + '-D_DEBUG' + + +# Include paths placed before CFLAGS/CPPFLAGS +LOCAL_C_INCLUDES_Debug := \ + $(gyp_shared_intermediate_dir) \ + $(LOCAL_PATH)/third_party/libyuv/include \ + $(LOCAL_PATH)/third_party/libyuv + + +# Flags passed to only C++ (and not C) files. +LOCAL_CPPFLAGS_Debug := \ + -fno-exceptions \ + -fno-rtti \ + -fno-threadsafe-statics \ + -fvisibility-inlines-hidden \ + -Wno-deprecated \ + -std=gnu++11 \ + -Wno-narrowing \ + -Wno-literal-suffix \ + -Wno-non-virtual-dtor \ + -Wno-sign-promo \ + -Wno-non-virtual-dtor + + +# Flags passed to both C and C++ files. 
+MY_CFLAGS_Release := \ + --param=ssp-buffer-size=4 \ + -fno-strict-aliasing \ + -Wno-unused-parameter \ + -Wno-missing-field-initializers \ + -fvisibility=hidden \ + -pipe \ + -fPIC \ + -Wno-unused-local-typedefs \ + -Wno-format \ + -ffunction-sections \ + -funwind-tables \ + -g \ + -fno-short-enums \ + -finline-limit=64 \ + -Wa,--noexecstack \ + -U_FORTIFY_SOURCE \ + -Wno-extra \ + -Wno-ignored-qualifiers \ + -Wno-type-limits \ + -Wno-unused-but-set-variable \ + -Wno-address \ + -Wno-format-security \ + -Wno-return-type \ + -Wno-sequence-point \ + -Os \ + -fno-ident \ + -fdata-sections \ + -ffunction-sections \ + -funwind-tables + +MY_DEFS_Release := \ + '-DV8_DEPRECATION_WARNINGS' \ + '-D_FILE_OFFSET_BITS=64' \ + '-DNO_TCMALLOC' \ + '-DDISABLE_NACL' \ + '-DCHROMIUM_BUILD' \ + '-DUSE_LIBJPEG_TURBO=1' \ + '-DENABLE_WEBRTC=1' \ + '-DUSE_PROPRIETARY_CODECS' \ + '-DENABLE_BROWSER_CDMS' \ + '-DENABLE_CONFIGURATION_POLICY' \ + '-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \ + '-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \ + '-DENABLE_EGLIMAGE=1' \ + '-DCLD_VERSION=1' \ + '-DENABLE_PRINTING=1' \ + '-DENABLE_MANAGED_USERS=1' \ + '-DVIDEO_HOLE=1' \ + '-DENABLE_LOAD_COMPLETION_HACKS=1' \ + '-DUSE_OPENSSL=1' \ + '-DUSE_OPENSSL_CERTS=1' \ + '-DANDROID' \ + '-D__GNU_SOURCE=1' \ + '-DUSE_STLPORT=1' \ + '-D_STLP_USE_PTR_SPECIALIZATIONS=1' \ + '-DCHROME_BUILD_ID=""' \ + '-DNDEBUG' \ + '-DNVALGRIND' \ + '-DDYNAMIC_ANNOTATIONS_ENABLED=0' + + +# Include paths placed before CFLAGS/CPPFLAGS +LOCAL_C_INCLUDES_Release := \ + $(gyp_shared_intermediate_dir) \ + $(LOCAL_PATH)/third_party/libyuv/include \ + $(LOCAL_PATH)/third_party/libyuv + + +# Flags passed to only C++ (and not C) files. +LOCAL_CPPFLAGS_Release := \ + -fno-exceptions \ + -fno-rtti \ + -fno-threadsafe-statics \ + -fvisibility-inlines-hidden \ + -Wno-deprecated \ + -std=gnu++11 \ + -Wno-narrowing \ + -Wno-literal-suffix \ + -Wno-non-virtual-dtor \ + -Wno-sign-promo \ + -Wno-non-virtual-dtor + + +LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION)) +LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION)) +LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) +LOCAL_ASFLAGS := $(LOCAL_CFLAGS) +### Rules for final target. +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static + +# Add target alias to "gyp_all_modules" target. +.PHONY: gyp_all_modules +gyp_all_modules: third_party_libyuv_libyuv_neon_gyp + +# Alias gyp target name. 
+.PHONY: libyuv_neon +libyuv_neon: third_party_libyuv_libyuv_neon_gyp + +include $(BUILD_STATIC_LIBRARY) diff --git a/libyuv_neon.target.linux-arm.mk b/libyuv_neon.target.linux-arm.mk index f4f3e35..014a843 100644 --- a/libyuv_neon.target.linux-arm.mk +++ b/libyuv_neon.target.linux-arm.mk @@ -6,6 +6,7 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := third_party_libyuv_libyuv_neon_gyp LOCAL_MODULE_SUFFIX := .a LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) @@ -24,9 +25,13 @@ GYP_COPIED_SOURCE_ORIGIN_DIRS := LOCAL_SRC_FILES := \ third_party/libyuv/source/compare_neon.cc \ + third_party/libyuv/source/compare_neon64.cc \ third_party/libyuv/source/rotate_neon.cc \ + third_party/libyuv/source/rotate_neon64.cc \ third_party/libyuv/source/row_neon.cc \ - third_party/libyuv/source/scale_neon.cc + third_party/libyuv/source/row_neon64.cc \ + third_party/libyuv/source/scale_neon.cc \ + third_party/libyuv/source/scale_neon64.cc # Flags passed to both C and C++ files. @@ -91,12 +96,6 @@ MY_DEFS_Debug := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DUSE_OPENSSL=1' \ @@ -115,10 +114,7 @@ MY_DEFS_Debug := \ LOCAL_C_INCLUDES_Debug := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ - $(LOCAL_PATH)/third_party/libyuv \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libyuv # Flags passed to only C++ (and not C) files. @@ -198,12 +194,6 @@ MY_DEFS_Release := \ '-DCLD_VERSION=1' \ '-DENABLE_PRINTING=1' \ '-DENABLE_MANAGED_USERS=1' \ - '-DDATA_REDUCTION_FALLBACK_HOST="http://compress.googlezip.net:80/"' \ - '-DDATA_REDUCTION_DEV_HOST="https://proxy-dev.googlezip.net:443/"' \ - '-DDATA_REDUCTION_DEV_FALLBACK_HOST="http://proxy-dev.googlezip.net:80/"' \ - '-DSPDY_PROXY_AUTH_ORIGIN="https://proxy.googlezip.net:443/"' \ - '-DDATA_REDUCTION_PROXY_PROBE_URL="http://check.googlezip.net/connect"' \ - '-DDATA_REDUCTION_PROXY_WARMUP_URL="http://www.gstatic.com/generate_204"' \ '-DVIDEO_HOLE=1' \ '-DENABLE_LOAD_COMPLETION_HACKS=1' \ '-DUSE_OPENSSL=1' \ @@ -222,10 +212,7 @@ MY_DEFS_Release := \ LOCAL_C_INCLUDES_Release := \ $(gyp_shared_intermediate_dir) \ $(LOCAL_PATH)/third_party/libyuv/include \ - $(LOCAL_PATH)/third_party/libyuv \ - $(PWD)/frameworks/wilhelm/include \ - $(PWD)/bionic \ - $(PWD)/external/stlport/stlport + $(LOCAL_PATH)/third_party/libyuv # Flags passed to only C++ (and not C) files. @@ -249,10 +236,9 @@ LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CO LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) LOCAL_ASFLAGS := $(LOCAL_CFLAGS) ### Rules for final target. - -LOCAL_SHARED_LIBRARIES := \ - libstlport \ - libdl +### Set directly by aosp_build_settings. 
+LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static # Add target alias to "gyp_all_modules" target. .PHONY: gyp_all_modules diff --git a/libyuv_neon.target.linux-arm64.mk b/libyuv_neon.target.linux-arm64.mk new file mode 100644 index 0000000..63c8d4f --- /dev/null +++ b/libyuv_neon.target.linux-arm64.mk @@ -0,0 +1,225 @@ +# This file is generated by gyp; do not edit. + +include $(CLEAR_VARS) + +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +LOCAL_MODULE := third_party_libyuv_libyuv_neon_gyp +LOCAL_MODULE_SUFFIX := .a +LOCAL_MODULE_TARGET_ARCH := $(TARGET_$(GYP_VAR_PREFIX)ARCH) +LOCAL_SDK_VERSION := 21 +gyp_intermediate_dir := $(call local-intermediates-dir,,$(GYP_VAR_PREFIX)) +gyp_shared_intermediate_dir := $(call intermediates-dir-for,GYP,shared,,,$(GYP_VAR_PREFIX)) + +# Make sure our deps are built first. +GYP_TARGET_DEPENDENCIES := + +GYP_GENERATED_OUTPUTS := + +# Make sure our deps and generated files are built first. +LOCAL_ADDITIONAL_DEPENDENCIES := $(GYP_TARGET_DEPENDENCIES) $(GYP_GENERATED_OUTPUTS) + +LOCAL_CPP_EXTENSION := .cc +LOCAL_GENERATED_SOURCES := + +GYP_COPIED_SOURCE_ORIGIN_DIRS := + +LOCAL_SRC_FILES := \ + third_party/libyuv/source/compare_neon.cc \ + third_party/libyuv/source/compare_neon64.cc \ + third_party/libyuv/source/rotate_neon.cc \ + third_party/libyuv/source/rotate_neon64.cc \ + third_party/libyuv/source/row_neon.cc \ + third_party/libyuv/source/row_neon64.cc \ + third_party/libyuv/source/scale_neon.cc \ + third_party/libyuv/source/scale_neon64.cc + + +# Flags passed to both C and C++ files. +MY_CFLAGS_Debug := \ + --param=ssp-buffer-size=4 \ + -fno-strict-aliasing \ + -Wno-unused-parameter \ + -Wno-missing-field-initializers \ + -fvisibility=hidden \ + -pipe \ + -fPIC \ + -Wno-unused-local-typedefs \ + -Wno-format \ + -ffunction-sections \ + -funwind-tables \ + -g \ + -fno-short-enums \ + -finline-limit=64 \ + -Wa,--noexecstack \ + -U_FORTIFY_SOURCE \ + -Wno-extra \ + -Wno-ignored-qualifiers \ + -Wno-type-limits \ + -Wno-unused-but-set-variable \ + -Wno-address \ + -Wno-format-security \ + -Wno-return-type \ + -Wno-sequence-point \ + -Os \ + -g \ + -gdwarf-4 \ + -fdata-sections \ + -ffunction-sections \ + -funwind-tables + +MY_DEFS_Debug := \ + '-DV8_DEPRECATION_WARNINGS' \ + '-D_FILE_OFFSET_BITS=64' \ + '-DNO_TCMALLOC' \ + '-DDISABLE_NACL' \ + '-DCHROMIUM_BUILD' \ + '-DUSE_LIBJPEG_TURBO=1' \ + '-DENABLE_WEBRTC=1' \ + '-DUSE_PROPRIETARY_CODECS' \ + '-DENABLE_BROWSER_CDMS' \ + '-DENABLE_CONFIGURATION_POLICY' \ + '-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \ + '-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \ + '-DENABLE_EGLIMAGE=1' \ + '-DCLD_VERSION=1' \ + '-DENABLE_PRINTING=1' \ + '-DENABLE_MANAGED_USERS=1' \ + '-DVIDEO_HOLE=1' \ + '-DENABLE_LOAD_COMPLETION_HACKS=1' \ + '-DUSE_OPENSSL=1' \ + '-DUSE_OPENSSL_CERTS=1' \ + '-DANDROID' \ + '-D__GNU_SOURCE=1' \ + '-DUSE_STLPORT=1' \ + '-D_STLP_USE_PTR_SPECIALIZATIONS=1' \ + '-DCHROME_BUILD_ID=""' \ + '-DDYNAMIC_ANNOTATIONS_ENABLED=1' \ + '-DWTF_USE_DYNAMIC_ANNOTATIONS=1' \ + '-D_DEBUG' + + +# Include paths placed before CFLAGS/CPPFLAGS +LOCAL_C_INCLUDES_Debug := \ + $(gyp_shared_intermediate_dir) \ + $(LOCAL_PATH)/third_party/libyuv/include \ + $(LOCAL_PATH)/third_party/libyuv + + +# Flags passed to only C++ (and not C) files. 
+LOCAL_CPPFLAGS_Debug := \ + -fno-exceptions \ + -fno-rtti \ + -fno-threadsafe-statics \ + -fvisibility-inlines-hidden \ + -Wno-deprecated \ + -std=gnu++11 \ + -Wno-narrowing \ + -Wno-literal-suffix \ + -Wno-non-virtual-dtor \ + -Wno-sign-promo \ + -Wno-non-virtual-dtor + + +# Flags passed to both C and C++ files. +MY_CFLAGS_Release := \ + --param=ssp-buffer-size=4 \ + -fno-strict-aliasing \ + -Wno-unused-parameter \ + -Wno-missing-field-initializers \ + -fvisibility=hidden \ + -pipe \ + -fPIC \ + -Wno-unused-local-typedefs \ + -Wno-format \ + -ffunction-sections \ + -funwind-tables \ + -g \ + -fno-short-enums \ + -finline-limit=64 \ + -Wa,--noexecstack \ + -U_FORTIFY_SOURCE \ + -Wno-extra \ + -Wno-ignored-qualifiers \ + -Wno-type-limits \ + -Wno-unused-but-set-variable \ + -Wno-address \ + -Wno-format-security \ + -Wno-return-type \ + -Wno-sequence-point \ + -Os \ + -fno-ident \ + -fdata-sections \ + -ffunction-sections \ + -funwind-tables + +MY_DEFS_Release := \ + '-DV8_DEPRECATION_WARNINGS' \ + '-D_FILE_OFFSET_BITS=64' \ + '-DNO_TCMALLOC' \ + '-DDISABLE_NACL' \ + '-DCHROMIUM_BUILD' \ + '-DUSE_LIBJPEG_TURBO=1' \ + '-DENABLE_WEBRTC=1' \ + '-DUSE_PROPRIETARY_CODECS' \ + '-DENABLE_BROWSER_CDMS' \ + '-DENABLE_CONFIGURATION_POLICY' \ + '-DDISCARDABLE_MEMORY_ALWAYS_SUPPORTED_NATIVELY' \ + '-DSYSTEM_NATIVELY_SIGNALS_MEMORY_PRESSURE' \ + '-DENABLE_EGLIMAGE=1' \ + '-DCLD_VERSION=1' \ + '-DENABLE_PRINTING=1' \ + '-DENABLE_MANAGED_USERS=1' \ + '-DVIDEO_HOLE=1' \ + '-DENABLE_LOAD_COMPLETION_HACKS=1' \ + '-DUSE_OPENSSL=1' \ + '-DUSE_OPENSSL_CERTS=1' \ + '-DANDROID' \ + '-D__GNU_SOURCE=1' \ + '-DUSE_STLPORT=1' \ + '-D_STLP_USE_PTR_SPECIALIZATIONS=1' \ + '-DCHROME_BUILD_ID=""' \ + '-DNDEBUG' \ + '-DNVALGRIND' \ + '-DDYNAMIC_ANNOTATIONS_ENABLED=0' + + +# Include paths placed before CFLAGS/CPPFLAGS +LOCAL_C_INCLUDES_Release := \ + $(gyp_shared_intermediate_dir) \ + $(LOCAL_PATH)/third_party/libyuv/include \ + $(LOCAL_PATH)/third_party/libyuv + + +# Flags passed to only C++ (and not C) files. +LOCAL_CPPFLAGS_Release := \ + -fno-exceptions \ + -fno-rtti \ + -fno-threadsafe-statics \ + -fvisibility-inlines-hidden \ + -Wno-deprecated \ + -std=gnu++11 \ + -Wno-narrowing \ + -Wno-literal-suffix \ + -Wno-non-virtual-dtor \ + -Wno-sign-promo \ + -Wno-non-virtual-dtor + + +LOCAL_CFLAGS := $(MY_CFLAGS_$(GYP_CONFIGURATION)) $(MY_DEFS_$(GYP_CONFIGURATION)) +LOCAL_C_INCLUDES := $(GYP_COPIED_SOURCE_ORIGIN_DIRS) $(LOCAL_C_INCLUDES_$(GYP_CONFIGURATION)) +LOCAL_CPPFLAGS := $(LOCAL_CPPFLAGS_$(GYP_CONFIGURATION)) +LOCAL_ASFLAGS := $(LOCAL_CFLAGS) +### Rules for final target. +### Set directly by aosp_build_settings. +LOCAL_CLANG := false +LOCAL_NDK_STL_VARIANT := stlport_static + +# Add target alias to "gyp_all_modules" target. +.PHONY: gyp_all_modules +gyp_all_modules: third_party_libyuv_libyuv_neon_gyp + +# Alias gyp target name. +.PHONY: libyuv_neon +libyuv_neon: third_party_libyuv_libyuv_neon_gyp + +include $(BUILD_STATIC_LIBRARY) diff --git a/libyuv_test.gyp b/libyuv_test.gyp index 140ba6c..4965b56 100644 --- a/libyuv_test.gyp +++ b/libyuv_test.gyp @@ -9,6 +9,7 @@ { 'variables': { 'libyuv_disable_jpeg%': 0, + 'libyuv_enable_svn%': 0, }, 'targets': [ { @@ -21,7 +22,6 @@ 'testing/gtest.gyp:gtest_main', ], 'defines': [ - 'LIBYUV_SVNREVISION="<!(svnversion -n)"', # Enable the following 3 macros to turn off assembly for specified CPU. 
        # 'LIBYUV_DISABLE_X86',
        # 'LIBYUV_DISABLE_NEON',
@@ -49,6 +49,11 @@
       'unit_test/version_test.cc',
     ],
     'conditions': [
+      [ 'libyuv_enable_svn == 1', {
+        'defines': [
+          'LIBYUV_SVNREVISION="<!(svnversion -n)"',
+        ],
+      }],
       ['OS=="linux"', {
         'cflags': [
           '-fexceptions',
@@ -38,7 +38,7 @@ LOCAL_OBJ_FILES := \
 all: libyuv.a convert
 
 libyuv.a: $(LOCAL_OBJ_FILES)
-	$(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES)
+	$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
 
 # A test utility that uses libyuv conversion.
 convert: util/convert.cc libyuv.a
diff --git a/setup_links.py b/setup_links.py
new file mode 100755
index 0000000..878614f
--- /dev/null
+++ b/setup_links.py
@@ -0,0 +1,471 @@
+#!/usr/bin/env python
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Set up links to a Chromium checkout for Libyuv.
+
+Libyuv shares a lot of dependencies and build tools with Chromium.
+To do this, many of the paths of a Chromium checkout are emulated by creating
+symlinks to files and directories. This script handles the setup of symlinks to
+achieve this.
+
+It's a modified copy of the similar script that lives in WebRTC.
+It also handles cleanup of the legacy Subversion-based approach that was used
+before Chrome switched its master repo over from Subversion to Git.
+"""
+
+
+import ctypes
+import errno
+import logging
+import optparse
+import os
+import shelve
+import shutil
+import subprocess
+import sys
+import textwrap
+
+
+DIRECTORIES = [
+  'build',
+  'buildtools',
+  'google_apis',  # Needed by build/common.gypi.
+  'net',
+  'testing',
+  'third_party/android_testrunner',
+  'third_party/android_tools',
+  'third_party/binutils',
+  'third_party/libc++',
+  'third_party/libc++abi',
+  'third_party/libjpeg',
+  'third_party/libjpeg_turbo',
+  'third_party/llvm-build',
+  'third_party/nss',
+  'third_party/yasm',
+  'tools/android',
+  'tools/clang',
+  'tools/gn',
+  'tools/gyp',
+  'tools/memory',
+  'tools/python',
+  'tools/valgrind',
+  'tools/win',
+]
+
+FILES = {
+  '.gn': None,
+  'tools/find_depot_tools.py': None,
+  'third_party/BUILD.gn': None,
+}
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHROMIUM_CHECKOUT = os.path.join('chromium', 'src')
+LINKS_DB = 'links'
+
+# Version management to make future upgrades/downgrades easier to support.
+SCHEMA_VERSION = 1
+
+
+def query_yes_no(question, default=False):
+  """Ask a yes/no question via raw_input() and return their answer.
+
+  Modified from http://stackoverflow.com/a/3041990.
+  """
+  prompt = " [%s/%%s]: "
+  prompt = prompt % ('Y' if default is True else 'y')
+  prompt = prompt % ('N' if default is False else 'n')
+
+  if default is None:
+    default = 'INVALID'
+
+  while True:
+    sys.stdout.write(question + prompt)
+    choice = raw_input().lower()
+    if choice == '' and default != 'INVALID':
+      return default
+
+    if 'yes'.startswith(choice):
+      return True
+    elif 'no'.startswith(choice):
+      return False
+
+    print "Please respond with 'yes' or 'no' (or 'y' or 'n')."
+
+
+# Actions
+class Action(object):
+  def __init__(self, dangerous):
+    self.dangerous = dangerous
+
+  def announce(self, planning):
+    """Log a description of this action.
+
+    Args:
+      planning - True iff we're in the planning stage, False if we're in the
+                 doit stage.
+    """
+    pass
+
+  def doit(self, links_db):
+    """Execute the action, recording what we did to links_db, if necessary."""
+    pass
+
+
+class Remove(Action):
+  def __init__(self, path, dangerous):
+    super(Remove, self).__init__(dangerous)
+    self._priority = 0
+    self._path = path
+
+  def announce(self, planning):
+    log = logging.warn
+    filesystem_type = 'file'
+    if not self.dangerous:
+      log = logging.info
+      filesystem_type = 'link'
+    if planning:
+      log('Planning to remove %s: %s', filesystem_type, self._path)
+    else:
+      log('Removing %s: %s', filesystem_type, self._path)
+
+  def doit(self, _links_db):
+    os.remove(self._path)
+
+
+class Rmtree(Action):
+  def __init__(self, path):
+    super(Rmtree, self).__init__(dangerous=True)
+    self._priority = 0
+    self._path = path
+
+  def announce(self, planning):
+    if planning:
+      logging.warn('Planning to remove directory: %s', self._path)
+    else:
+      logging.warn('Removing directory: %s', self._path)
+
+  def doit(self, _links_db):
+    if sys.platform.startswith('win'):
+      # shutil.rmtree() doesn't work on Windows if any of the directories are
+      # read-only, which svn repositories are.
+      subprocess.check_call(['rd', '/q', '/s', self._path], shell=True)
+    else:
+      shutil.rmtree(self._path)
+
+
+class Makedirs(Action):
+  def __init__(self, path):
+    super(Makedirs, self).__init__(dangerous=False)
+    self._priority = 1
+    self._path = path
+
+  def doit(self, _links_db):
+    try:
+      os.makedirs(self._path)
+    except OSError as e:
+      if e.errno != errno.EEXIST:
+        raise
+
+
+class Symlink(Action):
+  def __init__(self, source_path, link_path):
+    super(Symlink, self).__init__(dangerous=False)
+    self._priority = 2
+    self._source_path = source_path
+    self._link_path = link_path
+
+  def announce(self, planning):
+    if planning:
+      logging.info(
+          'Planning to create link from %s to %s', self._link_path,
+          self._source_path)
+    else:
+      logging.debug(
+          'Linking from %s to %s', self._link_path, self._source_path)
+
+  def doit(self, links_db):
+    # Files not in the root directory need relative path calculation.
+    # On Windows, use absolute paths instead since NTFS doesn't seem to
+    # support relative paths for symlinks.
+    if sys.platform.startswith('win'):
+      source_path = os.path.abspath(self._source_path)
+    else:
+      if os.path.dirname(self._link_path) != self._link_path:
+        source_path = os.path.relpath(self._source_path,
+                                      os.path.dirname(self._link_path))
+
+    os.symlink(source_path, os.path.abspath(self._link_path))
+    links_db[self._source_path] = self._link_path
+
+
+class LinkError(IOError):
+  """Failed to create a link."""
+  pass
+
+
+# Handles symlink creation on the different platforms.
+if sys.platform.startswith('win'):
+  def symlink(source_path, link_path):
+    flag = 1 if os.path.isdir(source_path) else 0
+    if not ctypes.windll.kernel32.CreateSymbolicLinkW(
+        unicode(link_path), unicode(source_path), flag):
+      raise OSError('Failed to create symlink to %s. Notice that only NTFS '
+                    'version 5.0 and up has all the needed APIs for '
+                    'creating symlinks.' % source_path)
+  os.symlink = symlink
+
+
+class LibyuvLinkSetup():
+  def __init__(self, links_db, force=False, dry_run=False, prompt=False):
+    self._force = force
+    self._dry_run = dry_run
+    self._prompt = prompt
+    self._links_db = links_db
+
+  def CreateLinks(self, on_bot):
+    logging.debug('CreateLinks')
+    # First, make a plan of action
+    actions = []
+
+    for source_path, link_path in FILES.iteritems():
+      actions += self._ActionForPath(
+          source_path, link_path, check_fn=os.path.isfile, check_msg='files')
+    for source_dir in DIRECTORIES:
+      actions += self._ActionForPath(
+          source_dir, None, check_fn=os.path.isdir,
+          check_msg='directories')
+
+    if not on_bot and self._force:
+      # When making the manual switch from legacy SVN checkouts to the new
+      # Git-based Chromium DEPS, the .gclient_entries file that contains cached
+      # URLs for all DEPS entries must be removed to avoid future sync problems.
+      entries_file = os.path.join(os.path.dirname(ROOT_DIR), '.gclient_entries')
+      if os.path.exists(entries_file):
+        actions.append(Remove(entries_file, dangerous=True))
+
+    actions.sort()
+
+    if self._dry_run:
+      for action in actions:
+        action.announce(planning=True)
+      logging.info('Not doing anything because dry-run was specified.')
+      sys.exit(0)
+
+    if any(a.dangerous for a in actions):
+      logging.warn('Dangerous actions:')
+      for action in (a for a in actions if a.dangerous):
+        action.announce(planning=True)
+      print
+
+      if not self._force:
+        logging.error(textwrap.dedent("""\
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+                           A C T I O N     R E Q U I R E D
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        Because chromium/src is transitioning to Git (from SVN), we needed to
+        change the way that the Libyuv standalone checkout works. Instead of
+        individually syncing subdirectories of Chromium in SVN, we're now
+        syncing Chromium (and all of its DEPS, as defined by its own DEPS file),
+        into the `chromium/src` directory.
+
+        As such, all Chromium directories which are currently pulled by DEPS are
+        now replaced with a symlink into the full Chromium checkout.
+
+        To avoid disrupting developers, we've chosen to not delete your
+        directories forcibly, in case you have some work in progress in one of
+        them :).
+
+        ACTION REQUIRED:
+        Before running `gclient sync|runhooks` again, you must run:
+        %s%s --force
+
+        Which will replace all directories which now must be symlinks, after
+        prompting with a summary of the work-to-be-done.
+        """), 'python ' if sys.platform.startswith('win') else '', sys.argv[0])
+        sys.exit(1)
+      elif self._prompt:
+        if not query_yes_no('Would you like to perform the above plan?'):
+          sys.exit(1)
+
+    for action in actions:
+      action.announce(planning=False)
+      action.doit(self._links_db)
+
+    if not on_bot and self._force:
+      logging.info('Completed!\n\nNow run `gclient sync|runhooks` again to '
+                   'let the remaining hooks (that probably were interrupted) '
+                   'execute.')
+
+  def CleanupLinks(self):
+    logging.debug('CleanupLinks')
+    for source, link_path in self._links_db.iteritems():
+      if source == 'SCHEMA_VERSION':
+        continue
+      if os.path.islink(link_path) or sys.platform.startswith('win'):
+        # os.path.islink() always returns false on Windows
+        # See http://bugs.python.org/issue13143.
+        logging.debug('Removing link to %s at %s', source, link_path)
+        if not self._dry_run:
+          if os.path.exists(link_path):
+            if sys.platform.startswith('win') and os.path.isdir(link_path):
+              subprocess.check_call(['rmdir', '/q', link_path], shell=True)
+            else:
+              os.remove(link_path)
+          del self._links_db[source]
+
+  @staticmethod
+  def _ActionForPath(source_path, link_path=None, check_fn=None,
+                     check_msg=None):
+    """Create zero or more Actions to link to a file or directory.
+
+    This will be a symlink on POSIX platforms. On Windows this requires
+    that NTFS is version 5.0 or higher (Vista or newer).
+
+    Args:
+      source_path: Path relative to the Chromium checkout root.
+        For readability, the path may contain slashes, which will
+        automatically be converted to the right path delimiter on Windows.
+      link_path: The location for the link to create. If omitted it will be the
+        same path as source_path.
+      check_fn: A function returning true if the type of filesystem object is
+        correct for the attempted call. Otherwise an error message with
+        check_msg will be printed.
+      check_msg: String used to inform the user of an invalid attempt to create
+        a file.
+    Returns:
+      A list of Action objects.
+    """
+    def fix_separators(path):
+      if sys.platform.startswith('win'):
+        return path.replace(os.altsep, os.sep)
+      else:
+        return path
+
+    assert check_fn
+    assert check_msg
+    link_path = link_path or source_path
+    link_path = fix_separators(link_path)
+
+    source_path = fix_separators(source_path)
+    source_path = os.path.join(CHROMIUM_CHECKOUT, source_path)
+    if os.path.exists(source_path) and not check_fn(source_path):
+      raise LinkError('_LinkChromiumPath can only be used to link to %s: '
+                      'Tried to link to: %s' % (check_msg, source_path))
+
+    if not os.path.exists(source_path):
+      logging.debug('Silently ignoring missing source: %s. This is to avoid '
+                    'errors on platform-specific dependencies.', source_path)
+      return []
+
+    actions = []
+
+    if os.path.exists(link_path) or os.path.islink(link_path):
+      if os.path.islink(link_path):
+        actions.append(Remove(link_path, dangerous=False))
+      elif os.path.isfile(link_path):
+        actions.append(Remove(link_path, dangerous=True))
+      elif os.path.isdir(link_path):
+        actions.append(Rmtree(link_path))
+      else:
+        raise LinkError('Don\'t know how to plan: %s' % link_path)
+
+    # Create parent directories to the target link if needed.
+    target_parent_dirs = os.path.dirname(link_path)
+    if (target_parent_dirs and
+        target_parent_dirs != link_path and
+        not os.path.exists(target_parent_dirs)):
+      actions.append(Makedirs(target_parent_dirs))
+
+    actions.append(Symlink(source_path, link_path))
+
+    return actions
+
+
+def _initialize_database(filename):
+  links_database = shelve.open(filename)
+
+  # Wipe the database if this version of the script ends up looking at a
+  # newer (future) version of the links db, just to be sure.
+  version = links_database.get('SCHEMA_VERSION')
+  if version and version != SCHEMA_VERSION:
+    logging.info('Found database with schema version %s while this script only '
+                 'supports %s. Wiping previous database contents.', version,
+                 SCHEMA_VERSION)
+    links_database.clear()
+  links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
+  return links_database
+
+
+def main():
+  on_bot = os.environ.get('CHROME_HEADLESS') == '1'
+
+  parser = optparse.OptionParser()
+  parser.add_option('-d', '--dry-run', action='store_true', default=False,
+                    help='Print what would be done, but don\'t perform any '
+                         'operations. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-c', '--clean-only', action='store_true', default=False,
+                    help='Only clean previously created links, don\'t create '
+                         'new ones. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-f', '--force', action='store_true', default=on_bot,
+                    help='Force link creation. CAUTION: This deletes existing '
+                         'folders and files in the locations where links are '
+                         'about to be created.')
+  parser.add_option('-n', '--no-prompt', action='store_false', dest='prompt',
+                    default=(not on_bot),
+                    help='Prompt if we\'re planning to do a dangerous action')
+  parser.add_option('-v', '--verbose', action='store_const',
+                    const=logging.DEBUG, default=logging.INFO,
+                    help='Print verbose output for debugging.')
+  options, _ = parser.parse_args()
+
+  if options.dry_run or options.force or options.clean_only:
+    options.verbose = logging.DEBUG
+  logging.basicConfig(format='%(message)s', level=options.verbose)
+
+  # Work from the root directory of the checkout.
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  os.chdir(script_dir)
+
+  if sys.platform.startswith('win'):
+    def is_admin():
+      try:
+        return os.getuid() == 0
+      except AttributeError:
+        return ctypes.windll.shell32.IsUserAnAdmin() != 0
+    if not is_admin():
+      logging.error('On Windows, you now need to have administrator '
+                    'privileges for the shell running %s (or '
+                    '`gclient sync|runhooks`).\nPlease start another command '
+                    'prompt as Administrator and try again.' % sys.argv[0])
+      return 1
+
+  if not os.path.exists(CHROMIUM_CHECKOUT):
+    logging.error('Cannot find a Chromium checkout at %s. Did you run "gclient '
+                  'sync" before running this script?', CHROMIUM_CHECKOUT)
+    return 2
+
+  links_database = _initialize_database(LINKS_DB)
+  try:
+    symlink_creator = LibyuvLinkSetup(links_database, options.force,
+                                      options.dry_run, options.prompt)
+    symlink_creator.CleanupLinks()
+    if not options.clean_only:
+      symlink_creator.CreateLinks(on_bot)
+  except LinkError as e:
+    print >> sys.stderr, e.message
+    return 3
+  finally:
+    links_database.close()
+
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/source/compare.cc b/source/compare.cc
index 9ea81b4..255e772 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -80,7 +80,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 #endif
@@ -114,8 +114,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     // Note only used for multiples of 16 so count is not checked.
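    // The IS_ALIGNED(src_a, 16) / IS_ALIGNED(src_b, 16) guards could be
    // dropped here because the SSE2 kernels now use movdqu (unaligned)
    // instead of movdqa (aligned) loads -- see the compare_posix.cc and
    // compare_win.cc hunks just below. The same relaxation is applied
    // throughout convert.cc further down in this diff.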
SumSquareError = SumSquareError_SSE2; } diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 5e7b8e4..ef006ec 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -16,7 +16,8 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { volatile uint32 sse; @@ -56,7 +57,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { return sse; } -#endif // __ARM_NEON__ +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc new file mode 100644 index 0000000..cc078f8 --- /dev/null +++ b/source/compare_neon64.cc @@ -0,0 +1,63 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %2, %2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/compare_posix.cc b/source/compare_posix.cc index ac36119..64dfc35 100644 --- a/source/compare_posix.cc +++ b/source/compare_posix.cc @@ -25,9 +25,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "lea " MEMLEA(0x10, 0) ",%0 \n" - "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "lea " MEMLEA(0x10, 1) ",%1 \n" "sub $0x10,%2 \n" "movdqa %%xmm1,%%xmm3 \n" diff --git a/source/compare_win.cc b/source/compare_win.cc index 9983165..50d4d34 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -29,9 +29,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { align 4 wloop: - movdqa xmm1, [eax] + movdqu xmm1, [eax] lea eax, [eax + 16] - movdqa xmm2, [edx] + movdqu xmm2, [edx] lea edx, [edx + 16] sub ecx, 16 movdqa xmm3, xmm1 // abs trick diff --git 
a/source/convert.cc b/source/convert.cc index 874a6cb..9582b53 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -194,13 +194,15 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src, 16) && - IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; @@ -286,12 +288,7 @@ static int X420ToI420(const uint8* src_y, if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } + SplitUVRow = SplitUVRow_SSE2; } } #endif @@ -312,15 +309,13 @@ static int X420ToI420(const uint8* src_y, } #endif #if defined(HAS_SPLITUVROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16 && + IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && + IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && + IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2; - if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && - IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && - IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_MIPS_DSPR2; - } + SplitUVRow = SplitUVRow_MIPS_DSPR2; } } #endif @@ -401,7 +396,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height) { int y; - int halfheight = (height + 1) >> 1; + int halfheight; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUV422Row_C; @@ -435,12 +430,15 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; @@ -457,14 +455,8 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = 
YUY2ToYRow_SSE2; } } #endif @@ -533,14 +525,8 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUVRow = YUY2ToUVRow_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } + YUY2ToUVRow = YUY2ToUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -606,14 +592,8 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, UYVYToUVRow = UYVYToUVRow_Any_SSE2; UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2; - UYVYToYRow = UYVYToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { - UYVYToUVRow = UYVYToUVRow_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - UYVYToYRow = UYVYToYRow_SSE2; - } - } + UYVYToUVRow = UYVYToUVRow_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -684,14 +664,8 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -711,11 +685,13 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } - if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; } } #endif @@ -759,34 +735,31 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } -#if defined(HAS_BGRATOYROW_SSSE3) +#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { BGRAToUVRow = BGRAToUVRow_Any_SSSE3; BGRAToYRow = BGRAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3; - BGRAToYRow = BGRAToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - BGRAToYRow = BGRAToYRow_SSSE3; - } - } + BGRAToUVRow = BGRAToUVRow_SSSE3; + BGRAToYRow = BGRAToYRow_SSSE3; } } -#elif defined(HAS_BGRATOYROW_NEON) +#endif +#if defined(HAS_BGRATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { BGRAToYRow = BGRAToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { BGRAToYRow = BGRAToYRow_NEON; } - if (width >= 16) { + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { BGRAToUVRow = BGRAToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { BGRAToUVRow = BGRAToUVRow_NEON; } } - } #endif for (y = 0; y < height - 1; y += 2) { @@ -828,32 +801,29 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, src_abgr = src_abgr + (height - 1) 
@@ -828,32 +801,29 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
     src_abgr = src_abgr + (height - 1) * src_stride_abgr;
     src_stride_abgr = -src_stride_abgr;
   }
-#if defined(HAS_ABGRTOYROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
     ABGRToYRow = ABGRToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
-      ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
-        ABGRToUVRow = ABGRToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ABGRToYRow = ABGRToYRow_SSSE3;
-        }
-      }
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ABGRTOYROW_NEON)
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ABGRToYRow = ABGRToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ABGRToYRow = ABGRToYRow_NEON;
     }
-    if (width >= 16) {
-      ABGRToUVRow = ABGRToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ABGRToUVRow = ABGRToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
     }
   }
 #endif
@@ -897,32 +867,29 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
     src_rgba = src_rgba + (height - 1) * src_stride_rgba;
     src_stride_rgba = -src_stride_rgba;
   }
-#if defined(HAS_RGBATOYROW_SSSE3)
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
     RGBAToYRow = RGBAToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
-      RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
-        RGBAToUVRow = RGBAToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          RGBAToYRow = RGBAToYRow_SSSE3;
-        }
-      }
+      RGBAToUVRow = RGBAToUVRow_SSSE3;
+      RGBAToYRow = RGBAToYRow_SSSE3;
     }
   }
-#elif defined(HAS_RGBATOYROW_NEON)
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     RGBAToYRow = RGBAToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RGBAToYRow = RGBAToYRow_NEON;
     }
-    if (width >= 16) {
-      RGBAToUVRow = RGBAToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        RGBAToUVRow = RGBAToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_NEON;
    }
   }
 #endif
@@ -963,9 +930,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
 
   if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -984,15 +948,16 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
     if (IS_ALIGNED(width, 8)) {
       RGB24ToYRow = RGB24ToYRow_NEON;
     }
-    if (width >= 16) {
-      RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        RGB24ToUVRow = RGB24ToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_RGB24TOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToUVRow = RGB24ToUVRow_NEON;
     }
   }
-#else  // HAS_RGB24TOYROW_NEON
-
+#endif
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -1013,45 +978,49 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif  // HAS_ARGBTOUVROW_SSSE3
-#endif  // HAS_RGB24TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
+  {
+#if !defined(HAS_RGB24TOYROW_NEON)
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
-    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
 #else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-    src_rgb24 += src_stride_rgb24 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
 #else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_RGB24TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
 #endif
+  }
   return 0;
 }
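
[Editor's note] Besides the dispatch change, RGB24ToI420 above moves the scratch-row allocation out of the declaration block at the top of the function and into a brace-scoped block placed after the argument checks, so nothing is allocated on the early-return failure path; the same restructure is applied to RAWToI420, RGB565ToI420, ARGB1555ToI420 and ARGB4444ToI420 below. A minimal sketch of the pattern; the two macros here are simplified stand-ins for libyuv's align_buffer_64/free_aligned_buffer_64:

#include <stdint.h>
#include <stdlib.h>

// Simplified: allocate size + 63 and round the pointer up to 64 bytes.
#define align_buffer_64(var, size)                             \
  uint8_t* var##_mem = (uint8_t*)malloc((size) + 63);          \
  uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63)
#define free_aligned_buffer_64(var) free(var##_mem)

int ConvertSketch(const uint8_t* src, uint8_t* dst, int width, int height) {
  if (!src || !dst || width <= 0 || height == 0) {
    return -1;  // nothing has been allocated on this failure path
  }
  {
    // 2 rows of ARGB scratch; row size rounded up to 16 as in the patch.
    const int kRowSize = (width * 4 + 15) & ~15;
    align_buffer_64(row, kRowSize * 2);
    (void)row;  // row-by-row conversion work would go here
    free_aligned_buffer_64(row);
  }
  return 0;
}
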
@@ -1075,9 +1044,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
 
   if (!src_raw || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1096,15 +1062,16 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
     if (IS_ALIGNED(width, 8)) {
       RAWToYRow = RAWToYRow_NEON;
     }
-    if (width >= 16) {
-      RAWToUVRow = RAWToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        RAWToUVRow = RAWToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_RAWTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    RAWToUVRow = RAWToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToUVRow = RAWToUVRow_NEON;
     }
   }
-#else  // HAS_RAWTOYROW_NEON
-
+#endif
 #if defined(HAS_RAWTOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -1125,45 +1092,47 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif  // HAS_ARGBTOUVROW_SSSE3
-#endif  // HAS_RAWTOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
-    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
-    RAWToARGBRow(src_raw, row, width);
-    RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_raw += src_stride_raw * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
-#else
-    RAWToARGBRow(src_raw, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+
+    for (y = 0; y < height - 1; y += 2) {
+  #if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+  #else
+      RAWToARGBRow(src_raw, row, width);
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+  #endif
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+  #if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+  #else
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+  #endif
+    }
+  #if !defined(HAS_RAWTOYROW_NEON)
+    free_aligned_buffer_64(row);
+  #endif
   }
-#if !defined(HAS_RAWTOYROW_NEON)
-  free_aligned_buffer_64(row);
-#endif
   return 0;
 }
@@ -1187,9 +1156,6 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
 
   if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1237,45 +1203,50 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RGB565TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
+  {
+#if !defined(HAS_RGB565TOYROW_NEON)
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
-    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
 #else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      RGB565ToARGBRow(src_rgb565, row, width);
+      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-    src_rgb565 += src_stride_rgb565 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      src_rgb565 += src_stride_rgb565 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
 #else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      RGB565ToARGBRow(src_rgb565, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_RGB565TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
 #endif
+  }
   return 0;
 }
@@ -1299,9 +1270,6 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
 
   if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1349,47 +1317,51 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB1555TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
+  {
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
-    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
-                   width);
+      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                     width);
 #else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_argb1555 += src_stride_argb1555 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb1555 += src_stride_argb1555 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
 #else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_ARGB1555TOYROW_NEON)
   free_aligned_buffer_64(row);
 #endif
+  }
   return 0;
 }
@@ -1413,9 +1385,6 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
 
   if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1463,47 +1432,52 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB4444TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
+  {
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
-    ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
-                   width);
+      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                     width);
 #else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_argb4444 += src_stride_argb4444 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb4444 += src_stride_argb4444 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
 #else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_ARGB4444TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
 #endif
+  }
   return 0;
 }
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index ac0bc3d..51e7438 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -82,13 +82,11 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I444ToARGBRow = I444ToARGBRow_SSSE3;
-      }
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_I444TOARGBROW_NEON)
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I444ToARGBRow = I444ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -144,10 +142,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
@@ -224,13 +219,11 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I411ToARGBRow = I411ToARGBRow_SSSE3;
-      }
+      I411ToARGBRow = I411ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_I411TOARGBROW_NEON)
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I411ToARGBRow = I411ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -276,14 +269,14 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
     src_stride_y = dst_stride_argb = 0;
   }
 #if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     YToARGBRow = YToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       YToARGBRow = YToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_YTOARGBROW_NEON)
+#endif
+#if defined(HAS_YTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     YToARGBRow = YToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -329,13 +322,11 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     I400ToARGBRow = I400ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
-      I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I400ToARGBRow = I400ToARGBRow_SSE2;
-      }
+      I400ToARGBRow = I400ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_I400TOARGBROW_NEON)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I400ToARGBRow = I400ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -447,14 +438,14 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
     src_stride_rgb24 = dst_stride_argb = 0;
   }
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_RGB24TOARGBROW_NEON)
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -497,14 +488,14 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
     src_stride_raw = dst_stride_argb = 0;
   }
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       RAWToARGBRow = RAWToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_RAWTOARGBROW_NEON)
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     RAWToARGBRow = RAWToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
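
[Editor's note] Throughout this patch, "#elif defined(HAS_...)" chains become independent "#endif / #if defined(HAS_...)" blocks, so every compiled-in SIMD candidate is tested at run time on its own instead of being discarded by the preprocessor whenever an earlier HAS_ macro is defined; a later block simply overrides an earlier pick. A self-contained illustration with hypothetical HAS_ROW_* macros and a trivial TestCpuFlag stand-in:

#include <stdio.h>

#define HAS_ROW_SSSE3 1
#define HAS_ROW_AVX2 1

enum { kCpuHasSSSE3 = 1, kCpuHasAVX2 = 2 };
static int TestCpuFlag(int flag) { (void)flag; return 1; }  // stand-in

int main() {
  const char* row = "Row_C";
  int width = 32;
#if defined(HAS_ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) row = "Row_SSSE3";
#endif
#if defined(HAS_ROW_AVX2)  // with #elif this block would not even compile
  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) row = "Row_AVX2";
#endif                     // when HAS_ROW_SSSE3 is defined
  printf("%s\n", row);     // prints Row_AVX2: the later block wins
  return 0;
}
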
@@ -547,14 +538,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
     src_stride_rgb565 = dst_stride_argb = 0;
   }
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_RGB565TOARGBROW_NEON)
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -597,14 +588,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
     src_stride_argb1555 = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_ARGB1555TOARGBROW_NEON)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -647,14 +638,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
     src_stride_argb4444 = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
     ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_ARGB4444TOARGBROW_NEON)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -696,13 +687,11 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_NV12TOARGBROW_NEON)
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -747,10 +736,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV21ToARGBRow = NV21ToARGBRow_SSSE3;
-      }
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
     }
   }
 #endif
@@ -798,13 +784,11 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_NV12TOARGBROW_NEON)
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -856,14 +840,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
-      }
+      YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_YUY2TOARGBROW_NEON)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -909,14 +890,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        UYVYToARGBRow = UYVYToARGBRow_SSSE3;
-      }
+      UYVYToARGBRow = UYVYToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_UYVYTOARGBROW_NEON)
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
diff --git a/source/convert_from.cc b/source/convert_from.cc
index c1a2f62..e044e03 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -180,7 +180,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
     }
   }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
@@ -226,7 +227,8 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
     }
   }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
@@ -286,7 +288,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
     }
   }
-#elif defined(HAS_I422TOUYVYROW_NEON)
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
@@ -332,7 +335,8 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
     }
   }
-#elif defined(HAS_I422TOUYVYROW_NEON)
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
@@ -400,12 +404,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
     }
   }
 #endif
@@ -479,10 +478,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
@@ -551,20 +547,27 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I422ToBGRARow = I422ToBGRARow_SSSE3;
-      }
+      I422ToBGRARow = I422ToBGRARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToBGRARow = I422ToBGRARow_AVX2;
     }
   }
-#elif defined(HAS_I422TOBGRAROW_NEON)
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToBGRARow = I422ToBGRARow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToBGRARow = I422ToBGRARow_NEON;
     }
   }
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
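
[Editor's note] I420ToBGRA above is the one place in this section that gains a new AVX2 tier on top of the SSSE3 and NEON paths. The AVX2 kernel covers 16 pixels per iteration, hence the width >= 16 gate and the IS_ALIGNED(width, 16) test for the exact variant. A sketch of that width gating; the kernel names are returned as strings purely for illustration, and only the thresholds are taken from the hunk:

#include <stdio.h>

enum { kPixelsPerIterSSSE3 = 8, kPixelsPerIterAVX2 = 16 };

static const char* PickI422ToBGRA(int width, bool has_ssse3, bool has_avx2) {
  const char* fn = "I422ToBGRARow_C";
  if (has_ssse3 && width >= kPixelsPerIterSSSE3) {
    fn = (width % kPixelsPerIterSSSE3) ? "I422ToBGRARow_Any_SSSE3"
                                       : "I422ToBGRARow_SSSE3";
  }
  if (has_avx2 && width >= kPixelsPerIterAVX2) {  // overrides the SSSE3 pick
    fn = (width % kPixelsPerIterAVX2) ? "I422ToBGRARow_Any_AVX2"
                                      : "I422ToBGRARow_AVX2";
  }
  return fn;
}

int main() {
  printf("%s\n", PickI422ToBGRA(1280, true, true));  // I422ToBGRARow_AVX2
  printf("%s\n", PickI422ToBGRA(1284, true, true));  // I422ToBGRARow_Any_AVX2
  return 0;
}
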
@@ -613,13 +616,11 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I422ToABGRRow = I422ToABGRRow_SSSE3;
-      }
+      I422ToABGRRow = I422ToABGRRow_SSSE3;
     }
   }
-#elif defined(HAS_I422TOABGRROW_NEON)
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToABGRRow = I422ToABGRRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -667,13 +668,11 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-        I422ToRGBARow = I422ToRGBARow_SSSE3;
-      }
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
     }
   }
-#elif defined(HAS_I422TORGBAROW_NEON)
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToRGBARow = I422ToRGBARow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -724,7 +723,8 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
       I422ToRGB24Row = I422ToRGB24Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TORGB24ROW_NEON)
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -775,7 +775,8 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
       I422ToRAWRow = I422ToRAWRow_SSSE3;
     }
   }
-#elif defined(HAS_I422TORAWROW_NEON)
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToRAWRow = I422ToRAWRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -826,7 +827,8 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
       I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TOARGB1555ROW_NEON)
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -878,7 +880,8 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
       I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TOARGB4444ROW_NEON)
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -929,7 +932,8 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
       I422ToRGB565Row = I422ToRGB565Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TORGB565ROW_NEON)
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 121a416..1e465ab 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -54,32 +54,32 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-      }
+      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_NEON;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
-    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
-      ARGBToUV444Row = ARGBToUV444Row_NEON;
     }
   }
 #endif
@@ -128,10 +128,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
     }
   }
 #endif
@@ -140,25 +145,16 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -206,11 +202,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
@@ -228,11 +220,13 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 32) {
-      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-      if (IS_ALIGNED(width, 32)) {
-        ARGBToUV411Row = ARGBToUV411Row_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
+    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV411Row = ARGBToUV411Row_NEON;
     }
   }
 #endif
@@ -261,9 +255,6 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
   if (!src_argb || !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -280,27 +271,24 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -308,10 +296,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
     }
   }
 #endif
@@ -331,22 +316,27 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+    uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
-  }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
@@ -364,9 +354,6 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
   if (!src_argb || !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -383,27 +370,24 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -411,10 +395,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
     }
   }
 #endif
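
[Editor's note] ARGBToNV12/ARGBToNV21 first compute planar U and V rows and then interleave them into the packed UV plane of NV12/NV21; NV21 is produced simply by swapping the operands (MergeUVRow_(row_v, row_u, ...) in the hunk that follows). A scalar sketch of what the C fallback MergeUVRow_C does:

#include <stdint.h>
#include <stdio.h>

static void MergeUVRow(const uint8_t* u, const uint8_t* v,
                       uint8_t* uv, int width) {
  for (int i = 0; i < width; ++i) {
    uv[2 * i + 0] = u[i];  // NV12: U first...
    uv[2 * i + 1] = v[i];  // ...then V; NV21 passes u/v swapped
  }
}

int main() {
  const uint8_t u[4] = {1, 3, 5, 7};
  const uint8_t v[4] = {2, 4, 6, 8};
  uint8_t uv[8];
  MergeUVRow(u, v, uv, 4);
  for (int i = 0; i < 8; ++i) printf("%d ", uv[i]);  // 1 2 3 4 5 6 7 8
  printf("\n");
  return 0;
}
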
@@ -434,22 +415,27 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+    uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
-  }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
@@ -487,10 +473,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
     }
   }
 #endif
@@ -498,24 +489,16 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -526,7 +509,8 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
     }
   }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
@@ -588,10 +572,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
     }
   }
 #endif
@@ -599,24 +588,16 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -627,7 +608,8 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
     }
   }
-#elif defined(HAS_I422TOUYVYROW_NEON)
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
@@ -682,11 +664,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
@@ -761,7 +739,8 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
       ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTORGB24ROW_NEON)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -808,7 +787,8 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
       ARGBToRAWRow = ARGBToRAWRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTORAWROW_NEON)
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -849,14 +829,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = dst_stride_rgb565 = 0;
   }
 #if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
     ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
     }
   }
-#elif defined(HAS_ARGBTORGB565ROW_NEON)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -897,14 +877,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = dst_stride_argb1555 = 0;
   }
 #if defined(HAS_ARGBTOARGB1555ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
     ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
     }
   }
-#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -945,14 +925,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = dst_stride_argb4444 = 0;
   }
 #if defined(HAS_ARGBTOARGB4444ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
     ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
     }
   }
-#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -997,14 +977,8 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
     ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVJRow = ARGBToUVJRow_SSSE3;
-        if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
-          ARGBToYJRow = ARGBToYJRow_SSSE3;
-        }
-      }
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
     }
   }
 #endif
@@ -1022,11 +996,13 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
     if (IS_ALIGNED(width, 8)) {
       ARGBToYJRow = ARGBToYJRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVJRow = ARGBToUVJRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
     }
   }
 #endif
@@ -1074,11 +1050,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
-        ARGBToYJRow = ARGBToYJRow_SSSE3;
-      }
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
     }
   }
 #endif
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index 2e0d61d..1efa265 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -14,8 +14,9 @@
 #include <intrin.h>  // For __cpuidex()
 #endif
 
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    !defined(__native_client__) && defined(_M_X64) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+    !defined(__native_client__) && \
+    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+    (defined(_M_IX86) || defined(_M_X64))
 #include <immintrin.h>  // For _xgetbv()
 #endif
@@ -51,7 +52,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
 #if defined(_MSC_VER) && !defined(__clang__)
 #if (_MSC_FULL_VER >= 160040219)
   __cpuidex((int*)(cpu_info), info_eax, info_ecx);
-#elif defined(_M_IX86)
+#endif
+#if defined(_M_IX86)
   __asm {
     mov eax, info_eax
     mov ecx, info_ecx
@@ -97,13 +99,15 @@ int TestOsSaveYmm() {
   uint32 xcr0 = 0u;
 #if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
   xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
-#elif defined(_M_IX86)
+#endif
+#if defined(_M_IX86) && defined(_MSC_VER)
   __asm {
     xor ecx, ecx  // xcr 0
    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
     mov xcr0, eax
   }
-#elif defined(__i386__) || defined(__x86_64__)
+#endif
+#if defined(__i386__) || defined(__x86_64__)
   asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
 #endif  // defined(_MSC_VER)
   return((xcr0 & 6) == 6);  // Is ymm saved?
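
[Editor's note] The TestOsSaveYmm() hunk above widens the _xgetbv() path to 32-bit MSVC builds while keeping the same test: XCR0 bit 1 (XMM state) and bit 2 (YMM state) must both be enabled by the OS before AVX code is safe to run. The bit test in isolation, with a hard-coded XCR0 value standing in for the xgetbv read:

#include <stdio.h>

static bool YmmStateSaved(unsigned xcr0) {
  return (xcr0 & 6) == 6;  // bits 1 and 2: XMM and YMM state OS-saved
}

int main() {
  printf("%d\n", YmmStateSaved(0x7));  // 1: x87+SSE+AVX state enabled
  printf("%d\n", YmmStateSaved(0x1));  // 0: x87 only, AVX unsafe
  return 0;
}
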
@@ -134,6 +138,12 @@ int ArmCpuCaps(const char* cpuinfo_name) {
         fclose(f);
         return kCpuHasNEON;
       }
+      // aarch64 uses asimd for Neon.
+      p = strstr(cpuinfo_line, " asimd");
+      if (p && (p[6] == ' ' || p[6] == '\n')) {
+        fclose(f);
+        return kCpuHasNEON;
+      }
     }
   }
   fclose(f);
@@ -239,7 +249,8 @@ int InitCpuFlags(void) {
     if (TestEnv("LIBYUV_DISABLE_FMA3")) {
       cpu_info_ &= ~kCpuHasFMA3;
     }
-#elif defined(__mips__) && defined(__linux__)
+#endif
+#if defined(__mips__) && defined(__linux__)
     // Linux mips parse text file for dsp detect.
     cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
 #if defined(__mips_dspr2)
@@ -256,12 +267,19 @@ int InitCpuFlags(void) {
     if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
      cpu_info_ &= ~kCpuHasMIPS_DSPR2;
     }
-#elif defined(__arm__)
+#endif
+#if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
 // __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
 // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
 #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
     cpu_info_ = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+#if defined(__aarch64__)
+    cpu_info_ = kCpuHasNEON;
 #else
     // Linux arm parse text file for neon detect.
     cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index a3daf96..21d224f 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -77,14 +77,14 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
 #if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
     ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       ARGBToBayerRow = ARGBToBayerRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
+#endif
+#if defined(HAS_ARGBTOBAYERROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -319,24 +319,24 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
       ARGBToUVRow = ARGBToUVRow_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -460,7 +460,8 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
       ARGBToBayerRow = ARGBToBayerRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
+#endif
+#if defined(HAS_ARGBTOBAYERROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
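
[Editor's note] The cpu_id.cc hunks earlier in this patch make NEON detection work on aarch64: 64-bit kernels report Advanced SIMD in /proc/cpuinfo as "asimd" rather than "neon", and InitCpuFlags() additionally hard-codes NEON for __aarch64__ builds since the feature list there can be incomplete. A self-contained sketch of the substring test applied to one Features line (the real ArmCpuCaps() scans the whole file line by line):

#include <stdio.h>
#include <string.h>

static int LineHasNeon(const char* line) {
  const char* p = strstr(line, " neon");
  if (p && (p[5] == ' ' || p[5] == '\n')) return 1;
  p = strstr(line, " asimd");  // aarch64 spelling of Advanced SIMD
  return p && (p[6] == ' ' || p[6] == '\n');
}

int main() {
  printf("%d\n", LineHasNeon("Features : fp asimd evtstrm aes\n"));  // 1
  printf("%d\n", LineHasNeon("Features : half thumb fastmult\n"));   // 0
  return 0;
}
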
15b0ed8..36028c3 100644 --- a/source/mjpeg_decoder.cc +++ b/source/mjpeg_decoder.cc @@ -13,8 +13,8 @@ #ifdef HAVE_JPEG #include <assert.h> -#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\ - !defined(TARGET_IPHONE_SIMULATOR) +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) // Must be included before jpeglib. #include <setjmp.h> #define HAVE_SETJMP @@ -101,7 +101,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { } buf_.data = src; - buf_.len = (int)(src_len); + buf_.len = static_cast<int>(src_len); buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP @@ -411,7 +411,7 @@ void init_source(j_decompress_ptr cinfo) { } boolean fill_input_buffer(j_decompress_ptr cinfo) { - BufferVector* buf_vec = (BufferVector*)(cinfo->client_data); + BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data); if (buf_vec->pos >= buf_vec->len) { assert(0 && "No more data"); // ERROR: No more data @@ -447,7 +447,7 @@ void ErrorHandler(j_common_ptr cinfo) { // ERROR: Error in jpeglib: buf #endif - SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err); + SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); // This rewinds the call stack to the point of the corresponding setjmp() // and causes it to return (for a second time) with value 1. longjmp(mgr->setjmp_buffer, 1); diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 3857008..b21192b 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -47,12 +47,15 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; @@ -96,9 +99,7 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, } #endif #if defined(HAS_COPYROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_16_SSE2; } #endif @@ -249,9 +250,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { MirrorRow = MirrorRow_SSSE3; } #endif @@ -302,14 +301,8 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } 
} #endif @@ -380,14 +373,8 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; - UYVYToYRow = UYVYToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - UYVYToYRow = UYVYToYRow_SSE2; - } - } + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -499,9 +486,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, } #if defined(HAS_ARGBMIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { ARGBMirrorRow = ARGBMirrorRow_SSSE3; } #endif @@ -808,24 +793,31 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; } -#if defined(HAS_I422TOBGRAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToBGRARow = I422ToBGRARow_Any_NEON; +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToBGRARow = I422ToBGRARow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I422ToBGRARow = I422ToBGRARow_NEON; + I422ToBGRARow = I422ToBGRARow_AVX2; } } -#elif defined(HAS_I422TOBGRAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I422ToBGRARow = I422ToBGRARow_Any_SSSE3; +#endif +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { - I422ToBGRARow = I422ToBGRARow_SSSE3; - } + I422ToBGRARow = I422ToBGRARow_NEON; } } -#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) +#endif +#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && @@ -885,14 +877,12 @@ int I422ToABGR(const uint8* src_y, int src_stride_y, I422ToABGRRow = I422ToABGRRow_NEON; } } -#elif defined(HAS_I422TOABGRROW_SSSE3) +#endif +#if defined(HAS_I422TOABGRROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToABGRRow = I422ToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { - I422ToABGRRow = I422ToABGRRow_SSSE3; - } + I422ToABGRRow = I422ToABGRRow_SSSE3; } } #endif @@ -947,14 +937,12 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, I422ToRGBARow = I422ToRGBARow_NEON; } } -#elif defined(HAS_I422TORGBAROW_SSSE3) +#endif +#if defined(HAS_I422TORGBAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } + I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif @@ -997,7 +985,8 @@ 
int NV12ToRGB565(const uint8* src_y, int src_stride_y, NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; } } -#elif defined(HAS_NV12TORGB565ROW_NEON) +#endif +#if defined(HAS_NV12TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { @@ -1045,7 +1034,8 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; } } -#elif defined(HAS_NV21TORGB565ROW_NEON) +#endif +#if defined(HAS_NV21TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { @@ -1079,9 +1069,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y, dst_stride_y = 0; } #if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { SetRow = SetRow_NEON; } #endif @@ -1145,8 +1133,7 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); return 0; } @@ -1197,9 +1184,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSE2; @@ -1312,12 +1297,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } -#elif defined(HAS_ARGBGRAYROW_NEON) +#endif +#if defined(HAS_ARGBGRAYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_NEON; } @@ -1350,11 +1334,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } -#elif defined(HAS_ARGBGRAYROW_NEON) +#endif +#if defined(HAS_ARGBGRAYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_NEON; } @@ -1383,11 +1367,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBSEPIAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_SSSE3; } -#elif defined(HAS_ARGBSEPIAROW_NEON) +#endif +#if defined(HAS_ARGBSEPIAROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_NEON; } @@ -1425,11 +1409,11 @@ int 
ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; } -#elif defined(HAS_ARGBCOLORMATRIXROW_NEON) +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } @@ -1568,11 +1552,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBQUANTIZEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBQuantizeRow = ARGBQuantizeRow_SSE2; } -#elif defined(HAS_ARGBQUANTIZEROW_NEON) +#endif +#if defined(HAS_ARGBQUANTIZEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_NEON; } @@ -1743,12 +1727,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBSHADEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBShadeRow = ARGBShadeRow_SSE2; } -#elif defined(HAS_ARGBSHADEROW_NEON) +#endif +#if defined(HAS_ARGBSHADEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_NEON; } @@ -1793,12 +1776,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && - IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif @@ -1806,12 +1784,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && - IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1887,11 +1860,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; - } + ARGBShuffleRow = ARGBShuffleRow_SSSE3; } } #endif @@ -1947,8 +1916,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } // ARGBToBayer used to select G channel from ARGB. 
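+// In scalar terms, with libyuv's little-endian ARGB layout (bytes B,G,R,A
+// in memory), selecting G amounts to the following sketch; the real row
+// kernels take a packed byte-index selector argument instead:
+//   for (int x = 0; x < width; ++x) {
+//     dst_g[x] = src_argb[x * 4 + 1];  // byte 1 of each pixel is G
+//   }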
#if defined(HAS_ARGBTOBAYERGGROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerGGRow_SSE2; @@ -1956,8 +1924,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; @@ -2048,8 +2015,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelRow = SobelRow_SSE2; } #endif @@ -2070,8 +2036,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelToPlaneRow = SobelToPlaneRow_SSE2; } #endif @@ -2093,8 +2058,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelXYRow = SobelXYRow_SSE2; } #endif @@ -2218,10 +2182,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } #endif @@ -2264,10 +2225,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } #endif diff --git a/source/rotate.cc b/source/rotate.cc index 2ef3228..8218609 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -42,11 +42,7 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_MIRRORROW_NEON -void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -#define HAS_MIRRORROW_UV_NEON -void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || 
defined(__aarch64__)) #define HAS_TRANSPOSE_WX8_NEON void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); @@ -55,7 +51,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width); -#endif // defined(__ARM_NEON__) +#endif #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ defined(__mips__) && \ @@ -194,31 +190,31 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, convertloop: // Read in the data from the source pointer. // First round of bit swap. - movdqa xmm0, [eax] - movdqa xmm1, [eax + edi] + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm0 // use xmm7 as temp register. punpcklbw xmm0, xmm1 punpckhbw xmm7, xmm1 movdqa xmm1, xmm7 - movdqa xmm2, [eax] - movdqa xmm3, [eax + edi] + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm2 punpcklbw xmm2, xmm3 punpckhbw xmm7, xmm3 movdqa xmm3, xmm7 - movdqa xmm4, [eax] - movdqa xmm5, [eax + edi] + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm4 punpcklbw xmm4, xmm5 punpckhbw xmm7, xmm5 movdqa xmm5, xmm7 - movdqa xmm6, [eax] - movdqa xmm7, [eax + edi] + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] lea eax, [eax + 2 * edi] - movdqa [esp], xmm5 // backup xmm5 + movdqu [esp], xmm5 // backup xmm5 neg edi movdqa xmm5, xmm6 // use xmm5 as temp register. punpcklbw xmm6, xmm7 @@ -239,8 +235,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, punpcklwd xmm4, xmm6 punpckhwd xmm5, xmm6 movdqa xmm6, xmm5 - movdqa xmm5, [esp] // restore xmm5 - movdqa [esp], xmm6 // backup xmm6 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 movdqa xmm6, xmm5 // use xmm6 as temp register. 
punpcklwd xmm5, xmm7 punpckhwd xmm6, xmm7 @@ -251,7 +247,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 movdqa xmm4, xmm6 - movdqa xmm6, [esp] // restore xmm6 + movdqu xmm6, [esp] // restore xmm6 movlpd qword ptr [edx], xmm0 movhpd qword ptr [ebx], xmm0 movlpd qword ptr [edx + esi], xmm4 @@ -296,7 +292,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ret } } -#elif !defined(LIBYUV_DISABLE_X86) && \ +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, @@ -411,31 +408,31 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "mov 0x2c(%ecx),%ecx \n" "1: \n" - "movdqa (%eax),%xmm0 \n" - "movdqa (%eax,%edi,1),%xmm1 \n" + "movdqu (%eax),%xmm0 \n" + "movdqu (%eax,%edi,1),%xmm1 \n" "lea (%eax,%edi,2),%eax \n" "movdqa %xmm0,%xmm7 \n" "punpcklbw %xmm1,%xmm0 \n" "punpckhbw %xmm1,%xmm7 \n" "movdqa %xmm7,%xmm1 \n" - "movdqa (%eax),%xmm2 \n" - "movdqa (%eax,%edi,1),%xmm3 \n" + "movdqu (%eax),%xmm2 \n" + "movdqu (%eax,%edi,1),%xmm3 \n" "lea (%eax,%edi,2),%eax \n" "movdqa %xmm2,%xmm7 \n" "punpcklbw %xmm3,%xmm2 \n" "punpckhbw %xmm3,%xmm7 \n" "movdqa %xmm7,%xmm3 \n" - "movdqa (%eax),%xmm4 \n" - "movdqa (%eax,%edi,1),%xmm5 \n" + "movdqu (%eax),%xmm4 \n" + "movdqu (%eax,%edi,1),%xmm5 \n" "lea (%eax,%edi,2),%eax \n" "movdqa %xmm4,%xmm7 \n" "punpcklbw %xmm5,%xmm4 \n" "punpckhbw %xmm5,%xmm7 \n" "movdqa %xmm7,%xmm5 \n" - "movdqa (%eax),%xmm6 \n" - "movdqa (%eax,%edi,1),%xmm7 \n" + "movdqu (%eax),%xmm6 \n" + "movdqu (%eax,%edi,1),%xmm7 \n" "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm5,(%esp) \n" + "movdqu %xmm5,(%esp) \n" "neg %edi \n" "movdqa %xmm6,%xmm5 \n" "punpcklbw %xmm7,%xmm6 \n" @@ -455,8 +452,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "punpcklwd %xmm6,%xmm4 \n" "punpckhwd %xmm6,%xmm5 \n" "movdqa %xmm5,%xmm6 \n" - "movdqa (%esp),%xmm5 \n" - "movdqa %xmm6,(%esp) \n" + "movdqu (%esp),%xmm5 \n" + "movdqu %xmm6,(%esp) \n" "movdqa %xmm5,%xmm6 \n" "punpcklwd %xmm7,%xmm5 \n" "punpckhwd %xmm7,%xmm6 \n" @@ -465,7 +462,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "punpckldq %xmm4,%xmm0 \n" "punpckhdq %xmm4,%xmm6 \n" "movdqa %xmm6,%xmm4 \n" - "movdqa (%esp),%xmm6 \n" + "movdqu (%esp),%xmm6 \n" "movlpd %xmm0,(%edx) \n" "movhpd %xmm0,(%ebx) \n" "movlpd %xmm4,(%edx,%esi,1) \n" @@ -514,7 +511,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "ret \n" #endif ); -#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ +#endif +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ defined(__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. #define HAS_TRANSPOSE_WX8_FAST_SSSE3 @@ -525,38 +523,38 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, // First round of bit swap. 
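+    // The loads in this routine were switched from movdqa to movdqu so the
+    // transpose no longer requires 16-byte-aligned sources. In intrinsic
+    // terms each load is now the equivalent of (sketch):
+    //   _mm_loadu_si128((const __m128i*)(p))  // tolerates any address
+    // rather than _mm_load_si128 (movdqa), which faults when the pointer
+    // is not 16-byte aligned.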
".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%3),%%xmm1 \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm0,%%xmm8 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa (%0),%%xmm2 \n" + "movdqu (%0),%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm8,%%xmm9 \n" "palignr $0x8,%%xmm1,%%xmm1 \n" "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqa (%0,%3),%%xmm3 \n" + "movdqu (%0,%3),%%xmm3 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm2,%%xmm10 \n" "punpcklbw %%xmm3,%%xmm2 \n" "punpckhbw %%xmm3,%%xmm10 \n" "movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm10,%%xmm11 \n" - "movdqa (%0),%%xmm4 \n" + "movdqu (%0),%%xmm4 \n" "palignr $0x8,%%xmm3,%%xmm3 \n" "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqa (%0,%3),%%xmm5 \n" + "movdqu (%0,%3),%%xmm5 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm4,%%xmm12 \n" "punpcklbw %%xmm5,%%xmm4 \n" "punpckhbw %%xmm5,%%xmm12 \n" "movdqa %%xmm4,%%xmm5 \n" "movdqa %%xmm12,%%xmm13 \n" - "movdqa (%0),%%xmm6 \n" + "movdqu (%0),%%xmm6 \n" "palignr $0x8,%%xmm5,%%xmm5 \n" "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqa (%0,%3),%%xmm7 \n" + "movdqu (%0,%3),%%xmm7 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm6,%%xmm14 \n" "punpcklbw %%xmm7,%%xmm6 \n" @@ -666,29 +664,29 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, // First round of bit swap. ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%4),%%xmm1 \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm0,%%xmm8 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm8 \n" "movdqa %%xmm8,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" - "movdqa (%0,%4),%%xmm3 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm2,%%xmm8 \n" "punpcklbw %%xmm3,%%xmm2 \n" "punpckhbw %%xmm3,%%xmm8 \n" "movdqa %%xmm8,%%xmm3 \n" - "movdqa (%0),%%xmm4 \n" - "movdqa (%0,%4),%%xmm5 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm4,%%xmm8 \n" "punpcklbw %%xmm5,%%xmm4 \n" "punpckhbw %%xmm5,%%xmm8 \n" "movdqa %%xmm8,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa (%0,%4),%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm6,%%xmm8 \n" "punpcklbw %%xmm7,%%xmm6 \n" @@ -818,9 +816,7 @@ void TransposePlane(const uint8* src, int src_stride, } #endif #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { TransposeWx8 = TransposeWx8_FAST_SSSE3; } #endif @@ -888,16 +884,12 @@ void RotatePlane180(const uint8* src, int src_stride, } #endif #if defined(HAS_MIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { MirrorRow = MirrorRow_SSE2; } #endif #if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { MirrorRow = MirrorRow_SSSE3; } #endif @@ -906,6 +898,7 @@ void RotatePlane180(const uint8* src, int src_stride, MirrorRow = MirrorRow_AVX2; } #endif +// TODO(fbarchard): Make MirrorRow on MIPS handle unaligned memory.
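+// For reference, the scalar mirror all of these row variants accelerate is
+// a plain horizontal byte reversal (a sketch of MirrorRow_C's effect):
+//   src += width - 1;
+//   for (int x = 0; x < width; ++x) {
+//     dst[x] = src[-x];  // read right-to-left, write left-to-right
+//   }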
#if defined(HAS_MIRRORROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && @@ -924,12 +917,15 @@ void RotatePlane180(const uint8* src, int src_stride, } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; @@ -1010,13 +1006,13 @@ void TransposeUV(const uint8* src, int src_stride, if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; } -#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_TRANSPOSE_UVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { TransposeUVWx8 = TransposeUVWx8_SSE2; } -#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) +#endif +#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; @@ -1084,12 +1080,13 @@ void RotateUV180(const uint8* src, int src_stride, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorRowUV = MirrorUVRow_NEON; } -#elif defined(HAS_MIRRORROW_UV_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { MirrorRowUV = MirrorUVRow_SSSE3; } -#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) +#endif +#if defined(HAS_MIRRORUVROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { MirrorRowUV = MirrorUVRow_MIPS_DSPR2; diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index ab0f9ce..b05977e 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -50,13 +50,12 @@ static void ARGBTranspose(const uint8* src, int src_stride, void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest. - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; } -#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest. - IS_ALIGNED(src, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest. 
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; } #endif @@ -103,9 +102,7 @@ void ARGBRotate180(const uint8* src, int src_stride, ARGBMirrorRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_ARGBMIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { ARGBMirrorRow = ARGBMirrorRow_SSSE3; } #endif @@ -130,12 +127,15 @@ void ARGBRotate180(const uint8* src, int src_stride, } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) { CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_AVX; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index d354e11..a23a40f 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -17,7 +17,8 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) static uvec8 kVTbl4x4Transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; @@ -525,7 +526,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" ); } -#endif +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc new file mode 100644 index 0000000..92358af --- /dev/null +++ b/source/rotate_neon64.cc @@ -0,0 +1,543 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus namespace libyuv { extern "C" { #endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allows for this + "sub %3, %3, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + "trn2 v16.8b, v0.8b, v1.8b \n" + "trn1 v17.8b, v0.8b, v1.8b \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "trn1 v19.8b, v2.8b, v3.8b \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "trn1 v21.8b, v4.8b, v5.8b \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "trn1 v23.8b, v6.8b, v7.8b \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "trn1 v1.4h, v17.4h, v19.4h \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "trn1 v0.4h, v16.4h, v18.4h \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "trn1 v5.4h, v21.4h, v23.4h \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v17.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v21.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v20.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v23.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v22.8b}, [%0] \n" + + "add %1, %1, #8 \n" // src += 8 + "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride + "subs %3, %3, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %3, %3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %3, #2 \n" + "b.lt 3f \n" + + "cmp %3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + MEMACCESS(4) + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
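+    // The two tbl instructions above apply kVTbl4x4Transpose
+    // {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15}: treating the 16 gathered
+    // bytes as a row-major 4x4 block, output byte 4*i+j is input byte
+    // 4*j+i, i.e. each 16-byte register is transposed as a 4x4 matrix.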
+ MEMACCESS(0) + "st1 {v3.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v3.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v3.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v3.s}[3], [%0] \n" + + "add %0, %2, #4 \n" + MEMACCESS(0) + "st1 {v0.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v0.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v0.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v0.s}[3], [%0] \n" + + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %3, %3, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %3, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.h}[3], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[3], [%0] \n" + + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v3.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v2.8b}, [%0] \n" + + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %3, %3, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "ld1 {v0.b}[0], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[1], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[2], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[3], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[4], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[5], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[6], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[7], [%1] \n" + + MEMACCESS(2) + "st1 {v0.8b}, [%2] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width64) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23" + ); +} + +static uint8 kVTbl4x4TransposeDi[32] = + { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allows for this + "sub %4, %4, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v20.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v20.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %4, %4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %4, %4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %4, #2 \n" + "b.lt 3f \n" + + "cmp %4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + MEMACCESS(8) + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + MEMACCESS(0) + "st1 {v18.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v17.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "st1 {v19.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %4, %4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v4.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v5.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %4, %4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[7], [%1] \n" + + MEMACCESS(2) + "st1 {v0.d}[0], 
[%2] \n" + MEMACCESS(3) + "st1 {v1.d}[0], [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width64) // %4 + : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 + "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v30", "v31" + ); +} +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_any.cc b/source/row_any.cc index 97ef844..b1ede4e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -35,19 +35,19 @@ extern "C" { } #ifdef HAS_I422TOARGBROW_SSSE3 -YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C, 1, 4, 7) #endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I444TOARGBROW_SSSE3 -YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C, 0, 4, 7) -YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C, 2, 4, 7) -YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C, 1, 4, 7) -YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C, 1, 4, 7) -YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C, 1, 4, 7) // I422ToRGB565Row_SSSE3 is unaligned. 
YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C, @@ -65,6 +65,9 @@ YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) #ifdef HAS_I422TOARGBROW_AVX2 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) #endif // HAS_I422TOARGBROW_AVX2 +#ifdef HAS_I422TOBGRAROW_AVX2 +YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15) +#endif // HAS_I422TOBGRAROW_AVX2 #ifdef HAS_I422TOARGBROW_NEON YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7) YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7) @@ -79,9 +82,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) +#endif // HAS_I422TOARGBROW_NEON +#ifdef HAS_I422TOYUY2ROW_NEON YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) +#endif // HAS_I422TOYUY2ROW_NEON +#ifdef HAS_I422TOUYVYROW_NEON YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) -#endif // HAS_I422TOARGBROW_NEON +#endif // HAS_I422TOUYVYROW_NEON #undef YANY // Wrappers to handle odd width @@ -98,9 +105,9 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) } #ifdef HAS_NV12TOARGBROW_SSSE3 -NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, +NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4) -NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, +NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4) #endif // HAS_NV12TOARGBROW_SSSE3 #ifdef HAS_NV12TOARGBROW_NEON @@ -141,15 +148,15 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, 3, 4, 2) #endif #if defined(HAS_I400TOARGBROW_SSE2) -RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, +RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 7, 1, 4) #endif #if defined(HAS_YTOARGBROW_SSE2) RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 7, 1, 4) -RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, +RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, 15, 2, 4) -RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C, +RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C, 15, 2, 4) // These require alignment on ARGB, so C is used for remainder. 
RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C, @@ -227,35 +234,67 @@ YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) #endif #ifdef HAS_ARGBTOYROW_SSSE3 -YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 4, 1, 16) #endif #ifdef HAS_BGRATOYROW_SSSE3 -YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16) -YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16) +YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 4, 1, 16) +YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 4, 1, 16) +YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 4, 1, 16) +YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 2, 1, 16) +YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 2, 1, 16) #endif #ifdef HAS_ARGBTOYJROW_SSSE3 -YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16) +YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 4, 1, 16) #endif #ifdef HAS_ARGBTOYROW_NEON YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_ARGBTOYJROW_NEON YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_BGRATOYROW_NEON YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_ABGRTOYROW_NEON YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_RGBATOYROW_NEON YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_RGB24TOYROW_NEON YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8) +#endif +#ifdef HAS_RAWTOYROW_NEON YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) +#endif +#ifdef HAS_RGB565TOYROW_NEON YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_YUY2TOYROW_NEON YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_UYVYTOYROW_NEON YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RAWTOARGBROW_NEON YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) #endif #undef YANY @@ -313,27 +352,50 @@ UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) #endif #ifdef HAS_ARGBTOUVROW_SSSE3 -UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15) -UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C, - 4, 15) -UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15) -UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15) -UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15) -UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15) -UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) 
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C, 4, 15) +UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15) #endif #ifdef HAS_ARGBTOUVROW_NEON UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_NEON UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_NEON UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_NEON UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGB24TOUVROW_NEON UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) #endif #undef UVANY @@ -350,7 +412,7 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) } #ifdef HAS_ARGBTOUV444ROW_SSSE3 -UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3, +UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, ARGBToUV444Row_C, 4, 15, 0) #endif #ifdef HAS_YUY2TOUV422ROW_AVX2 @@ -359,12 +421,14 @@ UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, UYVYToUV422Row_C, 2, 31, 1) #endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3, +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, ARGBToUV422Row_C, 4, 15, 1) -UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, +#endif +#ifdef HAS_YUY2TOUV422ROW_SSE2 +UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, YUY2ToUV422Row_C, 2, 15, 1) -UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, +UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, UYVYToUV422Row_C, 2, 15, 1) #endif #ifdef HAS_YUY2TOUV422ROW_NEON @@ -393,7 +457,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, } #ifdef HAS_SPLITUVROW_SSE2 -SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15) +SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15) #endif #ifdef HAS_SPLITUVROW_AVX2 SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) @@ -402,7 +466,7 @@ SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15) #endif #ifdef HAS_SPLITUVROW_MIPS_DSPR2 -SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, 
SplitUVRow_Unaligned_MIPS_DSPR2, +SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, SplitUVRow_C, 15) #endif #undef SPLITUVROWANY @@ -419,7 +483,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, } #ifdef HAS_MERGEUVROW_SSE2 -MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15) +MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31) @@ -490,7 +554,7 @@ YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, ARGBShuffleRow_C, 4, 4, 3) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 -YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, +YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, ARGBShuffleRow_C, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 @@ -521,11 +585,11 @@ NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 32) #endif #ifdef HAS_INTERPOLATEROW_SSSE3 -NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, +NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_SSE2 -NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2, +NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_NEON diff --git a/source/row_common.cc b/source/row_common.cc index fa2b752..afc74c0 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -964,7 +964,7 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, } #if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. // TODO(fbarchard): Remove subsampling from Neon. void I444ToARGBRow_C(const uint8* src_y, @@ -1885,17 +1885,17 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } } -// Blend 2 rows into 1 for conversions such as I422ToI420. -void HalfRow_C(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { +// Blend 2 rows into 1. +static void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { int x; for (x = 0; x < pix; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } -void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, - uint16* dst_uv, int pix) { +static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix) { int x; for (x = 0; x < pix; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; @@ -2137,19 +2137,6 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, free_aligned_buffer_64(row_y); } -void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width); - YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width); - I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); -} - void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { @@ -2163,19 +2150,6 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, free_aligned_buffer_64(row_y); } -void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. 
- align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width); - UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width); - I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); -} - #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(LIBYUV_DISABLE_X86) diff --git a/source/row_mips.cc b/source/row_mips.cc index ae9370c..d713321 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -378,7 +378,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { // MIPS DSPR2 functions #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) + (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { @@ -447,89 +447,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ); } -void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, - uint8* dst_v, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - ".p2align 2 \n" - "1: \n" - "addiu $t4, $t4, -1 \n" - "lwr $t0, 0(%[src_uv]) \n" - "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lwr $t1, 4(%[src_uv]) \n" - "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lwr $t2, 8(%[src_uv]) \n" - "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lwr $t3, 12(%[src_uv]) \n" - "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lwr $t5, 16(%[src_uv]) \n" - "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lwr $t6, 20(%[src_uv]) \n" - "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10 - "lwr $t7, 24(%[src_uv]) \n" - "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12 - "lwr $t8, 28(%[src_uv]) \n" - "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14 - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 - "addiu %[src_uv], %[src_uv], 32 \n" - "swr $t9, 0(%[dst_v]) \n" - "swl $t9, 3(%[dst_v]) \n" - "swr $t0, 0(%[dst_u]) \n" - "swl $t0, 3(%[dst_u]) \n" - "swr $t1, 4(%[dst_v]) \n" - "swl $t1, 7(%[dst_v]) \n" - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" - "swr $t3, 8(%[dst_v]) \n" - "swl $t3, 11(%[dst_v]) \n" - "swr $t5, 8(%[dst_u]) \n" - "swl $t5, 11(%[dst_u]) \n" - "swr $t6, 12(%[dst_v]) \n" - "swl $t6, 15(%[dst_v]) \n" - "swr $t7, 12(%[dst_u]) \n" - "swl $t7, 15(%[dst_u]) \n" - "addiu %[dst_u], %[dst_u], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_v], %[dst_v], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [width] "+r" (width), - [dst_u] "+r" 
(dst_u), - [dst_v] "+r" (dst_v) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6", "t7", "t8", "t9" - ); -} - void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { __asm__ __volatile__ ( ".set push \n" diff --git a/source/row_neon.cc b/source/row_neon.cc index a84e3e4..ac1c5e5 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -16,7 +16,8 @@ extern "C" { #endif // This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ @@ -542,7 +543,6 @@ void YToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( MEMACCESS(3) - MEMACCESS(3) "vld1.8 {d24}, [%3] \n" MEMACCESS(4) "vld1.8 {d25}, [%4] \n" @@ -1274,30 +1274,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ); } -void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. - "vrhadd.u8 q0, q1 \n" // average row 1 and 2 - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(src_uv_stride), // %1 - "+r"(dst_uv), // %2 - "+r"(pix) // %3 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { @@ -3141,7 +3117,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "q0", "q1" // Clobber List ); } -#endif // __ARM_NEON__ +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/source/row_neon64.cc b/source/row_neon64.cc new file mode 100644 index 0000000..fc2deaa --- /dev/null +++ b/source/row_neon64.cc @@ -0,0 +1,3047 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. 
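For reference, a minimal C sketch of how compile-time guards like the one on the next line combine with libyuv's HAS_* convention to pick a row kernel, with the C version as fallback. CopyPlaneRow and the exact placement are illustrative, not code from this patch:

#include <stdint.h>

// C fallback, always compiled.
static void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
  int x;
  for (x = 0; x < count; ++x) {
    dst[x] = src[x];
  }
}

// Each SIMD source file advertises its kernels only under its own guard,
// so the 32-bit and 64-bit NEON files can never both be active.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
#define HAS_COPYROW_NEON  // 32-bit kernel from row_neon.cc.
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_COPYROW_NEON  // 64-bit kernel from row_neon64.cc.
#endif

#if defined(HAS_COPYROW_NEON)
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int count);
#endif

// Illustrative caller: exactly one implementation is selected.
static void CopyPlaneRow(const uint8_t* src, uint8_t* dst, int count) {
#if defined(HAS_COPYROW_NEON)
  CopyRow_NEON(src, dst, count);
#else
  CopyRow_C(src, dst, count);
#endif
}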
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
+ MEMACCESS(2) \
+ "ld1 {v1.s}[1], [%2], #4 \n"
+
+// Read 8 Y, 2 U and 2 V from 411
+#define READYUV411 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.h}[0], [%1], #2 \n" \
+ MEMACCESS(2) \
+ "ld1 {v2.h}[1], [%2], #2 \n" \
+ "zip1 v1.8b, v2.8b, v2.8b \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
+ MEMACCESS(2) \
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ MEMACCESS(0) \
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ MEMACCESS(0) \
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+#define YUV422TORGB_SETUP_REG \
+ "movi v24.8b, #128 \n" \
+ "movi v25.8h, #74, lsl #0 \n" /* YG */\
+ "movi v26.8h, #16, lsl #0 \n" \
+ "movi v27.8h, #127, lsl #0 \n" /* UB */\
+ "movi v28.8h, #102, lsl #0 \n" /* VR */\
+ "mvni v29.8h, #0x18, lsl #0 \n" /* UG -25 */\
+ "mvni v30.8h, #0x33, lsl #0 \n" /* VG -52 */
+
+#define YUV422TORGB(vR, vG, vB) \
+ "eor v1.8b, v1.8b, v24.8b \n" /* Subtract 128 from U&V */ \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "sub v0.8h, v0.8h, v26.8h \n" /* offset y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" \
+ "mul v0.8h, v0.8h, v25.8h \n" /* Y x 74 */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "sxtl v2.8h, v2.8b \n" \
+ "sxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul " #vR ".8h, v2.8h, v28.8h \n" /* R = (V - 128) x VR */ \
+ "mul " #vB ".8h, v1.8h, v27.8h \n" /* B = (U - 128) x UB */ \
+ "mul " #vG ".8h, v1.8h, v29.8h \n" /* G1 = (U - 128) x UG */ \
+ "mul v2.8h, v2.8h, v30.8h \n" /* G2 = (V - 128) x VG */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v0.8h \n" /* R += (Y - 16) YG */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v0.8h \n" /* B += (Y - 16) YG */ \
+ "sqadd " #vG ".8h, " #vG ".8h, v2.8h \n" /* G = G1 + G2 */ \
+ "sqadd " #vG ".8h, " #vG ".8h, v0.8h \n" /* G += (Y - 16) YG */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
"movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + + +#ifdef HAS_I444TOARGBROW_NEON +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV444 + YUV422TORGB(v22, v21, v20) + "subs %4, %4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I444TOARGBROW_NEON + +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v22, v21, v20) + "subs %4, %4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOARGBROW_NEON + +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV411 + YUV422TORGB(v22, v21, v20) + "subs %4, %4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I411TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v21, v22, v23) + "subs %4, %4, #8 \n" + "movi v20.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOBGRAROW_NEON + +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v20, v21, v22) + "subs %4, %4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // 
HAS_I422TOABGRROW_NEON + +#ifdef HAS_I422TORGBAROW_NEON +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v23, v22, v21) + "subs %4, %4, #8 \n" + "movi v20.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORGBAROW_NEON + +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v22, v21, v20) + "subs %4, %4, #8 \n" + MEMACCESS(3) + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v20, v21, v22) + "subs %4, %4, #8 \n" + MEMACCESS(3) + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORAWROW_NEON + +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ + +#ifdef HAS_I422TORGB565ROW_NEON +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v22, v21, v20) + "subs %4, %4, #8 \n" + ARGBTORGB565 + MEMACCESS(3) + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
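// Annotation: ARGBTORGB565 above packs the 8-bit channels with
// shift-right-insert: shll places each channel in the top byte of a
// 16-bit lane, then sri overlays G's top 6 bits below R's top 5, and
// B's top 5 bits below that, leaving RRRRRGGGGGGBBBBB per lane for the
// st1 store above. sri preserves the destination's high bits, so no
// explicit masking is needed.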
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v22, v21, v20)
+ "subs %4, %4, #8 \n"
+ "movi v23.8b, #255 \n"
+ ARGBTOARGB1555
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444 \
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "movi v4.16b, #0x0f \n" // bits to clear with bic.
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v22, v21, v20)
+ "subs %4, %4, #8 \n"
+ "movi v23.8b, #255 \n"
+ ARGBTOARGB4444
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
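// Annotation: ARGBTOARGB4444 above avoids multiplies entirely: ushr keeps
// the top 4 bits of B and R in the low nibble, bic with the 0x0f mask in
// v4 keeps the top 4 bits of G and A in the high nibble, orr merges them
// into GB and AR bytes, and zip1 interleaves the two byte streams so each
// 16-bit pixel holds A(15:12) R(11:8) G(7:4) B(3:0).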
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOARGB4444ROW_NEON + +#ifdef HAS_YTOARGBROW_NEON +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV400 + YUV422TORGB(v22, v21, v20) + "subs %2, %2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_YTOARGBROW_NEON + +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + "movi v23.8b, #255 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %2, %2, #8 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_I400TOARGBROW_NEON + +#ifdef HAS_NV12TOARGBROW_NEON +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV12 + YUV422TORGB(v22, v21, v20) + "subs %3, %3, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(2) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV12TOARGBROW_NEON + +#ifdef HAS_NV21TOARGBROW_NEON +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV21 + YUV422TORGB(v22, v21, v20) + "subs %3, %3, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(2) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV21TOARGBROW_NEON + +#ifdef HAS_NV12TORGB565ROW_NEON +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV12 + YUV422TORGB(v22, v21, v20) + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV21 + YUV422TORGB(v22, v21, v20) + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV21TORGB565ROW_NEON + +#ifdef HAS_YUY2TOARGBROW_NEON +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUY2 + YUV422TORGB(v22, v21, v20) + "subs %2, %2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_YUY2TOARGBROW_NEON + +#ifdef HAS_UYVYTOARGBROW_NEON +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READUYVY + YUV422TORGB(v22, v21, v20) + "subs %2, %2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_UYVYTOARGBROW_NEON + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +#ifdef HAS_SPLITUVROW_NEON +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store U + MEMACCESS(2) + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_SPLITUVROW_NEON + +// Reads 16 U's and V's and writes out 16 pairs of UV. +#ifdef HAS_MERGEUVROW_NEON +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load U + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_MERGEUVROW_NEON + +// Copy multiple of 32. 
vld4.8 allow unaligned and is fastest on a15. +#ifdef HAS_COPYROW_NEON +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) + "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_COPYROW_NEON + +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +#ifdef HAS_SETROW_NEON +void SetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0" + ); +} +#endif // HAS_SETROW_NEON + +// TODO(fbarchard): Make fully assembler +// SetRow32 writes 'count' words using a 32 bit value repeated. +#ifdef HAS_ARGBSETROWS_NEON +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + SetRow_NEON(dst, v32, width << 2); + dst += dst_stride; + } +} +#endif // HAS_ARGBSETROWS_NEON + +#ifdef HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "add %0, %0, %2 \n" + "sub %0, %0, #16 \n" + + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" + MEMACCESS(1) + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + MEMACCESS(1) + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" + ); +} +#endif // HAS_MIRRORROW_NEON + +#ifdef HAS_MIRRORUVROW_NEON +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "add %0, %0, %3, lsl #1 \n" + "sub %0, %0, #16 \n" + + "1: \n" + MEMACCESS(0) + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %3, %3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1" + ); +} +#endif // HAS_MIRRORUVROW_NEON + +#ifdef HAS_ARGBMIRRORROW_NEON +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "add %0, %0, %2, lsl #2 \n" + "sub %0, %0, #16 \n" + + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #4 \n" // 4 pixels per loop. 
+ "rev64 v0.4s, v0.4s \n" + MEMACCESS(1) + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + MEMACCESS(1) + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" + ); +} +#endif // HAS_ARGBMIRRORROW_NEON + +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_RGB24TOARGBROW_NEON + +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %2, %2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + MEMACCESS(1) + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_RAWTOARGBROW_NEON + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +#ifdef HAS_RGB565TOARGBROW_NEON +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} +#endif // HAS_RGB565TOARGBROW_NEON + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 
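// Annotation: these unpack macros widen the 5- and 6-bit fields by
// shifting each field to the top of a byte and OR-ing its high bits back
// into the low bits -- in scalar terms b8 = (b5 << 3) | (b5 >> 2) and
// g8 = (g6 << 2) | (g6 >> 4) -- which maps 0 to 0 and the field maximum
// to 255 exactly.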
+#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ \ + +#ifdef HAS_ARGB1555TOARGBROW_NEON +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_ARGB1555TOARGBROW_NEON + +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" + +#ifdef HAS_ARGB4444TOARGBROW_NEON +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_ARGB4444TOARGBROW_NEON + +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_ARGBTORGB24ROW_NEON + +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %2, %2, #8 \n" // 8 processed per loop. 
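// Annotation: the orr instructions below use AArch64's vector move idiom
// (orr vd, vn, vn is the alias for mov vd, vn) to reorder the
// deinterleaved B,G,R registers into R,G,B before the st3 store, matching
// libyuv's RAW byte order.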
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + MEMACCESS(1) + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOUV422ROW_NEON +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_YUY2TOUV422ROW_NEON + +#ifdef HAS_UYVYTOUV422ROW_NEON +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_UYVYTOUV422ROW_NEON + +#ifdef HAS_YUY2TOUVROW_NEON +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
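// Annotation: the loop above reads 16 pixels from src_yuy2 and from
// src_yuy2 + stride (src_yuy2b), then averages vertically adjacent chroma
// with urhadd, an unsigned rounding halving add ((a + b + 1) >> 1), so two
// rows of 4:2:2 input become one row of 4:2:0 chroma.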
+ "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_YUY2TOUVROW_NEON + +#ifdef HAS_UYVYTOUVROW_NEON +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_UYVYTOUVROW_NEON + +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +#ifdef HAS_ARGBTOBAYERROW_NEON +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "mov v2.s[0], %w3 \n" // selector + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], 32 \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels + "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels + "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels + MEMACCESS(1) + "st1 {v4.8b}, [%1], #8 \n" // store 8. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "r"(selector) // %3 + : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERROW_NEON + +// Select G channels from ARGB. e.g. GGGGGGGG +#ifdef HAS_ARGBTOBAYERGGROW_NEON +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERGGROW_NEON + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +#ifdef HAS_ARGBSHUFFLEROW_NEON +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 4. 
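// Annotation: tbl above is a byte-wise table lookup: each of the 16 index
// bytes in v2 (loaded from the 'shuffler' argument) selects one byte of
// v0, so any ARGB<->BGRA/ABGR/RGBA reordering costs one instruction per
// 4 pixels; an index byte of 16 or more writes zero.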
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} +#endif // HAS_ARGBSHUFFLEROW_NEON + +#ifdef HAS_I422TOYUY2ROW_NEON +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOYUY2ROW_NEON + +#ifdef HAS_I422TOUYVYROW_NEON +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOUYVYROW_NEON + +#ifdef HAS_ARGBTORGB565ROW_NEON +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTOARGB1555ROW_NEON +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTOARGB1555ROW_NEON + +#ifdef HAS_ARGBTOARGB4444ROW_NEON +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "movi v4.16b, #0x0f \n" // bits to clear with vbic. + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTOARGB4444ROW_NEON + +#ifdef HAS_ARGBTOYROW_NEON +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBTOYROW_NEON + +#ifdef HAS_ARGBTOYJROW_NEON +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBTOYJROW_NEON + +// 8x1 pixels. +#ifdef HAS_ARGBTOUV444ROW_NEON +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v24", "v25", "v26", "v27", "v28", "v29" + ); +} +#endif // HAS_ARGBTOUV444ROW_NEON + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
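The 4:2:2 reduction below averages each horizontal pair of pixels before applying the U/V matrix: uaddlp leaves the pair sum (twice the average) in each 16-bit lane, so the RGBTOUV_SETUP_REG constants are half of the usual 112/74/38 and 18/94/112 weights. A scalar sketch of what ARGBToUV422Row_NEON computes (libyuv ARGB is B,G,R,A in memory; the _Sketch helper is hypothetical, not part of this patch):

#include <stdint.h>

// One U,V output per two ARGB pixels, matching the vector math above:
// channel pair-sums times halved coefficients, plus the 0x8080 bias,
// narrowed by an unsigned shift right of 8.
static void ARGBToUV422Row_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_u, uint8_t* dst_v,
                                  int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    int b = src_argb[0] + src_argb[4];  // pair sum == 2x average
    int g = src_argb[1] + src_argb[5];
    int r = src_argb[2] + src_argb[6];
    *dst_u++ = (uint8_t)((56 * b - 37 * g - 19 * r + 0x8080) >> 8);
    *dst_v++ = (uint8_t)((56 * r - 47 * g - 9 * b + 0x8080) >> 8);
    src_argb += 8;
  }
}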
+#ifdef HAS_ARGBTOUV422ROW_NEON +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV422ROW_NEON + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +#ifdef HAS_ARGBTOUV411ROW_NEON +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. + "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. + "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. + "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV411ROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
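The two-row variants that follow extend the same scheme to a full 2x2 block: uadalp accumulates the second row into the pair sums, and urshr #1 rounds the result back down to twice the block average, which the halved coefficients of the RGBTOUV macro (defined next) then scale. Per output sample, in scalar form (a sketch; the helper name is hypothetical):

#include <stdint.h>

// One U,V from a 2x2 block of ARGB: two pixels from each of two rows.
static void ARGBToUVBlock_Sketch(const uint8_t* row0, const uint8_t* row1,
                                 uint8_t* dst_u, uint8_t* dst_v) {
  // uaddlp + uadalp: sum the block per channel.
  int b = row0[0] + row0[4] + row1[0] + row1[4];
  int g = row0[1] + row0[5] + row1[1] + row1[5];
  int r = row0[2] + row0[6] + row1[2] + row1[6];
  // urshr #1: rounded halving, leaving 2x the block average.
  b = (b + 1) >> 1;
  g = (g + 1) >> 1;
  r = (r + 1) >> 1;
  // Halved coefficients times 2x average == the full 112/74/38 weights.
  *dst_u = (uint8_t)((56 * b - 37 * g - 19 * r + 0x8080) >> 8);
  *dst_v = (uint8_t)((56 * r - 47 * g - 9 * b + 0x8080) >> 8);
}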
+#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +// TODO(fbarchard): consider ptrdiff_t for all strides. + +#ifdef HAS_ARGBTOUVROW_NEON +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUVROW_NEON + +// TODO(fbarchard): Subsample match C code. +#ifdef HAS_ARGBTOUVJROW_NEON +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
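// Annotation: ARGBToUVJRow differs from ARGBToUVRow only in its constants:
// the halved weights loaded above are 63/42/21 and 10/53, i.e. the
// full-range (JPEG) 0.500 / -0.33126 / -0.16874 / -0.08131 / -0.41869
// matrix, instead of the video-range set from RGBTOUV_SETUP_REG.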
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUVJROW_NEON + +#ifdef HAS_BGRATOUVROW_NEON +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_bgra_1 = src_bgra + src_stride_bgra; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_bgra_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_BGRATOUVROW_NEON + +#ifdef HAS_ABGRTOUVROW_NEON +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v2.8h, v1.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ABGRTOUVROW_NEON + +#ifdef HAS_RGBATOUVROW_NEON +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgba_1 = src_rgba + src_stride_rgba; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_rgba_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RGBATOUVROW_NEON + +#ifdef HAS_RGB24TOUVROW_NEON +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_rgb24_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RGB24TOUVROW_NEON + +#ifdef HAS_RAWTOUVROW_NEON +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_raw_1 = src_raw + src_stride_raw; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_raw_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RAWTOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#ifdef HAS_RGB565TOUVROW_NEON +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile ( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27" + ); +} +#endif // HAS_RGB565TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" + ); +} +#endif // HAS_ARGB1555TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. 
+ "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" + + ); +} +#endif // HAS_ARGB4444TOUVROW_NEON + +#ifdef HAS_RGB565TOYROW_NEON +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", + "v24", "v25", "v26", "v27" + ); +} +#endif // HAS_RGB565TOYROW_NEON + +#ifdef HAS_ARGB1555TOYROW_NEON +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGB1555TOYROW_NEON + +#ifdef HAS_ARGB4444TOYROW_NEON +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" + ); +} +#endif // HAS_ARGB4444TOYROW_NEON + +#ifdef HAS_BGRATOYROW_NEON +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_BGRATOYROW_NEON + +#ifdef HAS_ABGRTOYROW_NEON +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_ABGRTOYROW_NEON + +#ifdef HAS_RGBATOYROW_NEON +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGBATOYROW_NEON + +#ifdef HAS_RGB24TOYROW_NEON +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGB24TOYROW_NEON + +#ifdef HAS_RAWTOYROW_NEON +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RAWTOYROW_NEON + +// Bilinear filter 16x2 -> 16x1 +#ifdef HAS_INTERPOLATEROW_NEON +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + asm volatile ( + "cmp %4, #0 \n" + "b.eq 100f \n" + "cmp %4, #64 \n" + "b.eq 75f \n" + "cmp %4, #128 \n" + "b.eq 50f \n" + "cmp %4, #192 \n" + "b.eq 25f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5" + ); +} +#endif // HAS_INTERPOLATEROW_NEON + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +#ifdef HAS_ARGBBLENDROW_NEON +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, %3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels + "subs %3, %3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.ge 8b \n" + + "89: \n" + "adds %3, %3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18" + ); +} +#endif // HAS_ARGBBLENDROW_NEON + +// Attenuate 8 pixels at a time. +#ifdef HAS_ARGBATTENUATEROW_NEON +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBATTENUATEROW_NEON + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +#ifdef HAS_ARGBQUANTIZEROW_NEON +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + MEMACCESS(0) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBQUANTIZEROW_NEON + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +#ifdef HAS_ARGBSHADEROW_NEON +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + MEMACCESS(1) + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSHADEROW_NEON + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. 
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v26.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+ );
+}
+#endif // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // RB coefficient
+ "movi v29.8b, #98 \n" // RG coefficient
+ "movi v30.8b, #50 \n" // RR coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_NEON
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "ld1 {v2.16b}, [%3] \n" // load 4x4 matrix (16 bytes).
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 ..
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_NEON + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBMULTIPLYROW_NEON +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. 
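+// Equivalent scalar sketch (reference only; cf. the C fallback
+// ARGBAddRow_C in row_common.cc):
+//   for (int i = 0; i < width * 4; ++i) {
+//     int sum = src_argb0[i] + src_argb1[i];
+//     dst_argb[i] = sum > 255 ? 255 : sum;  // uqadd = unsigned saturating add
+//   }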
+#ifdef HAS_ARGBADDROW_NEON +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBADDROW_NEON + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +#ifdef HAS_ARGBSUBTRACTROW_NEON +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSUBTRACTROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +#ifdef HAS_SOBELROW_NEON +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +#ifdef HAS_SOBELTOPLANEROW_NEON +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1" + ); +} +#endif // HAS_SOBELTOPLANEROW_NEON + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +#ifdef HAS_SOBELXYROW_NEON +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELXYROW_NEON + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +#ifdef HAS_SOBELXROW_NEON +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%5 \n" // top + MEMACCESS(0) + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(2) + "ld1 {v2.8b}, [%2],%5 \n" // bottom + MEMACCESS(2) + "ld1 {v3.8b}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(3) + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELXROW_NEON + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +#ifdef HAS_SOBELYROW_NEON +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%4 \n" // left + MEMACCESS(1) + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%5 \n" // right + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELYROW_NEON +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_posix.cc b/source/row_posix.cc index 106fda5..9e514dd 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -221,7 +221,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x8,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -252,37 +252,6 @@ void 
I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, - int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" @@ -318,17 +287,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 @@ -359,17 +328,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_raw), // %0 @@ -418,8 +387,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -475,8 +444,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -519,8 +488,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm2,%%xmm0 \n" 
"punpckhbw %%xmm2,%%xmm1 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) - MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -631,7 +600,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" @@ -672,7 +641,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xf,%%xmm7 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -712,7 +681,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" @@ -744,43 +713,6 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -820,44 +752,6 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa 
%4,%%xmm5 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -912,15 +806,19 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -962,7 +860,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif ); } +#endif // HAS_ARGBTOUVROW_SSSE3 +#ifdef HAS_ARGBTOUVJROW_SSSE3 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { @@ -979,156 +879,19 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa 
%1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToUJ), // %0 - "m"(kARGBToVJ), // %1 - "m"(kAddUVJ128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1161,7 +924,7 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) + : "r"((intptr_t)(src_stride_argb)) // %4 : "memory", "cc" #if defined(__native_client__) && defined(__x86_64__) , "r14" @@ -1171,7 +934,9 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* 
src_argb0, int src_stride_argb, #endif ); } +#endif // HAS_ARGBTOUVJROW_SSSE3 +#ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -1187,71 +952,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6" -#endif - ); -} - -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, - uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1301,7 +1001,9 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, #endif ); } +#endif // HAS_ARGBTOUV444ROW_SSSE3 +#ifdef HAS_ARGBTOUV422ROW_SSSE3 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -1317,67 +1019,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : 
"+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1423,6 +1064,7 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, #endif ); } +#endif // HAS_ARGBTOUV422ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( @@ -1430,43 +1072,6 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1513,85 +1118,19 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_bgra0), 
// %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kBGRAToU), // %0 - "m"(kBGRAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1640,43 +1179,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1714,43 +1216,6 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa 
%%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1797,85 +1262,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kABGRToU), // %0 - "m"(kABGRToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " 
MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1933,85 +1332,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)) - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kRGBAToU), // %0 - "m"(kRGBAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -2043,7 +1376,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)) // %4 + : "r"((intptr_t)(src_stride_rgba)) : "memory", "cc" #if defined(__native_client__) && defined(__x86_64__) , "r14" @@ -2053,7 +1386,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, #endif ); } 
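(For reference: throughout this change the aligned movdqa loads/stores and their _Unaligned_ twin functions are dropped in favor of a single movdqu body; on SSSE3-class and newer x86 cores movdqu on an already-aligned address costs the same as movdqa, so one routine can serve both cases. A scalar sketch of what these *ToUVRow kernels compute follows; the function name is illustrative and the coefficients/rounding are the common BT.601 scheme, not copied from this diff -- the real kernels average rows with pavgb and take their constants from kARGBToU/kARGBToV.)

    #include <stdint.h>

    /* Sketch: 2x2-subsampled U/V from two rows of ARGB (width assumed even).
       Each 2x2 block is averaged per channel, then a fixed-point color
       matrix with a +128 bias produces U and V. */
    static void ARGBToUVRow_Sketch(const uint8_t* argb, int stride,
                                   uint8_t* dst_u, uint8_t* dst_v, int width) {
      int x;
      for (x = 0; x < width; x += 2) {
        const uint8_t* p = argb + x * 4;
        int b = (p[0] + p[4] + p[stride + 0] + p[stride + 4] + 2) >> 2;
        int g = (p[1] + p[5] + p[stride + 1] + p[stride + 5] + 2) >> 2;
        int r = (p[2] + p[6] + p[stride + 2] + p[stride + 6] + 2) >> 2;
        /* 0x8080 >> 8 supplies the +128 chroma bias with rounding. */
        dst_u[x >> 1] = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
        dst_v[x >> 1] = (uint8_t)((-18 * b - 94 * g + 112 * r + 0x8080) >> 8);
      }
    }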
-#endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3 #define UB 127 /* min(63,(int8)(2.018 * 64)) */ @@ -2199,8 +1531,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2354,8 +1686,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2393,8 +1725,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2430,8 +1762,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2464,191 +1796,6 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" - // Does not use r14. 
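  // (Annotation, not part of the diff: the "r14" clobber most functions here
  // declare under __native_client__ && __x86_64__ exists because the
  // MEMOPREG/BUNDLEALIGN macros use r14 as an address scratch register inside
  // the NaCl sandbox; the NV12/NV21 rows address their operands directly,
  // hence "Does not use r14.")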
-#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV411 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READNV12 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw 
%%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" - // Does not use r14. -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READNV12 - YVUTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" @@ -2686,8 +1833,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2725,8 +1872,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2765,125 +1912,6 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" - "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_bgra, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" - "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" - "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" - "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // 
%[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_abgr, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" - "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" @@ -2939,8 +1967,8 @@ void YToARGBRow_SSE2(const uint8* y_buf, "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "por %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" @@ -2970,10 +1998,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { "lea " MEMLEA(-0x10,0) ",%0 \n" LABELALIGN "1: \n" - MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 + MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 @@ -3039,7 +2067,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n" "pshufb %%xmm1,%%xmm0 \n" "sub $8,%3 \n" @@ -3077,11 +2105,11 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 @@ -3104,45 +2132,6 @@ void 
SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3182,38 +2171,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" - "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2" -#endif - ); -} - -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, - uint8* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" @@ -3246,11 +2203,11 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x20,%2 \n" "jg 1b \n" @@ -3266,6 +2223,31 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_SSE2 +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_COPYROW_AVX + #ifdef HAS_COPYROW_X86 void CopyRow_X86(const uint8* src, uint8* dst, int width) { size_t width_tmp = 
(size_t)(width); @@ -3282,7 +2264,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) { #endif // HAS_COPYROW_X86 #ifdef HAS_COPYROW_ERMS -// Unaligned Multiple of 1. +// Multiple of 1. void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { size_t width_tmp = (size_t)(width); asm volatile ( @@ -3306,19 +2288,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "psrld $0x8,%%xmm1 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa " MEMACCESS(1) ",%%xmm4 \n" - "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" "pand %%xmm0,%%xmm2 \n" "pand %%xmm0,%%xmm3 \n" "pand %%xmm1,%%xmm4 \n" "pand %%xmm1,%%xmm5 \n" "por %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -3380,16 +2362,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "punpcklbw %%xmm2,%%xmm2 \n" "punpckhwd %%xmm2,%%xmm3 \n" "punpcklwd %%xmm2,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm4 \n" - "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" "pand %%xmm0,%%xmm2 \n" "pand %%xmm0,%%xmm3 \n" "pand %%xmm1,%%xmm4 \n" "pand %%xmm1,%%xmm5 \n" "por %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -3473,128 +2455,15 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { "psrlw $0x8,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc" -#if 
defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -3607,9 +2476,8 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, ); } -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, - int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3653,8 +2521,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, ); } -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3697,117 +2565,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand 
%%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - asm volatile ( - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3829,8 +2586,8 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, ); } -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3874,8 +2631,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, ); } -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3989,7 +2746,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" @@ -4107,16 +2864,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. 
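  // A scalar view of the alpha "over" blend the 4-pixel loop below
  // vectorizes (sketch only -- the asm's pshufb/paddw bias differs in
  // rounding detail):
  //   int a = src[3];                                  // source alpha
  //   dst[c] = src[c] + ((dst[c] * (255 - a)) >> 8);   // c = B, G, R
  //   dst[3] = 255;                                    // opaque result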
LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" @@ -4126,7 +2883,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" "jmp 49f \n" @@ -4153,7 +2910,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" @@ -4212,17 +2969,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { // 4 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm1,%%xmm1 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "pand %%xmm4,%%xmm2 \n" @@ -4231,7 +2988,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4364,16 +3121,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" @@ -4386,8 +3143,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4430,30 +3187,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4463,8 +3220,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "sub $0x8,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 @@ -4495,12 +3252,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddsw %%xmm7,%%xmm0 \n" @@ -4510,13 +3267,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm6,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm7 \n" "phaddsw %%xmm7,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm7 \n" "phaddsw %%xmm7,%%xmm6 \n" @@ -4529,8 +3286,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "punpcklwd %%xmm1,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm6 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" @@ -4568,14 +3325,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n" "pmullw %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm7 \n" "pmullw %%xmm3,%%xmm1 \n" "pand %%xmm6,%%xmm7 \n" "paddw %%xmm4,%%xmm0 \n" @@ -4583,7 +3340,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "sub $0x4,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" "lea " MEMLEA(0x10,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 @@ -4612,7 +3369,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, // 4 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -4623,7 +3380,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4876,8 +3633,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -4894,10 +3651,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm1," MEMACCESS(2) " \n" - "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm1," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -4928,12 +3685,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -4967,8 +3724,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "paddusb %%xmm1,%%xmm2 \n" @@ -4985,10 +3742,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" "sub $0x10,%3 \n" - "movdqa %%xmm6," MEMACCESS(2) " \n" - "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm6," MEMACCESS(2) " \n" + "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -5035,22 +3792,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "punpcklwd %%xmm1,%%xmm4 \n" "punpckhwd %%xmm1,%%xmm5 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(2) ",%%xmm2 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" + "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n" "paddd %%xmm4,%%xmm0 \n" - "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n" "paddd %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" "lea " MEMLEA(0x40,2) ",%2 \n" "paddd %%xmm0,%%xmm5 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "sub $0x4,%3 \n" "jge 40b \n" @@ -5115,10 +3872,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel small loop \n" LABELALIGN "4: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" BUNDLEALIGN MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 @@ -5149,10 +3906,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel loop \n" LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" BUNDLEALIGN MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 @@ -5196,7 +3953,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop \n" LABELALIGN "10: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 "lea " MEMLEA(0x10,0) ",%0 
\n" "psubd " MEMACCESS(1) ",%%xmm0 \n" @@ -5352,241 +4109,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, // General purpose row blend. LABELALIGN "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm2) - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm0) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif - ); -} -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "psubw %%xmm0,%%xmm2 \n" - "psubw %%xmm1,%%xmm3 \n" - "paddw %%xmm2,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "pmulhw %%xmm5,%%xmm2 \n" - "pmulhw %%xmm5,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. 
- LABELALIGN - "25: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} -#endif // HAS_INTERPOLATEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_SSSE3 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - - // General purpose row blend. 
- LABELALIGN - "1: \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqu %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" "pmaddubsw %%xmm5,%%xmm0 \n" @@ -5666,13 +4191,13 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #endif ); } -#endif // HAS_INTERPOLATEROW_SSSE3 +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -5699,8 +4224,8 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "1: \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" "punpckhbw %%xmm4,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm0 \n" @@ -5788,31 +4313,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 -#ifdef HAS_HALFROW_SSE2 -void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 - "sub $0x10,%2 \n" - MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) - "lea " MEMLEA(0x10,0) ",%0 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(pix) // %2 - : "r"((intptr_t)(src_uv_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0" -#endif - ); -} -#endif // HAS_HALFROW_SSE2 - #ifdef HAS_ARGBTOBAYERROW_SSSE3 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { @@ -5822,8 +4322,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, "pshufd $0x0,%%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" @@ -5852,8 +4352,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, "psrld $0x18,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x8,%%xmm0 \n" "psrld $0x8,%%xmm1 \n" @@ -5882,34 +4382,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" 
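(Editorial note: the pattern of this whole change is visible above — the separate _Unaligned_ copies are deleted and the surviving kernel switches movdqa to movdqu, so one body serves any pointer alignment. In intrinsics form the kept ARGBShuffleRow body corresponds roughly to the sketch below; names are illustrative, _mm_loadu_si128/_mm_storeu_si128 are the movdqu equivalents, and pix is assumed a multiple of 4:)

#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8 (pshufb)
#include <stdint.h>

// Reorder the bytes of each 16-byte group per 'shuffler' (e.g. ARGB->BGRA).
// Unaligned load/store, so callers need no 16-byte alignment guarantee.
void ArgbShuffleSketch(const uint8_t* src_argb, uint8_t* dst_argb,
                       const uint8_t* shuffler, int pix) {
  const __m128i mask = _mm_loadu_si128((const __m128i*)shuffler);
  for (int i = 0; i < pix; i += 4) {  // 4 ARGB pixels per 16-byte vector
    __m128i p = _mm_loadu_si128((const __m128i*)(src_argb + i * 4));
    p = _mm_shuffle_epi8(p, mask);    // pshufb
    _mm_storeu_si128((__m128i*)(dst_argb + i * 4), p);
  }
}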
-#endif - ); -} - -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { - asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" + "movdqu " MEMACCESS(3) ",%%xmm5 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" diff --git a/source/row_win.cc b/source/row_win.cc index 8eb8889..d0a1059 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -10,7 +10,7 @@ #include "libyuv/row.h" -#if defined (_M_X64) +#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) #include <emmintrin.h> #include <tmmintrin.h> // For _mm_maddubs_epi16 #endif @@ -21,7 +21,8 @@ extern "C" { #endif // This module is for Visual C. -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + (defined(_M_IX86) || defined(_M_X64)) #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ @@ -78,61 +79,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* dst_argb, int width) { - - __m128i xmm0, xmm1, xmm2, xmm3; - const __m128i xmm5 = _mm_set1_epi8(-1); - const __m128i xmm4 = _mm_setzero_si128(); - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; - - while (width > 0) { - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); - xmm1 = _mm_load_si128(&xmm0); - xmm2 = _mm_load_si128(&xmm0); - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); - xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); - xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); - xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); - xmm3 = _mm_loadl_epi64((__m128i*)y_buf); - xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); - xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); - xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); - xmm0 = _mm_adds_epi16(xmm0, xmm3); - xmm1 = _mm_adds_epi16(xmm1, xmm3); - xmm2 = _mm_adds_epi16(xmm2, xmm3); - xmm0 = _mm_srai_epi16(xmm0, 6); - xmm1 = _mm_srai_epi16(xmm1, 6); - xmm2 = _mm_srai_epi16(xmm2, 6); - xmm0 = _mm_packus_epi16(xmm0, xmm0); - xmm1 = _mm_packus_epi16(xmm1, xmm1); - xmm2 = _mm_packus_epi16(xmm2, xmm2); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - xmm1 = _mm_load_si128(&xmm0); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); - - _mm_store_si128((__m128i *)dst_argb, xmm0); - _mm_store_si128((__m128i *)(dst_argb + 16), xmm1); - - y_buf += 8; - u_buf += 4; - dst_argb += 32; - width -= 8; - } -} - -// Unaligned destination version. 
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3; const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm4 = _mm_setzero_si128(); @@ -143,8 +89,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); - xmm1 = _mm_load_si128(&xmm0); - xmm2 = _mm_load_si128(&xmm0); + xmm1 = _mm_loadu_si128(&xmm0); + xmm2 = _mm_loadu_si128(&xmm0); xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); @@ -166,7 +112,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, xmm2 = _mm_packus_epi16(xmm2, xmm2); xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - xmm1 = _mm_load_si128(&xmm0); + xmm1 = _mm_loadu_si128(&xmm0); xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); @@ -179,6 +125,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, width -= 8; } } + // 32 bit #else // defined(_M_X64) @@ -327,35 +274,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { punpckhwd xmm1, xmm1 por xmm0, xmm5 por xmm1, xmm5 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, - int pix) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm5 - por xmm1, xmm5 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -387,17 +305,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] jg convertloop ret @@ -427,17 +345,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] jg convertloop ret @@ -492,8 +410,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub 
ecx, 8 jg convertloop @@ -546,8 +464,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -586,8 +504,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -689,7 +607,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G pslld xmm0, 8 // R @@ -729,7 +647,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G movdqa xmm3, xmm0 // R @@ -767,7 +685,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 pand xmm0, xmm3 // low nibble pand xmm1, xmm4 // high nibble @@ -796,10 +714,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -812,7 +730,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { packuswb xmm0, xmm2 paddb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -831,10 +749,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -848,7 +766,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -938,75 +856,6 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { #endif // HAS_ARGBTOYJROW_AVX2 __declspec(naked) __declspec(align(16)) -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 - movdqa xmm4, kARGBToY - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - 
pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kARGBToYJ - movdqa xmm5, kAddYJ64 - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - paddw xmm0, xmm5 - paddw xmm2, xmm5 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1017,40 +866,6 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 - movdqa xmm4, kBGRAToY - - align 4 - convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -1085,40 +900,6 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 - movdqa xmm4, kABGRToY - - align 4 - convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -1153,40 +934,6 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void RGBAToYRow_Unaligned_SSSE3(const 
uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 - movdqa xmm4, kRGBAToY - - align 4 - convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -1229,14 +976,19 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -1295,14 +1047,19 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -1411,147 +1168,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVROW_AVX2 __declspec(naked) __declspec(align(16)) -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg 
convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kARGBToUJ - movdqa xmm6, kARGBToVJ - movdqa xmm5, kAddUVJ128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - paddw xmm0, xmm5 // +.5 rounding -> unsigned - paddw xmm1, xmm5 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1568,64 +1184,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, align 4 convertloop: /* convert to U and V */ - movdqa xmm0, [eax] // U - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 - - movdqa xmm0, [eax] // V - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm6 - pmaddubsw xmm1, xmm6 - pmaddubsw xmm2, xmm6 - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - lea eax, [eax + 64] - movdqa [edx + edi], xmm0 - lea edx, [edx + 16] - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* convert to U and V */ movdqu xmm0, [eax] // U movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -1684,65 +1242,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa 
xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -1804,84 +1303,19 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kBGRAToU - movdqa xmm6, kBGRAToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi] + movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 - movdqu xmm4, [eax + esi + 16] + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] pavgb xmm1, xmm4 - movdqu xmm4, [eax + esi + 32] + movdqu xmm2, [eax + 32] + movdqu 
xmm4, [eax + esi + 32] pavgb xmm2, xmm4 - movdqu xmm4, [eax + esi + 48] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -1940,84 +1374,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kABGRToU - movdqa xmm6, kABGRToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi] + movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 - movdqu xmm4, [eax + esi + 16] + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] pavgb xmm1, xmm4 - movdqu xmm4, [eax + esi + 32] + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] pavgb xmm2, xmm4 - movdqu xmm4, [eax + esi + 48] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -2076,84 +1445,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned 
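(Editorial note: every *ToUVRow kernel in this file follows the three steps the comments name — box-average 16x2 pixels down to 8x1 with pavgb, dot each averaged pixel against the U and V coefficient vectors with pmaddubsw/psraw 8, then bias the signed result to unsigned with paddb 0x80. A scalar sketch of the per-pair computation; the function name is illustrative and the coefficients are the BT.601 values the kARGBToU/kARGBToV constants encode, modulo pavgb's rounded averaging:)

#include <stdint.h>

// 2x2 box-average two rows of ARGB (B,G,R,A byte order), then fixed-point
// dot products: U = (112B - 74G - 38R)/256 + 128, V = (112R - 94G - 18B)/256 + 128.
void ArgbToUvSketch(const uint8_t* src_argb, int stride,
                    uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p0 = src_argb + x * 4;  // two pixels, top row
    const uint8_t* p1 = p0 + stride;       // two pixels, next row
    int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
    int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    dst_u[x / 2] = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[x / 2] = (uint8_t)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
  }
}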
- - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kRGBAToU - movdqa xmm6, kRGBAToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi] + movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 - movdqu xmm4, [eax + esi + 16] + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] pavgb xmm1, xmm4 - movdqu xmm4, [eax + esi + 32] + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] pavgb xmm2, xmm4 - movdqu xmm4, [eax + esi + 48] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -2192,6 +1496,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, ret } } + #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_I422TOARGBROW_AVX2 @@ -2295,6 +1600,80 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, ret } } + +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpxor ymm4, ymm4, ymm4 + + align 4 + convertloop: + vmovq xmm0, qword ptr [esi] // U + vmovq xmm1, qword ptr [esi + edi] // V + lea esi, [esi + 8] + vpunpcklbw ymm0, ymm0, ymm1 // UV + vpermq ymm0, ymm0, 0xd8 + vpunpcklwd ymm0, ymm0, ymm0 // UVUV + vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV + vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV + vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV + vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed + vpsubw ymm1, ymm1, kUVBiasG_AVX + vpsubw ymm0, ymm0, kUVBiasR_AVX + + // Step 2: Find Y contribution to 16 R,G,B values + vmovdqu xmm3, [eax] // NOLINT + lea eax, [eax + 16] + vpermq ymm3, ymm3, 0xd8 + vpunpcklbw ymm3, ymm3, ymm4 + vpsubsw ymm3, ymm3, kYSub16_AVX + vpmullw ymm3, ymm3, kYToRgb_AVX + vpaddsw ymm2, ymm2, ymm3 // B += Y + vpaddsw ymm1, ymm1, ymm3 // G += Y + vpaddsw ymm0, ymm0, ymm3 // R += Y + vpsraw ymm2, ymm2, 6 + vpsraw ymm1, ymm1, 6 + vpsraw ymm0, ymm0, 6 +// TODO(fbarchard): Switch register order to match SSSE3. 
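(Editorial note: the YUVTORGB core this new AVX2 path uses — and the SSSE3 variants below — is the same fixed-point arithmetic throughout the file: scale (Y - 16) by YG = 74 (1.164 * 64, per the #define earlier in this diff), add the pmaddubsw U/V contributions, arithmetic-shift right 6, saturate to 8 bits. A per-pixel scalar sketch; names are illustrative, and the UV weights shown derive from the usual BT.601 factors times 64 — the SIMD constants clamp the blue weight to 127 to fit int8:)

#include <stdint.h>

static inline uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel of the fixed-point YUV->BGRA math (6 fractional bits, YG = 74).
void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                    uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 74;                         // 1.164 * 64
  *b = Clamp255((y1 + 127 * (u - 128)) >> 6);     // ~2.018 * 64, int8-clamped
  *g = Clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);
  *r = Clamp255((y1 + 102 * (v - 128)) >> 6);     // 1.596 * 64
}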
+ vpackuswb ymm2, ymm2, ymm2 // B + vpackuswb ymm1, ymm1, ymm1 // G + vpackuswb ymm0, ymm0, ymm0 // R + + // Step 3: Weave into BGRA + vpunpcklbw ymm1, ymm1, ymm2 // GB + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm0, ymm5, ymm0 // AR + vpermq ymm0, ymm0, 0xd8 + vpunpcklwd ymm2, ymm0, ymm1 // ARGB first 8 pixels + vpunpckhwd ymm0, ymm0, ymm1 // ARGB next 8 pixels + vmovdqu [edx], ymm2 + vmovdqu [edx + 32], ymm0 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + + pop edi + pop esi + ret + } +} #endif // HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBROW_SSSE3 @@ -2424,8 +1803,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2530,7 +1909,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest unaligned. +// 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) __declspec(align(16)) void I422ToRGB565Row_SSSE3(const uint8* y_buf, @@ -2634,8 +2013,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2679,8 +2058,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2719,8 +2098,8 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2757,214 +2136,6 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} - -// 8 pixels, unaligned. -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) -void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV444 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -// 8 pixels, unaligned. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) __declspec(align(16)) -void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -// 8 pixels, unaligned. -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -// Similar to I420 but duplicate UV once more. -__declspec(naked) __declspec(align(16)) -void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ecx, [esp + 12 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV411 // modifies EBX - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - pop ebx - ret - } -} - -// 8 pixels, dest aligned 16. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) -void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // UV - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READNV12 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} - -// 8 pixels, dest aligned 16. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) __declspec(align(16)) -void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // VU - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READNV12 - YVUTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -3005,47 +2176,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, movdqa xmm0, xmm5 punpcklwd xmm5, xmm1 // BGRA first 4 pixels punpckhwd xmm0, xmm1 // BGRA next 4 pixels - movdqa [edx], xmm5 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_bgra, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // bgra - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into BGRA - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - punpcklbw xmm1, xmm0 // GB - punpcklbw xmm5, xmm2 // AR - movdqa xmm0, xmm5 - punpcklwd xmm5, xmm1 // BGRA first 4 pixels - punpckhwd xmm0, xmm1 // BGRA next 4 pixels movdqu [edx], xmm5 movdqu [edx + 16], xmm0 lea edx, [edx + 32] @@ -3087,47 +2217,6 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, movdqa xmm1, xmm2 punpcklwd xmm2, xmm0 // RGBA first 4 pixels punpckhwd xmm1, xmm0 // RGBA next 4 pixels - movdqa [edx], xmm2 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_abgr, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // abgr - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm2, xmm1 // RG - punpcklbw xmm0, xmm5 // BA - movdqa xmm1, xmm2 - punpcklwd xmm2, xmm0 // RGBA first 4 pixels - punpckhwd xmm1, xmm0 // RGBA next 4 pixels movdqu [edx], xmm2 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -3169,47 +2258,6 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, movdqa xmm0, xmm5 punpcklwd xmm5, xmm1 // RGBA first 4 pixels punpckhwd xmm0, xmm1 // RGBA next 4 pixels - movdqa [edx], xmm5 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // rgba - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pxor xmm4, xmm4 - 
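(Editorial note: the "Step 3: Weave" idiom repeated through all of these converters interleaves 8 B, 8 G, 8 R bytes and a constant 0xFF alpha into 8 packed 4-byte pixels using three rounds of punpck. The same step in SSE2 intrinsics, as a self-contained sketch with an illustrative name:)

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Weave the low 8 bytes of b8/g8/r8 plus 0xFF alpha into 8 BGRA pixels,
// mirroring the punpcklbw / punpcklwd / punpckhwd sequence in the asm.
void WeaveBgraSketch(__m128i b8, __m128i g8, __m128i r8, uint8_t* dst_argb) {
  __m128i a8 = _mm_set1_epi8(-1);                  // alpha = 0xFF
  __m128i bg = _mm_unpacklo_epi8(b8, g8);          // B0G0 B1G1 ...
  __m128i ra = _mm_unpacklo_epi8(r8, a8);          // R0A0 R1A1 ...
  _mm_storeu_si128((__m128i*)dst_argb,
                   _mm_unpacklo_epi16(bg, ra));    // BGRA pixels 0..3
  _mm_storeu_si128((__m128i*)(dst_argb + 16),
                   _mm_unpackhi_epi16(bg, ra));    // BGRA pixels 4..7
}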
- align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into RGBA - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - punpcklbw xmm1, xmm2 // GR - punpcklbw xmm5, xmm0 // AB - movdqa xmm0, xmm5 - punpcklwd xmm5, xmm1 // RGBA first 4 pixels - punpckhwd xmm0, xmm1 // RGBA next 4 pixels movdqu [edx], xmm5 movdqu [edx + 16], xmm0 lea edx, [edx + 32] @@ -3261,8 +2309,8 @@ void YToARGBRow_SSE2(const uint8* y_buf, punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3289,10 +2337,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { align 4 convertloop: - movdqa xmm0, [eax + ecx] + movdqu xmm0, [eax + ecx] pshufb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -3332,8 +2380,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORROW_SSE2 -// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 -// version can not. __declspec(naked) __declspec(align(16)) void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { @@ -3382,7 +2428,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, align 4 convertloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax - 16] pshufb xmm0, xmm1 sub ecx, 8 @@ -3414,11 +2460,11 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { align 4 convertloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax - 16] pshufb xmm0, xmm5 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -3469,43 +2515,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqa [edx], xmm0 - movdqa [edx + edi], xmm2 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 - convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] @@ -3527,6 +2536,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ret } } + #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 @@ -3582,36 +2592,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, align 4 convertloop: - movdqa xmm0, [eax] // read 16 U's - movdqa xmm1, [eax + edx] // and 16 V's - lea eax, [eax + 16] - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqa [edi], xmm0 - movdqa [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, - uint8* dst_uv, int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 
4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - align 4 - convertloop: movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's lea eax, [eax + 16] @@ -3675,11 +2655,11 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 32 jg convertloop @@ -3688,7 +2668,33 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_SSE2 -// Unaligned Multiple of 1. +#ifdef HAS_COPYROW_AVX +// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. +__declspec(naked) __declspec(align(16)) +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 64 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_COPYROW_AVX + +// Multiple of 1. __declspec(naked) __declspec(align(16)) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { __asm { @@ -3705,6 +2711,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { } #ifdef HAS_COPYROW_X86 +// Multiple of 4. __declspec(naked) __declspec(align(16)) void CopyRow_X86(const uint8* src, uint8* dst, int count) { __asm { @@ -3737,19 +2744,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { align 4 convertloop: - movdqa xmm2, [eax] - movdqa xmm3, [eax + 16] + movdqu xmm2, [eax] + movdqu xmm3, [eax + 16] lea eax, [eax + 32] - movdqa xmm4, [edx] - movdqa xmm5, [edx + 16] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] pand xmm2, xmm0 pand xmm3, xmm0 pand xmm4, xmm1 pand xmm5, xmm1 por xmm2, xmm4 por xmm3, xmm5 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3809,16 +2816,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { punpcklbw xmm2, xmm2 punpckhwd xmm3, xmm2 punpcklwd xmm2, xmm2 - movdqa xmm4, [edx] - movdqa xmm5, [edx + 16] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] pand xmm2, xmm0 pand xmm3, xmm0 pand xmm4, xmm1 pand xmm5, xmm1 por xmm2, xmm4 por xmm3, xmm5 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -4147,113 +3154,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] 
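(Editorial note: the YUY2 kernels being merged here follow directly from the packing — YUY2 stores Y0 U0 Y1 V0, so even bytes are luma and odd bytes chroma. YUY2ToYRow is a mask-and-pack of the even bytes; YUY2ToUVRow first averages two rows with pavgb, which rounds as (a+b+1)>>1, then splits the chroma. A scalar reference with illustrative names; pix is assumed even:)

#include <stdint.h>

// Extract luma: the pand/packuswb even-byte extract, one byte per pixel.
void Yuy2ToYSketch(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[2 * x];
  }
}

// Vertically average two YUY2 rows, then deinterleave U and V planes.
void Yuy2ToUvSketch(const uint8_t* src_yuy2, int stride_yuy2,
                    uint8_t* dst_u, uint8_t* dst_v, int pix) {
  const uint8_t* row1 = src_yuy2 + stride_yuy2;
  for (int x = 0; x < pix; x += 2) {  // one U,V pair per 2 pixels
    dst_u[x / 2] = (uint8_t)((src_yuy2[2 * x + 1] + row1[2 * x + 1] + 1) >> 1);
    dst_v[x / 2] = (uint8_t)((src_yuy2[2 * x + 3] + row1[2 * x + 3] + 1) >> 1);
  }
}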
- movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 4 - convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] @@ -4269,8 +3169,8 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, } __declspec(naked) __declspec(align(16)) -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { __asm { push esi push edi @@ -4313,8 +3213,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, } __declspec(naked) __declspec(align(16)) -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi mov eax, [esp + 4 + 4] // src_yuy2 @@ -4359,111 +3259,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub 
ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - - align 4 - convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] @@ -4479,8 +3274,8 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, } __declspec(naked) __declspec(align(16)) -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { __asm { push esi push edi @@ -4523,8 +3318,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, } __declspec(naked) __declspec(align(16)) -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi mov eax, [esp + 4 + 4] // src_yuy2 @@ -4640,7 +3435,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jge convertloop4 @@ -4756,16 +3551,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. 
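The 4-pixel loop below is an "over" composite for a premultiplied source: each destination channel is faded by the inverse of the source alpha and added to the source, and the result alpha is forced to 255. A plain-C model of one pixel (a sketch of what the pshufb/pmullw/psrlw/paddusb sequence computes; xmm4..xmm7 are initialized outside this hunk and are assumed here to hold the usual libyuv constants):

    // out = src + dst * (256 - src_a) / 256 per B/G/R byte; the real loop's
    // paddusb saturates where this sketch would wrap.
    static void ARGBBlendPixel_sketch(const uint8* src, const uint8* dst,
                                      uint8* out) {
      int a = src[3];
      out[0] = (uint8)(src[0] + ((dst[0] * (256 - a)) >> 8));  // B
      out[1] = (uint8)(src[1] + ((dst[1] * (256 - a)) >> 8));  // G
      out[2] = (uint8)(src[2] + ((dst[2] * (256 - a)) >> 8));  // R
      out[3] = 255;                                            // A
    }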
convertloop4: - movdqa xmm3, [eax] // src argb + movdqu xmm3, [eax] // src argb lea eax, [eax + 16] movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [esi] // _r_b + movdqu xmm2, [esi] // _r_b pshufb xmm3, kShuffleAlpha // alpha pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [esi] // _a_g + movdqu xmm1, [esi] // _a_g lea esi, [esi + 16] psrlw xmm1, 8 // _a_g por xmm0, xmm4 // set alpha to 255 @@ -4775,7 +3570,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jge convertloop4 jmp convertloop4b @@ -4801,7 +3596,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jge convertuloop4 @@ -4857,17 +3652,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm0 // first 2 pshufhw xmm2, xmm0, 0FFh // 8 alpha words pshuflw xmm2, xmm2, 0FFh pmulhuw xmm0, xmm2 // rgb * a - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpckhbw xmm1, xmm1 // next 2 pixels pshufhw xmm2, xmm1, 0FFh // 8 alpha words pshuflw xmm2, xmm2, 0FFh pmulhuw xmm1, xmm2 // rgb * a - movdqa xmm2, [eax] // alphas + movdqu xmm2, [eax] // alphas lea eax, [eax + 16] psrlw xmm0, 8 pand xmm2, xmm4 @@ -4876,7 +3671,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pand xmm0, xmm5 // keep original alphas por xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop @@ -5151,16 +3946,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { align 4 convertloop: - movdqa xmm0, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm0, xmm1 paddw xmm0, xmm5 // Add .5 for rounding. 
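The pmaddubsw/phaddw pair above folds B, G and R into one weighted sum per pixel. The weight vector xmm4 is loaded from a table outside this hunk; assuming libyuv's usual BT.601-style gray coefficients (14/128 B, 74/128 G, 38/128 R), the per-pixel arithmetic is:

    // xmm5 supplies the +64 ("add .5") term before the psrlw 7 that follows.
    static uint8 ARGBGrayPixel_sketch(uint8 b, uint8 g, uint8 r) {
      return (uint8)((14 * b + 74 * g + 38 * r + 64) >> 7);
    }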
psrlw xmm0, 7 packuswb xmm0, xmm0 // 8 G bytes - movdqa xmm2, [eax] // A - movdqa xmm3, [eax + 16] + movdqu xmm2, [eax] // A + movdqu xmm3, [eax + 16] lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 @@ -5173,8 +3968,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { punpcklwd xmm0, xmm3 // GGGA first 4 punpckhwd xmm1, xmm3 // GGGA next 4 sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] jg convertloop ret @@ -5211,30 +4006,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { align 4 convertloop: - movdqa xmm0, [eax] // B - movdqa xmm6, [eax + 16] + movdqu xmm0, [eax] // B + movdqu xmm6, [eax + 16] pmaddubsw xmm0, xmm2 pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 packuswb xmm0, xmm0 // 8 B values - movdqa xmm5, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm5, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 packuswb xmm5, xmm5 // 8 G values punpcklbw xmm0, xmm5 // 8 BG values - movdqa xmm5, [eax] // R - movdqa xmm1, [eax + 16] + movdqu xmm5, [eax] // R + movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 packuswb xmm5, xmm5 // 8 R values - movdqa xmm6, [eax] // A - movdqa xmm1, [eax + 16] + movdqu xmm6, [eax] // A + movdqu xmm1, [eax + 16] psrld xmm6, 24 psrld xmm1, 24 packuswb xmm6, xmm1 @@ -5244,8 +4039,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { punpcklwd xmm0, xmm5 // BGRA first 4 punpckhwd xmm1, xmm5 // BGRA next 4 sub ecx, 8 - movdqa [eax], xmm0 - movdqa [eax + 16], xmm1 + movdqu [eax], xmm0 + movdqu [eax + 16], xmm1 lea eax, [eax + 32] jg convertloop ret @@ -5274,12 +4069,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, align 4 convertloop: - movdqa xmm0, [eax] // B - movdqa xmm7, [eax + 16] + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] pmaddubsw xmm0, xmm2 pmaddubsw xmm7, xmm2 - movdqa xmm6, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 phaddsw xmm0, xmm7 // B @@ -5289,13 +4084,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, packuswb xmm0, xmm0 // 8 B values packuswb xmm6, xmm6 // 8 G values punpcklbw xmm0, xmm6 // 8 BG values - movdqa xmm1, [eax] // R - movdqa xmm7, [eax + 16] + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 phaddsw xmm1, xmm7 // R - movdqa xmm6, [eax] // A - movdqa xmm7, [eax + 16] + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 phaddsw xmm6, xmm7 // A @@ -5308,8 +4103,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, punpcklwd xmm0, xmm1 // BGRA first 4 punpckhwd xmm6, xmm1 // BGRA next 4 sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm6 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 lea eax, [eax + 32] lea edx, [edx + 32] jg convertloop @@ -5342,14 +4137,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm5 // first 2 pixels pmulhuw xmm0, xmm2 // pixel * scale >> 16 - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 pmullw xmm0, xmm3 // * interval_size - movdqa xmm7, [eax] // read 4 pixels + movdqu xmm7, [eax] // read 4 pixels 
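ARGBQuantizeRow posterizes each color channel in place: a channel value is scaled down to one of a few levels, then expanded back to 8 bits, while alpha is masked off (xmm6) and re-inserted untouched. The trailing parameters are truncated in the hunk above; assuming they match libyuv's C version (interval_offset and width), the per-channel math is:

    static uint8 QuantizeChannel_sketch(uint8 v, int scale, int interval_size,
                                        int interval_offset) {
      return (uint8)((v * scale >> 16) * interval_size + interval_offset);
    }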
pmullw xmm1, xmm3 pand xmm7, xmm6 // mask alpha paddw xmm0, xmm4 // + interval_size / 2 @@ -5357,7 +4152,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, packuswb xmm0, xmm1 por xmm0, xmm7 sub ecx, 4 - movdqa [eax], xmm0 + movdqu [eax], xmm0 lea eax, [eax + 16] jg convertloop ret @@ -5381,7 +4176,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 // first 2 @@ -5392,7 +4187,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, psrlw xmm1, 8 packuswb xmm0, xmm1 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop @@ -5749,8 +4544,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqa xmm2, xmm0 // GG @@ -5767,10 +4562,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, por xmm3, xmm5 // GGGA por xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm1 - movdqa [edx + 16], xmm2 - movdqa [edx + 32], xmm3 - movdqa [edx + 48], xmm0 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 lea edx, [edx + 64] jg convertloop @@ -5795,12 +4590,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop @@ -5830,8 +4625,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 paddusb xmm2, xmm1 // sobel = sobelx + sobely @@ -5848,10 +4643,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, punpcklwd xmm7, xmm0 // Next 4 punpckhwd xmm1, xmm0 // Last 4 sub ecx, 16 - movdqa [edx], xmm6 - movdqa [edx + 16], xmm4 - movdqa [edx + 32], xmm7 - movdqa [edx + 48], xmm1 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 lea edx, [edx + 64] jg convertloop @@ -5907,10 +4702,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, align 4 s4: // top left - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] @@ -5950,10 +4745,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, align 4 l4: // top left - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + 
movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] @@ -6002,7 +4797,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop align 4 l1: - movdqa xmm0, [eax] + movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] lea eax, [eax + 16] psubd xmm0, [esi] @@ -6058,26 +4853,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, punpckhwd xmm5, xmm1 paddd xmm0, xmm2 - movdqa xmm2, [esi] // previous row above. + movdqu xmm2, [esi] // previous row above. paddd xmm2, xmm0 paddd xmm0, xmm3 - movdqa xmm3, [esi + 16] + movdqu xmm3, [esi + 16] paddd xmm3, xmm0 paddd xmm0, xmm4 - movdqa xmm4, [esi + 32] + movdqu xmm4, [esi + 32] paddd xmm4, xmm0 paddd xmm0, xmm5 - movdqa xmm5, [esi + 48] + movdqu xmm5, [esi + 48] lea esi, [esi + 64] paddd xmm5, xmm0 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 - movdqa [edx + 32], xmm4 - movdqa [edx + 48], xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 lea edx, [edx + 64] sub ecx, 4 @@ -6296,7 +5091,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_AVX2 -#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, @@ -6332,225 +5126,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, align 4 xloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 4 - xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 4 - xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 4 - xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 4 - xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - cmp eax, 64 - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - cmp eax, 192 - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
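The cmp/je ladder above appears in every InterpolateRow variant: source_y_fraction is a fraction of 256, and the four common cases (0, 64, 128, 192) branch to multiply-free pavgb loops. The general path computes a weighted average of the two rows; per byte (a C sketch of the arithmetic, not libyuv's C fallback):

    // f = source_y_fraction in [0, 255]; f = 0 copies row0, 128 blends 50/50.
    static void InterpolateRow_sketch(uint8* dst, const uint8* src0,
                                      const uint8* src1, int width, int f) {
      int x;
      for (x = 0; x < width; ++x) {
        dst[x] = (uint8)((src0[x] * (256 - f) + src1[x] * f) >> 8);
      }
    }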
- - movd xmm5, eax // xmm5 = y fraction - punpcklbw xmm5, xmm5 - psrlw xmm5, 1 - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - punpcklqdq xmm5, xmm5 - pxor xmm4, xmm4 - - align 4 - xloop: - movdqa xmm0, [esi] // row0 - movdqa xmm2, [esi + edx] // row1 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - psubw xmm2, xmm0 // row1 - row0 - psubw xmm3, xmm1 - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 - paddw xmm3, xmm3 - pmulhw xmm2, xmm5 // scale diff - pmulhw xmm3, xmm5 - paddw xmm0, xmm2 // sum rows - paddw xmm1, xmm3 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 4 - xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 4 - xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 4 - xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 4 - xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} -#endif // HAS_INTERPOLATEROW_SSE2 - -// Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - shr eax, 1 - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 128. Blend 100 / 0. - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. - cmp eax, 64 - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 
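Unlike the SSE2 version above, the SSSE3 path first halves the fraction (the shr eax, 1) so the two weights fit in a byte pair, letting a single pmaddubsw compute row0 * (128 - f) + row1 * f for every byte. Conceptually (a sketch; rounding in the real code comes from the 7-bit shift):

    int f7 = source_y_fraction >> 1;  // 0..127
    dst[x] = (uint8)((src0[x] * (128 - f7) + src1[x] * f7) >> 7);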
- - movd xmm0, eax // high fraction 0..127 - neg eax - add eax, 128 - movd xmm5, eax // low fraction 128..1 - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - - align 4 - xloop: movdqu xmm0, [esi] movdqu xmm2, [esi + edx] movdqu xmm1, xmm0 @@ -6624,9 +5199,9 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi @@ -6735,58 +5310,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, #endif // HAS_INTERPOLATEROW_SSE2 __declspec(naked) __declspec(align(16)) -void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // src_uv_stride - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - sub edi, eax - - align 4 - convertloop: - movdqa xmm0, [eax] - pavgb xmm0, [eax + edx] - sub ecx, 16 - movdqa [eax + edi], xmm0 - lea eax, [eax + 16] - jg convertloop - pop edi - ret - } -} - -#ifdef HAS_HALFROW_AVX2 -__declspec(naked) __declspec(align(16)) -void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // src_uv_stride - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - sub edi, eax - - align 4 - convertloop: - vmovdqu ymm0, [eax] - vpavgb ymm0, ymm0, [eax + edx] - sub ecx, 32 - vmovdqu [eax + edi], ymm0 - lea eax, [eax + 32] - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_HALFROW_AVX2 - -__declspec(naked) __declspec(align(16)) void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { __asm { @@ -6798,8 +5321,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pshufb xmm0, xmm5 pshufb xmm1, xmm5 @@ -6826,8 +5349,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] psrld xmm0, 8 // Move green to bottom. 
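ARGB pixels are stored B, G, R, A in memory, so read as a little-endian 32-bit word the green channel occupies bits 8..15; the psrld above (and its pair that follows) drops B so green can be masked and packed into the Bayer row. A one-pixel C model:

    // argb is a little-endian BGRA dword: (a << 24) | (r << 16) | (g << 8) | b.
    static uint8 ExtractGreen_sketch(uint32 argb) {
      return (uint8)(argb >> 8);
    }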
psrld xmm1, 8 @@ -6851,33 +5374,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler - movdqa xmm5, [ecx] - mov ecx, [esp + 16] // pix - - align 4 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm5 - sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - jg wloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - movdqa xmm5, [ecx] + movdqu xmm5, [ecx] mov ecx, [esp + 16] // pix align 4 diff --git a/source/scale.cc b/source/scale.cc index 5b33b5f..09b8a2c 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -57,20 +57,15 @@ static void ScalePlaneDown2(int src_width, int src_height, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; } -#elif defined(HAS_SCALEROWDOWN2_SSE2) +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 : - ScaleRowDown2Box_Unaligned_SSE2); - if (IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : - ScaleRowDown2Box_SSE2); - } + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); } -#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -112,21 +107,15 @@ static void ScalePlaneDown2_16(int src_width, int src_height, ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; } -#elif defined(HAS_SCALEROWDOWN2_16_SSE2) +#endif +#if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? - ScaleRowDown2_Unaligned_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 : - ScaleRowDown2Box_Unaligned_16_SSE2); - if (IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : - ScaleRowDown2Box_16_SSE2); - } + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : + (filtering == kFilterLinear ? 
ScaleRowDown2Linear_16_SSE2 : + ScaleRowDown2Box_16_SSE2); } -#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -168,13 +157,13 @@ static void ScalePlaneDown4(int src_width, int src_height, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } -#elif defined(HAS_SCALEROWDOWN4_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; } -#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -212,14 +201,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height, ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } -#elif defined(HAS_SCALEROWDOWN4_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } -#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -271,8 +260,7 @@ static void ScalePlaneDown34(int src_width, int src_height, } #endif #if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_SSSE3; @@ -351,8 +339,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } #endif #if defined(HAS_SCALEROWDOWN34_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; @@ -445,9 +432,9 @@ static void ScalePlaneDown38(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; } } -#elif defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; @@ -456,7 +443,8 @@ static void ScalePlaneDown38(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } } -#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) +#endif +#if 
defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -522,9 +510,9 @@ static void ScalePlaneDown38_16(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; } } -#elif defined(HAS_SCALEROWDOWN38_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; @@ -533,7 +521,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; } } -#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -758,11 +747,11 @@ static void ScalePlaneBox(int src_width, int src_height, uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; #if defined(HAS_SCALEADDROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) #ifdef AVOID_OVERREAD - IS_ALIGNED(src_width, 16) && + && IS_ALIGNED(src_width, 16) #endif - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ) { ScaleAddRows = ScaleAddRows_SSE2; } #endif @@ -830,11 +819,11 @@ static void ScalePlaneBox_16(int src_width, int src_height, uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C; #if defined(HAS_SCALEADDROWS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) #ifdef AVOID_OVERREAD - IS_ALIGNED(src_width, 16) && + && IS_ALIGNED(src_width, 16) #endif - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ) { ScaleAddRows = ScaleAddRows_16_SSE2; } #endif @@ -889,10 +878,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif @@ -900,10 +886,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -991,10 +974,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSE2; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } + InterpolateRow = InterpolateRow_16_SSE2; } } #endif @@ -1002,10 +982,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = 
InterpolateRow_Unaligned_16_SSSE3; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } + InterpolateRow = InterpolateRow_16_SSSE3; } } #endif @@ -1090,10 +1067,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif @@ -1101,10 +1075,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1144,9 +1115,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleFilterCols = ScaleColsUp2_SSE2; } #endif @@ -1229,10 +1198,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSE2; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } + InterpolateRow = InterpolateRow_16_SSE2; } } #endif @@ -1240,10 +1206,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } + InterpolateRow = InterpolateRow_16_SSSE3; } } #endif @@ -1283,9 +1246,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_16_C; #if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif @@ -1366,9 +1327,7 @@ static void ScalePlaneSimple(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleCols = ScaleColsUp2_SSE2; } #endif @@ -1401,9 +1360,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_16_C; #if 
defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleCols = ScaleColsUp2_16_SSE2; } #endif diff --git a/source/scale_argb.cc b/source/scale_argb.cc index e339cd7..a798cad 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -53,16 +53,14 @@ static void ScaleARGBDown2(int src_width, int src_height, } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : ScaleARGBRowDown2Box_SSE2); } -#elif defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : ScaleARGBRowDown2_NEON; } @@ -98,14 +96,12 @@ static void ScaleARGBDown4Box(int src_width, int src_height, assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. #if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; } -#elif defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; } #endif @@ -139,14 +135,13 @@ static void ScaleARGBDownEven(int src_width, int src_height, assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; } -#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_argb, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; } @@ -193,10 +188,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif @@ -204,10 +196,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -289,10 +278,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif @@ -300,10 +286,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -346,9 +329,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif @@ -430,10 +411,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -470,10 +448,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif @@ -481,10 +456,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = 
InterpolateRow_SSSE3; } } #endif @@ -531,9 +503,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif @@ -640,9 +610,7 @@ static void ScaleARGBSimple(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif diff --git a/source/scale_common.cc b/source/scale_common.cc index e4b2acc..459c61a 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -888,11 +888,7 @@ void ScalePlaneVertical(int src_height, if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif @@ -900,11 +896,7 @@ void ScalePlaneVertical(int src_height, if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -970,11 +962,7 @@ void ScalePlaneVertical_16(int src_height, if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSE2; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } + InterpolateRow = InterpolateRow_16_SSE2; } } #endif @@ -982,11 +970,7 @@ void ScalePlaneVertical_16(int src_height, if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } + InterpolateRow = InterpolateRow_16_SSSE3; } } #endif diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 1b8a5ba..7921219 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -16,7 +16,8 @@ extern "C" { #endif // This module is for GCC Neon. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) // NEON downscalers with interpolation. 
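Two patterns recur through the scale.cc, scale_argb.cc and scale_common.cc hunks above: the SIMD dispatch chains change from #elif to independent #if blocks, so NEON, SSE2 and MIPS candidates can all be compiled into one build and selected by CPU flag at runtime, and the pointer/stride alignment preconditions disappear now that the row functions use unaligned loads and stores. Separately, scale_neon.cc is fenced to 32-bit ARM, with the new scale_neon64.cc below supplying A64 equivalents. The resulting dispatch shape (condensed from the ScalePlaneDown2 hunk above):

    #if defined(HAS_SCALEROWDOWN2_NEON)
      if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
        ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
      }
    #endif
    #if defined(HAS_SCALEROWDOWN2_SSE2)
      if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
        ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
            (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
             ScaleRowDown2Box_SSE2);
      }
    #endif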
// Provided by Fritz Koenig @@ -756,7 +757,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } -#endif // __ARM_NEON__ +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc new file mode 100644 index 0000000..933abd4 --- /dev/null +++ b/source/scale_neon64.cc @@ -0,0 +1,766 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into v0, odd into v1 + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc + MEMACCESS(1) + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + MEMACCESS(3) + "ld1 {v1.16b}, [%2], #16 \n" + MEMACCESS(4) + "ld1 {v2.16b}, [%3], #16 \n" + MEMACCESS(5) + "ld1 {v3.16b}, [%4], #16 \n" + "subs %5, %5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + 
"uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + MEMACCESS(1) + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %2, %2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", + "v20", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + MEMACCESS(1) + "st3 
{v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" + ); +} + +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "ld1 {v3.16b}, [%3] \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %2, %2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + ptrdiff_t tmp_src_stride = src_stride; + + asm volatile ( + MEMACCESS(5) + "ld1 {v29.8h}, [%5] \n" + MEMACCESS(6) + "ld1 {v30.16b}, [%6] \n" + MEMACCESS(7) + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + MEMACCESS(4) + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %4, %4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. 
This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + MEMACCESS(1) + "st1 {v3.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", + "v30", "v31", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + // TODO(fbarchard): use src_stride directly for clang 3.5+. + ptrdiff_t tmp_src_stride = src_stride; + asm volatile ( + MEMACCESS(4) + "ld1 {v30.8h}, [%4] \n" + MEMACCESS(5) + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %3, %3, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
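The "multiply by 65536 / n" comments in these 38-to-12 kernels describe a fixed-point division by a non-power-of-2: sqrdmulh doubles the 16-bit product and keeps the high half with rounding, so multiplying by kMult38_Div6 = 65536 / 12 divides by 6, and kMult38_Div9 = 65536 / 18 divides by 9. A scalar sketch of the same trick (ignoring sqrdmulh's saturation):

    // sqrdmulh(x, m) == (2 * x * m + (1 << 15)) >> 16; with m = 65536 / 12
    // the net factor is 2 * 5461 / 65536, i.e. ~1/6.
    static int16 DivideBy6_sketch(int16 x) {
      return (int16)((2 * x * (65536 / 12) + (1 << 15)) >> 16);
    }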
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+
+ // Align for table lookup; vtbl requires registers to
+ // be adjacent.
+
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+ MEMACCESS(1)
+ "st1 {v3.8b}, [%1], #8 \n"
+ MEMACCESS(1)
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v30", "v31", "memory", "cc"
+ );
+}
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction;
+ asm volatile (
+ "cmp %4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %3, %3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %3, %3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %3, %3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %3, %3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
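+ // (source_y_fraction == 0: the output equals the first source row, so just copy it.)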
+ "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + MEMACCESS(0) + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction),// %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" + ); +} + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS (0) + "ld2 {v0.4s, v1.4s}, [%0], #32 \n" + MEMACCESS (0) + "ld2 {v2.4s, v3.4s}, [%0], #32 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS (1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + MEMACCESS (1) + "st1 {v3.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (dst), // %1 + "+r" (dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS (0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + MEMACCESS (1) + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + MEMACCESS (2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (src_stride), // %1 + "+r" (dst), // %2 + "+r" (dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx * 4) // %3 + : "memory", "cc", "v0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +// TODO, might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. 
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "ld1 {v1.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v3.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v5.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx * 4) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_posix.cc b/source/scale_posix.cc index 352e667..92e3354 100644 --- a/source/scale_posix.cc +++ b/source/scale_posix.cc @@ -101,110 +101,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 
\n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -226,9 +122,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -236,7 +131,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "psrlw $0x8,%%xmm0 \n" @@ -262,9 +157,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ); } -void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -315,8 +209,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" @@ -348,18 +242,18 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 + MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 - MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 - MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 + MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 + MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb 
%%xmm4,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" @@ -412,8 +306,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm2,%%xmm1 \n" "palignr $0x8,%%xmm0,%%xmm1 \n" @@ -461,8 +355,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm6 \n" @@ -479,9 +373,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm4,%%xmm6 \n" @@ -533,8 +427,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 "pavgb %%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm2,%%xmm6 \n" @@ -553,8 +447,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" @@ -590,8 +484,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm4,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" @@ -631,9 +525,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" + "pavgb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm1 \n" "movdqa %%xmm0,%%xmm6 \n" @@ -679,8 +574,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 "movhlps %%xmm0,%%xmm1 \n" "movhlps %%xmm6,%%xmm7 \n" "punpcklbw %%xmm5,%%xmm0 \n" @@ -689,7 +584,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, "punpcklbw %%xmm5,%%xmm7 \n" "paddusw %%xmm6,%%xmm0 \n" "paddusw %%xmm7,%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 + 
MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 "lea " MEMLEA(0x10,0) ",%0 \n" "movhlps %%xmm6,%%xmm7 \n" "punpcklbw %%xmm5,%%xmm6 \n" @@ -741,7 +636,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "mov %0,%3 \n" "add %6,%0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -753,7 +648,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "2: \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "add %6,%0 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" @@ -765,8 +660,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "3: \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x10,3) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x10,%4 \n" @@ -870,14 +765,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" "sub $0x20,%2 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "jg 1b \n" @@ -898,12 +793,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "shufps $0xdd,%%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -923,15 +818,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -951,11 +846,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" @@ -964,7 +859,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -1003,7 +898,7 @@ void 
ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, "punpckldq %%xmm3,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -1056,7 +951,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -1156,14 +1051,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "movdqa %%xmm0,%%xmm1 \n" "punpckldq %%xmm0,%%xmm0 \n" "punpckhdq %%xmm1,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "jg 1b \n" diff --git a/source/scale_win.cc b/source/scale_win.cc index 840b973..8370ef4 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -105,117 +105,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 4 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 4 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -// Reads 32 pixels, throws half away and writes 16 pixels. 
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 4 - wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] @@ -234,9 +123,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, // Blends 32x1 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) -void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride @@ -273,9 +161,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) -void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr @@ -331,8 +218,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pand xmm0, xmm5 pand xmm1, xmm5 @@ -366,16 +253,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, [eax + esi * 2] - movdqa xmm3, [eax + esi * 2 + 16] - movdqa xmm4, [eax + edi] - movdqa xmm5, [eax + edi + 16] + movdqu xmm2, [eax + esi * 2] + movdqu xmm3, [eax + esi * 2 + 16] + movdqu xmm4, [eax + edi] + movdqu xmm5, [eax + edi + 16] lea eax, [eax + 32] pavgb xmm2, xmm4 pavgb xmm3, xmm5 @@ -429,8 +316,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm1 palignr xmm1, xmm0, 8 @@ -483,8 +370,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, align 4 wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 pmaddubsw xmm0, xmm5 @@ -501,8 +388,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm1 pshufb xmm0, xmm4 @@ -542,8 +429,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, align 4 wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm2 @@ -562,8 +449,8 
@@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -599,8 +486,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, align 4 xloop: - movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 lea eax, [eax + 32] pshufb xmm0, xmm4 pshufb xmm1, xmm5 @@ -635,8 +522,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, align 4 xloop: - movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqa xmm6, [eax + esi] + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm6, [eax + esi] movhlps xmm1, xmm0 movhlps xmm7, xmm6 punpcklbw xmm0, xmm5 @@ -645,7 +532,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, punpcklbw xmm7, xmm5 paddusw xmm0, xmm6 paddusw xmm1, xmm7 - movdqa xmm6, [eax + esi * 2] + movdqu xmm6, [eax + esi * 2] lea eax, [eax + 16] movhlps xmm7, xmm6 punpcklbw xmm6, xmm5 @@ -701,9 +588,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, align 4 xloop: - movdqa xmm0, [eax] // average 2 rows into xmm0 - pavgb xmm0, [eax + esi] + movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm1, [eax + esi] lea eax, [eax + 16] + pavgb xmm0, xmm1 movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 pshufb xmm1, xmm2 @@ -750,7 +638,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 xloop: // first row - movdqa xmm0, [esi] + movdqu xmm0, [esi] lea eax, [esi + edx] movdqa xmm1, xmm0 punpcklbw xmm0, xmm4 @@ -763,7 +651,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // sum remaining rows align 4 yloop: - movdqa xmm2, [eax] // read 16 pixels + movdqu xmm2, [eax] // read 16 pixels lea eax, [eax + edx] // advance to next row movdqa xmm3, xmm2 punpcklbw xmm2, xmm4 @@ -775,8 +663,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 ydone: - movdqa [edi], xmm0 - movdqa [edi + 16], xmm1 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 lea edi, [edi + 32] sub ecx, 16 @@ -891,14 +779,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, align 4 wloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 punpckhbw xmm1, xmm1 sub ecx, 32 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] jg wloop @@ -920,12 +808,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] shufps xmm0, xmm1, 0xdd sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -947,15 +835,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 shufps xmm0, xmm1, 0x88 // even pixels shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -978,10 +866,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, 
[eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 @@ -990,7 +878,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -1027,7 +915,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, punpckldq xmm2, xmm3 punpcklqdq xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -1076,7 +964,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -1267,14 +1155,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 punpckldq xmm0, xmm0 punpckhdq xmm1, xmm1 sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] jg wloop diff --git a/sync_chromium.py b/sync_chromium.py new file mode 100755 index 0000000..65353c3 --- /dev/null +++ b/sync_chromium.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# Copyright 2014 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +import argparse +import os +import subprocess +import sys + +# Bump this whenever the algorithm changes and you need bots/devs to re-sync, +# ignoring the .last_sync_chromium file +SCRIPT_VERSION = 1 + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def get_target_os_list(): + try: + main_gclient = os.path.join(os.path.dirname(ROOT_DIR), '.gclient') + config_dict = {} + with open(main_gclient, 'rb') as deps_content: + exec(deps_content, config_dict) + return ','.join(config_dict.get('target_os', [])) + except Exception as e: + print >> sys.stderr, "error while parsing .gclient:", e + + +def main(): + CR_DIR = os.path.join(ROOT_DIR, 'chromium') + + p = argparse.ArgumentParser() + p.add_argument('--target-revision', required=True, + help='The target chromium git revision [REQUIRED]') + p.add_argument('--chromium-dir', default=CR_DIR, + help=('The path to the chromium directory to sync ' + '(default: %(default)r)')) + opts = p.parse_args() + opts.chromium_dir = os.path.abspath(opts.chromium_dir) + + target_os_list = get_target_os_list() + + # Do a quick check to see if we were successful last time to make runhooks + # sooper fast. + flag_file = os.path.join(opts.chromium_dir, '.last_sync_chromium') + flag_file_content = '\n'.join([ + str(SCRIPT_VERSION), + opts.target_revision, + repr(target_os_list), + ]) + if os.path.exists(flag_file): + with open(flag_file, 'r') as f: + if f.read() == flag_file_content: + print "Chromium already up to date:", opts.target_revision + return 0 + os.unlink(flag_file) + + # To avoid gclient sync problems when DEPS entries have been removed we must + # wipe the .gclient_entries file that contains cached URLs for all DEPS. 
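+  # (Otherwise gclient may act on cached entries for checkouts whose DEPS
+  # entries no longer exist.)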
+ entries_file = os.path.join(opts.chromium_dir, '.gclient_entries') + if os.path.exists(entries_file): + os.unlink(entries_file) + + env = os.environ.copy() + env['GYP_CHROMIUM_NO_ACTION'] = '1' + gclient_cmd = 'gclient.bat' if sys.platform.startswith('win') else 'gclient' + args = [ + gclient_cmd, 'sync', '--force', '--revision', 'src@'+opts.target_revision + ] + + if os.environ.get('CHROME_HEADLESS') == '1': + args.append('-vvv') + + if sys.platform.startswith('win'): + cache_path = os.path.join(os.path.splitdrive(ROOT_DIR)[0] + os.path.sep, + 'b', 'git-cache') + else: + cache_path = '/b/git-cache' + + gclientfile = os.path.join(opts.chromium_dir, '.gclient') + with open(gclientfile, 'rb') as spec: + spec = spec.read().splitlines() + spec[-1] = 'cache_dir = %r' % (cache_path,) + with open(gclientfile + '.bot', 'wb') as f: + f.write('\n'.join(spec)) + + args += [ + '--gclientfile', '.gclient.bot', + '--delete_unversioned_trees', '--reset', '--upstream' + ] + else: + args.append('--no-history') + + if target_os_list: + args += ['--deps=' + target_os_list] + + print 'Running "%s" in %s' % (' '.join(args), opts.chromium_dir) + ret = subprocess.call(args, cwd=opts.chromium_dir, env=env) + if ret == 0: + with open(flag_file, 'wb') as f: + f.write(flag_file_content) + + return ret + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/tools/lsan/suppressions.txt b/tools/lsan/suppressions.txt new file mode 100644 index 0000000..fa66e9a --- /dev/null +++ b/tools/lsan/suppressions.txt @@ -0,0 +1,10 @@ +# This is a suppressions file that must exist in order for the Leak Sanitizer +# tool that runs on the ASan bot to be able to run with the default +# configuration. More info about LSan on +# http://www.chromium.org/developers/testing/leaksanitizer + +#### Third-party leaks #### + +#### Actual bugs in Libyuv code #### + + diff --git a/tools/sanitizer_options.gyp b/tools/sanitizer_options.gyp index ea453d3..e69de29 100644 --- a/tools/sanitizer_options.gyp +++ b/tools/sanitizer_options.gyp @@ -1,59 +0,0 @@ -# Copyright 2014 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# This is a similar target to the one in Chromium's base.gyp. It's needed to get -# the same sanitizer settings as Chromium uses (i.e. ASan, LSan, TSan...). -{ - 'targets': [ - { - 'target_name': 'sanitizer_options', - 'type': 'static_library', - 'toolsets': ['host', 'target'], - 'variables': { - # Every target is going to depend on sanitizer_options, so allow - # this one to depend on itself. - 'prune_self_dependency': 1, - # Do not let 'none' targets depend on this one, they don't need to. - 'link_dependency': 1, - }, - 'sources': [ - 'sanitizer_options/sanitizer_options.cc', - ], - 'include_dirs': [ - '<(DEPTH)', - ], - # Some targets may want to opt-out from ASan, TSan and MSan and link - # without the corresponding runtime libraries. We drop the libc++ - # dependency and omit the compiler flags to avoid bringing instrumented - # code to those targets. 
- 'conditions': [ - ['use_custom_libcxx==1', { - 'dependencies!': [ - '<(DEPTH)/third_party/libc++/libc++.gyp:libcxx_proxy', - ], - }], - ['tsan==1', { - 'sources': [ - 'tsan_suppressions/tsan_suppressions.cc', - ], - }], - ], - 'cflags!': [ - '-fsanitize=address', - '-fsanitize=thread', - '-fsanitize=memory', - '-fsanitize-memory-track-origins', - ], - 'direct_dependent_settings': { - 'ldflags': [ - '-Wl,-u_sanitizer_options_link_helper', - ], - }, - }, - ], # targets -} diff --git a/tools/supplement.gypi b/tools/supplement.gypi deleted file mode 100644 index c1cff3f..0000000 --- a/tools/supplement.gypi +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2011 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# Supplement file for libyuv. To be processed, this file needs to be located in -# the first level of directories below the gyp_libyuv file. - -# This is needed to workaround the otherwise failing include of base.gyp in -# Chromium's common.gypi when a sanitizer tool is used. -{ - 'variables': { - 'use_sanitizer_options': 0, - }, - 'target_defaults': { - 'conditions': [ - # Add default sanitizer options similar to Chromium. This is needed to get - # the sanitizer options that has LeakSanitizer disabled by default. - # Otherwise yasm will throw leak errors during compile when - # GYP_DEFINES="asan=1". - ['OS=="linux" and (chromeos==0 or target_arch!="ia32")', { - 'dependencies': [ - '<(DEPTH)/tools/sanitizer_options.gyp:sanitizer_options', - ], - }], - ], - }, -} diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index 141445e..464e255 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -244,6 +244,32 @@ TEST_F(libyuvTest, BenchmarkPsnr_Opt) { free_aligned_buffer_64(src_b); } + +TEST_F(libyuvTest, BenchmarkPsnr_Unaligned) { + align_buffer_64(src_a, benchmark_width_ * benchmark_height_ + 1); + align_buffer_64(src_b, benchmark_width_ * benchmark_height_); + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + src_a[i + 1] = i; + src_b[i] = i; + } + + MaskCpuFlags(-1); + + double opt_time = get_time(); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFramePsnr(src_a + 1, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); + + opt_time = (get_time() - opt_time) / benchmark_iterations_; + printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6); + + EXPECT_EQ(0, 0); + + free_aligned_buffer_64(src_a); + free_aligned_buffer_64(src_b); +} + TEST_F(libyuvTest, Psnr) { const int kSrcWidth = benchmark_width_; const int kSrcHeight = benchmark_height_; diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 9948074..fd82ed1 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -66,7 +66,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ srandom(time(NULL)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ - src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + src_y[i * kWidth + j + OFF] = (random() & 0xff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ @@ -203,7 +203,7 @@ 
TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ srandom(time(NULL)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ - src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + src_y[i * kWidth + j + OFF] = (random() & 0xff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ @@ -316,7 +316,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ srandom(time(NULL)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ - src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + src_y[i * kWidth + j + OFF] = (random() & 0xff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ @@ -430,8 +430,8 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ align_buffer_64(src_y, kWidth * kHeight + OFF); \ align_buffer_64(src_u, kSizeUV + OFF); \ align_buffer_64(src_v, kSizeUV + OFF); \ - align_buffer_64(dst_argb_c, kStrideB * kHeight); \ - align_buffer_64(dst_argb_opt, kStrideB * kHeight); \ + align_buffer_64(dst_argb_c, kStrideB * kHeight + OFF); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeight + OFF); \ srandom(time(NULL)); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y[i + OFF] = (random() & 0xff); \ @@ -440,20 +440,20 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ src_u[i + OFF] = (random() & 0xff); \ src_v[i + OFF] = (random() & 0xff); \ } \ - memset(dst_argb_c, 1, kStrideB * kHeight); \ - memset(dst_argb_opt, 101, kStrideB * kHeight); \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ MaskCpuFlags(0); \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - dst_argb_c, kStrideB, \ + dst_argb_c + OFF, kStrideB, \ kWidth, NEG kHeight); \ MaskCpuFlags(-1); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - dst_argb_opt, kStrideB, \ + dst_argb_opt + OFF, kStrideB, \ kWidth, NEG kHeight); \ } \ int max_diff = 0; \ @@ -462,10 +462,10 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \ memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \ memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \ - FMT_B##To##FMT_C(dst_argb_c, kStrideB, \ + FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \ dst_argb32_c, kWidth * BPP_C , \ kWidth, kHeight); \ - FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \ + FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \ dst_argb32_opt, kWidth * BPP_C , \ kWidth, kHeight); \ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ @@ -538,10 +538,10 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ srandom(time(NULL)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ - src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + src_y[i * kWidth + j + OFF] = (random() & 0xff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ - src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \ + src_uv[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j + OFF] = \ (random() & 0xff); \ } \ } \ @@ -714,7 +714,7 @@ TEST_F(libyuvTest, 
FMT_A##To##FMT_PLANAR##N) { \ benchmark_width_, DIFF, _Opt, +, 0) TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) -#ifdef __arm__ +#if defined(__arm__) || defined (__aarch64__) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4) #else TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0) @@ -1163,7 +1163,7 @@ TEST_F(libyuvTest, CropNV12) { const int kWidth = benchmark_width_; const int kHeight = benchmark_height_; const int crop_y = - (benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2; + ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1; const int kDestWidth = benchmark_width_; const int kDestHeight = benchmark_height_ - crop_y * 2;; const int sample_size = kWidth * kHeight + @@ -1189,16 +1189,12 @@ TEST_F(libyuvTest, CropNV12) { SUBSAMPLE(kDestHeight, SUBSAMP_Y)); srandom(time(NULL)); - for (int i = 0; i < kHeight; ++i) - for (int j = 0; j < kWidth; ++j) - src_y[(i * kWidth) + j] = (random() & 0xff); - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { - src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) + j * 2 + 0] = - (random() & 0xff); - src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) + j * 2 + 1] = - (random() & 0xff); - } + for (int i = 0; i < kHeight * kWidth; ++i) { + src_y[i] = (random() & 0xff); + } + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * + SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++i) { + src_uv[i] = (random() & 0xff); } memset(dst_y, 1, kDestWidth * kDestHeight); memset(dst_u, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * @@ -1211,13 +1207,6 @@ TEST_F(libyuvTest, CropNV12) { memset(dst_v_2, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - NV12ToI420(src_y + crop_y * kWidth, kWidth, - src_uv + (crop_y / 2) * kWidth, kWidth, - dst_y, kDestWidth, - dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), - dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X), - kDestWidth, kDestHeight); - ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X), @@ -1227,6 +1216,13 @@ TEST_F(libyuvTest, CropNV12) { kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12); + NV12ToI420(src_y + crop_y * kWidth, kWidth, + src_uv + (crop_y / 2) * kWidth, kWidth, + dst_y, kDestWidth, + dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), + dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X), + kDestWidth, kDestHeight); + for (int i = 0; i < kDestHeight; ++i) { for (int j = 0; j < kDestWidth; ++j) { EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]); diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 7f8b748..fd3965c 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -13,6 +13,7 @@ #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" +#include "libyuv/row.h" // For HAS_ARGBSHUFFLEROW_AVX2. #include "libyuv/version.h" #include "../unit_test/unit_test.h" @@ -51,6 +52,26 @@ TEST_F(libyuvTest, TestCpuHas) { printf("Has MIPS DSPR2 %x\n", has_mips_dspr2); } +TEST_F(libyuvTest, TestCompilerHasAVX2) { +#ifdef _MSC_VER +printf("_MSC_VER %d\n", _MSC_VER); +#endif +#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2)) + printf("Has AVX2 1\n"); + // If compiler supports AVX2, the following function is expected to exist: +#if !defined(HAS_ARGBSHUFFLEROW_AVX2) + EXPECT_TRUE(0); // HAS_ARGBSHUFFLEROW_AVX2 was expected. +#endif +#else + printf("Has AVX2 0\n"); + // If compiler does not support AVX2, the following function not expected: +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + EXPECT_TRUE(0); // HAS_ARGBSHUFFLEROW_AVX2 was not expected. 
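+  // (If this fires, the compiler-capability macros above and row.h's
+  // HAS_ARGBSHUFFLEROW_AVX2 gating have drifted out of sync.)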
+#endif +#endif +} + #if defined(__i386__) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_X64) TEST_F(libyuvTest, TestCpuId) { @@ -105,6 +126,7 @@ TEST_F(libyuvTest, TestLinuxNeon) { if (FileExists("../../unit_test/testdata/arm_v7.txt")) { EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt")); EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt")); + EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt")); } else { printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n"); } diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index 6a2bc79..bbeb4f8 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -223,6 +223,7 @@ TEST_FACTOR(2, 1 / 2, 1 / 2) TEST_FACTOR(4, 1 / 4, 1 / 4) TEST_FACTOR(8, 1 / 8, 1 / 8) TEST_FACTOR(3by4, 3 / 4, 3 / 4) +TEST_FACTOR(3by8, 3 / 8, 3 / 8) #undef TEST_FACTOR1 #undef TEST_FACTOR diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 00f0707..5d08365 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -288,6 +288,7 @@ TEST_FACTOR(2, 1 / 2, 1 / 2) TEST_FACTOR(4, 1 / 4, 1 / 4) TEST_FACTOR(8, 1 / 8, 1 / 8) TEST_FACTOR(3by4, 3 / 4, 3 / 4) +TEST_FACTOR(3by8, 3 / 8, 3 / 8) #undef TEST_FACTOR1 #undef TEST_FACTOR diff --git a/unit_test/testdata/juno.txt b/unit_test/testdata/juno.txt new file mode 100644 index 0000000..c275be7 --- /dev/null +++ b/unit_test/testdata/juno.txt @@ -0,0 +1,15 @@ +Processor : AArch64 Processor rev 0 (aarch64)
+processor : 0
+processor : 1
+processor : 2
+processor : 3
+processor : 4
+processor : 5
+Features : fp asimd evtstrm aes pmull sha1 sha2 crc32
+CPU implementer : 0x41
+CPU architecture: AArch64
+CPU variant : 0x0
+CPU part : 0xd07
+CPU revision : 0
+
+Hardware : Juno
diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h index cfce548..0151796 100644 --- a/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -26,11 +26,14 @@ static __inline int Abs(int v) { return v >= 0 ? v : -v; } +#define OFFBY 0 + #define align_buffer_page_end(var, size) \ uint8* var; \ uint8* var##_mem; \ - var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \ - var = var##_mem + (-(size) & 4095); + var##_mem = reinterpret_cast<uint8*>(malloc((((size) + 4095) & ~4095) + \ + OFFBY)); \ + var = var##_mem + (-(size) & 4095) + OFFBY; #define free_aligned_buffer_page_end(var) \ free(var##_mem); \ diff --git a/unit_test/version_test.cc b/unit_test/version_test.cc index cddc019..723a2f2 100644 --- a/unit_test/version_test.cc +++ b/unit_test/version_test.cc @@ -36,6 +36,8 @@ TEST_F(libyuvTest, TestVersion) { if (LIBYUV_VERSION != svn_revision) { printf("WARNING - Versions do not match.\n"); } +#else + printf("WARNING - SVN Version unavailable. Test not run.\n"); #endif } |