| author | Android Chromium Automerger <chromium-automerger@android> | 2014-10-17 14:05:25 +0000 |
|---|---|---|
| committer | Android Chromium Automerger <chromium-automerger@android> | 2014-10-17 14:05:25 +0000 |
| commit | 5483945ff766e6f8833bfcaccae8e08ccc7649cf (patch) | |
| tree | 59075f011dc3ce3ca6d0f8069ef8736ee7a16e76 | |
| parent | 28ec957dad68c351e1f1f7b54870a676dc97a0bf (diff) | |
| parent | 9107460c7f76a10cc4122d91e62b0580eacd376e (diff) | |
| download | libyuv-5483945ff766e6f8833bfcaccae8e08ccc7649cf.tar.gz | |
Merge third_party/libyuv from https://chromium.googlesource.com/external/libyuv.git at 9107460c7f76a10cc4122d91e62b0580eacd376e
This commit was generated by merge_from_chromium.py.
Change-Id: I741ba43f863602b704886db45985dcdb405dc9e3
31 files changed, 790 insertions, 4081 deletions
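Most of the churn below comes from two mechanical refactors. First, the *_Unaligned_* SSE2/SSSE3 row-function variants are deleted: the SIMD row kernels no longer require 16-byte-aligned pointers, so the three-level dispatch (Any -> Unaligned -> aligned) collapses to two levels keyed only on width. Second, #elif chains in the per-function CPU dispatch become independent #if/#endif blocks, so that, for example, SSSE3 and NEON paths can be compiled into the same binary. A minimal sketch of the resulting dispatch shape follows; the FooRow_* names, the HAS_FOOROW_SSSE3 macro, and the flag value are placeholders for illustration, not symbols from this diff:

#include <stdint.h>

#define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)
#define HAS_FOOROW_SSSE3          /* stand-in for a row.h HAS_* macro */
enum { kCpuHasSSSE3 = 0x40 };     /* illustrative flag value */

typedef void (*FooRowFn)(const uint8_t* src, uint8_t* dst, int width);

/* Empty stand-ins for the real row kernels (cf. ARGBToYRow_C/_Any_SSSE3/_SSSE3). */
static void FooRow_C(const uint8_t* src, uint8_t* dst, int width) {
  (void)src; (void)dst; (void)width;
}
static void FooRow_Any_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  (void)src; (void)dst; (void)width;
}
static void FooRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  (void)src; (void)dst; (void)width;
}

/* Stand-in for libyuv's CPUID-backed TestCpuFlag(). */
static int TestCpuFlag(int flag) { (void)flag; return 1; }

static FooRowFn PickFooRow(int width) {
  FooRowFn FooRow = FooRow_C;       /* portable fallback */
#if defined(HAS_FOOROW_SSSE3)       /* independent #if, not #elif, so a NEON
                                       block could follow as its own #if */
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    FooRow = FooRow_Any_SSSE3;      /* any width; C handles the ragged tail */
    if (IS_ALIGNED(width, 16)) {
      FooRow = FooRow_SSSE3;        /* full-SIMD rows; no pointer-alignment
                                       probing needed after this merge */
    }
  }
#endif
  return FooRow;
}

In the files below the same selection runs inline at the top of each conversion function rather than through a helper like PickFooRow, but the shape is the same.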
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4db18f1..6f0fccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,21 +1,23 @@
cmake_minimum_required(VERSION 2.8)
-
+
+# Basic CMakeLists for libyuv, compiles w/o the jpeg library
+# created for "roxlu build system" to compile libyuv on windows
+
set(ly_base_dir ${CMAKE_CURRENT_LIST_DIR})
set(ly_src_dir ${ly_base_dir}/source/)
set(ly_inc_dir ${ly_base_dir}/include)
set(ly_lib_name "yuv")
-
+
set(ly_source_files
${ly_src_dir}/compare.cc
${ly_src_dir}/compare_common.cc
${ly_src_dir}/compare_neon.cc
${ly_src_dir}/compare_posix.cc
${ly_src_dir}/compare_win.cc
- ${ly_src_dir}/convert_argb.cc
${ly_src_dir}/convert.cc
- ${ly_src_dir}/convert_from_argb.cc
+ ${ly_src_dir}/convert_argb.cc
${ly_src_dir}/convert_from.cc
- ${ly_src_dir}/convert_jpeg.cc
+ ${ly_src_dir}/convert_from_argb.cc
${ly_src_dir}/convert_to_argb.cc
${ly_src_dir}/convert_to_i420.cc
${ly_src_dir}/cpu_id.cc
@@ -23,8 +25,8 @@ set(ly_source_files
${ly_src_dir}/mjpeg_decoder.cc
${ly_src_dir}/mjpeg_validate.cc
${ly_src_dir}/planar_functions.cc
- ${ly_src_dir}/rotate_argb.cc
${ly_src_dir}/rotate.cc
+ ${ly_src_dir}/rotate_argb.cc
${ly_src_dir}/rotate_mips.cc
${ly_src_dir}/rotate_neon.cc
${ly_src_dir}/row_any.cc
@@ -33,48 +35,48 @@ set(ly_source_files
${ly_src_dir}/row_neon.cc
${ly_src_dir}/row_posix.cc
${ly_src_dir}/row_win.cc
- ${ly_src_dir}/scale_argb.cc
${ly_src_dir}/scale.cc
+ ${ly_src_dir}/scale_argb.cc
${ly_src_dir}/scale_common.cc
${ly_src_dir}/scale_mips.cc
${ly_src_dir}/scale_neon.cc
- ${ly_src_dir}/scale_posix.cc
${ly_src_dir}/scale_win.cc
${ly_src_dir}/video_common.cc
+# ${ly_src_dir}/convert_jpeg.cc
)
-
+
+if (WIN32)
+ list(APPEND ly_source_files
+ ${ly_src_dir}/scale_win.cc
+ )
+endif()
+
set(ly_header_files
${ly_inc_dir}/libyuv/basic_types.h
${ly_inc_dir}/libyuv/compare.h
+ ${ly_inc_dir}/libyuv/convert.h
${ly_inc_dir}/libyuv/convert_argb.h
- ${ly_inc_dir}/libyuv/convert_from_argb.h
${ly_inc_dir}/libyuv/convert_from.h
- ${ly_inc_dir}/libyuv/convert.h
+ ${ly_inc_dir}/libyuv/convert_from_argb.h
${ly_inc_dir}/libyuv/cpu_id.h
${ly_inc_dir}/libyuv/format_conversion.h
- ${ly_inc_dir}/libyuv/mjpeg_decoder.h
${ly_inc_dir}/libyuv/planar_functions.h
- ${ly_inc_dir}/libyuv/rotate_argb.h
${ly_inc_dir}/libyuv/rotate.h
+ ${ly_inc_dir}/libyuv/rotate_argb.h
${ly_inc_dir}/libyuv/row.h
- ${ly_inc_dir}/libyuv/scale_argb.h
${ly_inc_dir}/libyuv/scale.h
+ ${ly_inc_dir}/libyuv/scale_argb.h
${ly_inc_dir}/libyuv/scale_row.h
${ly_inc_dir}/libyuv/version.h
${ly_inc_dir}/libyuv/video_common.h
+ ${ly_inc_dir}/libyuv/mjpeg_decoder.h
)
-
-add_definitions(
- -DLIBYUV_DISABLE_NEON
- -DLIBYUV_DISABLE_MIPS
-)
-
+
include_directories(${ly_inc_dir})
-
+
add_library(${ly_lib_name} STATIC ${ly_source_files})
-
+
install(TARGETS ${ly_lib_name} DESTINATION lib)
-
install(FILES ${ly_header_files} DESTINATION include/libyuv)
install(FILES ${ly_inc_dir}/libyuv.h DESTINATION include/)
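As a usage note: the CMakeLists above builds a static library target named "yuv" with the jpeg-dependent source left commented out. A consumer project might wire it in roughly like this; the third_party/libyuv path and the target names are assumptions for illustration, not part of this commit:

# Hypothetical consumer CMakeLists.txt (paths and names are illustrative).
cmake_minimum_required(VERSION 2.8)
project(yuv_consumer)

# Pulls in the CMakeLists above, which defines the static "yuv" target.
add_subdirectory(third_party/libyuv)
include_directories(third_party/libyuv/include)

add_executable(convert_demo main.cc)
target_link_libraries(convert_demo yuv)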
diff --git a/DEPS b/DEPS
@@ -8,7 +8,7 @@ vars = {
  # Roll the Chromium Git hash to pick up newer versions of all the
  # dependencies and tools linked to in setup_links.py.
-  "chromium_revision": "6455c698e51af65f57a8fe83547296218a5a7251",
+  "chromium_revision": "2d714fae183152299b3cbf0056eab5fe8bb75e87",
}
hooks = [
diff --git a/README.chromium b/README.chromium
index 91c1ac3..600d33f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1109
+Version: 1125
License: BSD
License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 885f8de..07d3875 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -87,7 +87,6 @@ extern "C" {
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -101,6 +100,7 @@ extern "C" {
#define HAS_SOBELYROW_SSE2

// Conversions:
+#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
@@ -655,13 +655,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
@@ -740,16 +733,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
-                                  uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                                 uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -811,15 +794,11 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
void ARGBToUV444Row_SSSE3(const uint8* src_argb,
                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
-                                    uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
                              uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_SSSE3(const uint8* src_argb,
                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
-                                    uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
                              uint8* dst_u, uint8* dst_v, int width);
@@ -857,10 +836,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                           int pix);
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix);
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
-                                     uint8* dst_v, int pix);
void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                         int pix);
void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -878,8 +853,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width);
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width);
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width);
void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                         int width);
void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -926,8 +899,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix);
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix);
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                    const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
                             const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
@@ -994,7 +965,6 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
@@ -1152,7 +1122,6 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
                           const uint8* src_v,
                           uint8* dst_argb,
                           int width);
-// RGB24/RAW are unaligned.
void I422ToRGB24Row_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1163,51 +1132,6 @@ void I422ToRAWRow_SSSE3(const uint8* src_y,
                        const uint8* src_v,
                        uint8* dst_raw,
                        int width);
-
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_uv,
-                                   uint8* dst_argb,
-                                   int width);
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_vu,
-                                   uint8* dst_argb,
-                                   int width);
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
-                                   uint8* dst_argb,
-                                   int width);
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
-                                   uint8* dst_argb,
-                                   int width);
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_bgra,
-                                   int width);
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_abgr,
-                                   int width);
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_rgba,
-                                   int width);
void I422ToARGBRow_Any_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
@@ -1280,7 +1204,6 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
                               const uint8* src_v,
                               uint8* dst_rgba,
                               int width);
-// RGB24/RAW are unaligned.
void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1494,12 +1417,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix);
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix);
@@ -1535,12 +1452,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix);
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix);
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 402d859..27aa04b 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -44,21 +44,13 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
-#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__aarch64__) || defined(LIBYUV_NEON))
-#define HAS_SCALEROWDOWN2_NEON
-#define HAS_SCALEROWDOWN4_NEON
-#define HAS_SCALEROWDOWN34_NEON
-#define HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEARGBROWDOWN2_NEON
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
#endif

// The following are available on Mips platforms:
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 8895d54..349b523 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1109
+#define LIBYUV_VERSION 1125

#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -130,16 +130,6 @@
        'LIBYUV_DISABLE_X86',
      ],
    }],
-    ['OS == "android" and target_arch == "arm64"', {
-      'ldflags': [
-        '-Wl,--dynamic-linker,/system/bin/linker64',
-      ],
-    }],
-    ['OS == "android" and target_arch != "arm64"', {
-      'ldflags': [
-        '-Wl,--dynamic-linker,/system/bin/linker',
-      ],
-    }],
  ], #conditions
  'defines': [
    # Enable the following 3 macros to turn off assembly for specified CPU.
@@ -159,6 +149,18 @@
      'include',
      '.',
    ],
+    'conditions': [
+      ['OS == "android" and target_arch == "arm64"', {
+        'ldflags': [
+          '-Wl,--dynamic-linker,/system/bin/linker64',
+        ],
+      }],
+      ['OS == "android" and target_arch != "arm64"', {
+        'ldflags': [
+          '-Wl,--dynamic-linker,/system/bin/linker',
+        ],
+      }],
+    ], #conditions
  },
  'sources': [
    '<@(libyuv_sources)',
diff --git a/source/convert.cc b/source/convert.cc
index f205143..9582b53 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -194,10 +194,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
  }
#endif
#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src, 16) &&
-      IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
@@ -291,12 +288,7 @@ static int X420ToI420(const uint8* src_y,
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    SplitUVRow = SplitUVRow_Any_SSE2;
    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
-          IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
-          IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
-        SplitUVRow = SplitUVRow_SSE2;
-      }
+      SplitUVRow = SplitUVRow_SSE2;
    }
  }
#endif
@@ -317,15 +309,13 @@ static int X420ToI420(const uint8* src_y,
  }
#endif
#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16 &&
+      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
    SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
-      if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
-          IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
-          IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
-        SplitUVRow = SplitUVRow_MIPS_DSPR2;
-      }
+      SplitUVRow = SplitUVRow_MIPS_DSPR2;
    }
  }
#endif
@@ -440,9 +430,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
  }
#endif
#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
@@ -467,14 +455,8 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
+      YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
    }
  }
#endif
@@ -543,14 +525,8 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
-      YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUVRow = YUY2ToUVRow_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
+      YUY2ToUVRow = YUY2ToUVRow_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
    }
  }
#endif
@@ -616,14 +592,8 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
    UYVYToYRow = UYVYToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
-      UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUVRow = UYVYToUVRow_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          UYVYToYRow = UYVYToYRow_SSE2;
-        }
-      }
+      UYVYToUVRow = UYVYToUVRow_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
    }
  }
#endif
@@ -694,14 +664,8 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif
@@ -771,22 +735,17 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
    src_stride_bgra = -src_stride_bgra;
  }
-#if defined(HAS_BGRATOYROW_SSSE3)
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
    BGRAToYRow = BGRAToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
-      BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
-        BGRAToUVRow = BGRAToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          BGRAToYRow = BGRAToYRow_SSSE3;
-        }
-      }
+      BGRAToUVRow = BGRAToUVRow_SSSE3;
+      BGRAToYRow = BGRAToYRow_SSSE3;
    }
  }
-#elif defined(HAS_BGRATOYROW_NEON)
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    BGRAToYRow = BGRAToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -842,22 +801,17 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
    src_stride_abgr = -src_stride_abgr;
  }
-#if defined(HAS_ABGRTOYROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
    ABGRToYRow = ABGRToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
-      ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
-        ABGRToUVRow = ABGRToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ABGRToYRow = ABGRToYRow_SSSE3;
-        }
-      }
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
    }
  }
-#elif defined(HAS_ABGRTOYROW_NEON)
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ABGRToYRow = ABGRToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -913,22 +867,17 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
    src_stride_rgba = -src_stride_rgba;
  }
-#if defined(HAS_RGBATOYROW_SSSE3)
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
    RGBAToYRow = RGBAToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
-      RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
-        RGBAToUVRow = RGBAToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          RGBAToYRow = RGBAToYRow_SSSE3;
-        }
-      }
+      RGBAToUVRow = RGBAToUVRow_SSSE3;
+      RGBAToYRow = RGBAToYRow_SSSE3;
    }
  }
-#elif defined(HAS_RGBATOYROW_NEON)
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    RGBAToYRow = RGBAToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -1029,10 +978,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif  // HAS_ARGBTOUVROW_SSSE3
@@ -1146,10 +1092,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif  // HAS_ARGBTOUVROW_SSSE3
@@ -1260,10 +1203,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif  // HAS_ARGBTOUVROW_SSSE3
@@ -1377,10 +1317,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif  // HAS_ARGBTOUVROW_SSSE3
@@ -1495,10 +1432,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif  // HAS_ARGBTOUVROW_SSSE3
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index ac0bc3d..51e7438 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -82,13 +82,11 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I444ToARGBRow = I444ToARGBRow_SSSE3;
-      }
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_I444TOARGBROW_NEON)
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I444ToARGBRow = I444ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -144,10 +142,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
    }
  }
#endif
@@ -224,13 +219,11 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I411ToARGBRow = I411ToARGBRow_SSSE3;
-      }
+      I411ToARGBRow = I411ToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_I411TOARGBROW_NEON)
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I411ToARGBRow = I411ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -276,14 +269,14 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
    src_stride_y = dst_stride_argb = 0;
  }
#if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
    YToARGBRow = YToARGBRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
      YToARGBRow = YToARGBRow_SSE2;
    }
  }
-#elif defined(HAS_YTOARGBROW_NEON)
+#endif
+#if defined(HAS_YTOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    YToARGBRow = YToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -329,13 +322,11 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
-      I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I400ToARGBRow = I400ToARGBRow_SSE2;
-      }
+      I400ToARGBRow = I400ToARGBRow_SSE2;
    }
  }
-#elif defined(HAS_I400TOARGBROW_NEON)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I400ToARGBRow = I400ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -447,14 +438,14 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
    src_stride_rgb24 = dst_stride_argb = 0;
  }
#if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_RGB24TOARGBROW_NEON)
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -497,14 +488,14 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
    src_stride_raw = dst_stride_argb = 0;
  }
#if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      RAWToARGBRow = RAWToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_RAWTOARGBROW_NEON)
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    RAWToARGBRow = RAWToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -547,14 +538,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
    src_stride_rgb565 = dst_stride_argb = 0;
  }
#if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
    }
  }
-#elif defined(HAS_RGB565TOARGBROW_NEON)
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -597,14 +588,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
    src_stride_argb1555 = dst_stride_argb = 0;
  }
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
    }
  }
-#elif defined(HAS_ARGB1555TOARGBROW_NEON)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -647,14 +638,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
    src_stride_argb4444 = dst_stride_argb = 0;
  }
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
    }
  }
-#elif defined(HAS_ARGB4444TOARGBROW_NEON)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -696,13 +687,11 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_NV12TOARGBROW_NEON)
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -747,10 +736,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV21ToARGBRow = NV21ToARGBRow_SSSE3;
-      }
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
    }
  }
#endif
@@ -798,13 +784,11 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_NV12TOARGBROW_NEON)
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -856,14 +840,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
-      }
+      YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_YUY2TOARGBROW_NEON)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -909,14 +890,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        UYVYToARGBRow = UYVYToARGBRow_SSSE3;
-      }
+      UYVYToARGBRow = UYVYToARGBRow_SSSE3;
    }
  }
-#elif defined(HAS_UYVYTOARGBROW_NEON)
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
diff --git a/source/convert_from.cc b/source/convert_from.cc
index c1a2f62..2220005 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -180,7 +180,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
      I422ToYUY2Row = I422ToYUY2Row_SSE2;
    }
  }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -226,7 +227,8 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
      I422ToYUY2Row = I422ToYUY2Row_SSE2;
    }
  }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -286,7 +288,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
      I422ToUYVYRow = I422ToUYVYRow_SSE2;
    }
  }
-#elif defined(HAS_I422TOUYVYROW_NEON)
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -332,7 +335,8 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
      I422ToUYVYRow = I422ToUYVYRow_SSE2;
    }
  }
-#elif defined(HAS_I422TOUYVYROW_NEON)
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -400,12 +404,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
    }
  }
#endif
@@ -479,10 +478,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
    }
  }
#endif
@@ -551,20 +547,19 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I422ToBGRARow = I422ToBGRARow_SSSE3;
-      }
+      I422ToBGRARow = I422ToBGRARow_SSSE3;
    }
  }
-#elif defined(HAS_I422TOBGRAROW_NEON)
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToBGRARow = I422ToBGRARow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      I422ToBGRARow = I422ToBGRARow_NEON;
    }
  }
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -613,13 +608,11 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I422ToABGRRow = I422ToABGRRow_SSSE3;
-      }
+      I422ToABGRRow = I422ToABGRRow_SSSE3;
    }
  }
-#elif defined(HAS_I422TOABGRROW_NEON)
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToABGRRow = I422ToABGRRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -667,13 +660,11 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-        I422ToRGBARow = I422ToRGBARow_SSSE3;
-      }
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
    }
  }
-#elif defined(HAS_I422TORGBAROW_NEON)
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToRGBARow = I422ToRGBARow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -724,7 +715,8 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
    }
  }
-#elif defined(HAS_I422TORGB24ROW_NEON)
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -775,7 +767,8 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
      I422ToRAWRow = I422ToRAWRow_SSSE3;
    }
  }
-#elif defined(HAS_I422TORAWROW_NEON)
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToRAWRow = I422ToRAWRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -826,7 +819,8 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
    }
  }
-#elif defined(HAS_I422TOARGB1555ROW_NEON)
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -878,7 +872,8 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
    }
  }
-#elif defined(HAS_I422TOARGB4444ROW_NEON)
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -929,7 +924,8 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
    }
  }
-#elif defined(HAS_I422TORGB565ROW_NEON)
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index de461dd..1e465ab 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -54,13 +54,11 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-      }
+      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOUV444ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -72,15 +70,12 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -133,13 +128,11 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -152,14 +145,11 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -212,11 +202,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif
@@ -285,17 +271,12 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -315,10 +296,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
    }
  }
#endif
@@ -392,17 +370,12 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -422,10 +395,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
    }
  }
#endif
@@ -503,13 +473,11 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -521,13 +489,11 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -543,7 +509,8 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
      I422ToYUY2Row = I422ToYUY2Row_SSE2;
    }
  }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -605,13 +572,11 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -623,13 +588,11 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -645,7 +608,8 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
      I422ToUYVYRow = I422ToUYVYRow_SSE2;
    }
  }
-#elif defined(HAS_I422TOUYVYROW_NEON)
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -700,11 +664,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif
@@ -779,7 +739,8 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTORGB24ROW_NEON)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -826,7 +787,8 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTORAWROW_NEON)
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -867,14 +829,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
    src_stride_argb = dst_stride_rgb565 = 0;
  }
#if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
    }
  }
-#elif defined(HAS_ARGBTORGB565ROW_NEON)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -915,14 +877,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
    src_stride_argb = dst_stride_argb1555 = 0;
  }
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
    }
  }
-#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -963,14 +925,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
    src_stride_argb = dst_stride_argb4444 = 0;
  }
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
    }
  }
-#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -1015,14 +977,8 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVJRow = ARGBToUVJRow_SSSE3;
-        if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
-          ARGBToYJRow = ARGBToYJRow_SSSE3;
-        }
-      }
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
#endif
@@ -1094,11 +1050,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
-        ARGBToYJRow = ARGBToYJRow_SSSE3;
-      }
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
#endif
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index 38e0f4e..1efa265 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -52,7 +52,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__)
#if (_MSC_FULL_VER >= 160040219)
  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
-#elif defined(_M_IX86)
+#endif
+#if defined(_M_IX86)
  __asm {
    mov eax, info_eax
    mov ecx, info_ecx
@@ -98,13 +99,15 @@ int TestOsSaveYmm() {
  uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
-#elif defined(_M_IX86) && defined(_MSC_VER)
+#endif
+#if defined(_M_IX86) && defined(_MSC_VER)
  __asm {
    xor ecx, ecx  // xcr 0
    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
    mov xcr0, eax
  }
-#elif defined(__i386__) || defined(__x86_64__)
+#endif
+#if defined(__i386__) || defined(__x86_64__)
  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif  // defined(_MSC_VER)
  return((xcr0 & 6) == 6);  // Is ymm saved?
@@ -246,7 +249,8 @@ int InitCpuFlags(void) {
    if (TestEnv("LIBYUV_DISABLE_FMA3")) {
      cpu_info_ &= ~kCpuHasFMA3;
    }
-#elif defined(__mips__) && defined(__linux__)
+#endif
+#if defined(__mips__) && defined(__linux__)
  // Linux mips parse text file for dsp detect.
  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
@@ -263,7 +267,8 @@ int InitCpuFlags(void) {
  if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
  }
-#elif defined(__arm__) || defined(__aarch64__)
+#endif
+#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
@@ -272,7 +277,8 @@ int InitCpuFlags(void) {
// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
// flag in it.
// So for aarch64, neon enabling is hard coded here.
-#elif defined(__aarch64__)
+#endif
+#if defined(__aarch64__)
  cpu_info_ = kCpuHasNEON;
#else
  // Linux arm parse text file for neon detect.
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 3c17371..21d224f 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -77,14 +77,14 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
    src_stride_argb = -src_stride_argb;
  }
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
+#endif
+#if defined(HAS_ARGBTOBAYERROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -319,14 +319,12 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
    }
  }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -462,7 +460,8 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
    }
  }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
+#endif
+#if defined(HAS_ARGBTOBAYERROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d5111dc..661cad9 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -47,9 +47,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
  }
#endif
#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
@@ -101,9 +99,7 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
  }
#endif
#if defined(HAS_COPYROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
    CopyRow = CopyRow_16_SSE2;
  }
#endif
@@ -254,9 +250,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
  }
#endif
#if defined(HAS_MIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
    MirrorRow = MirrorRow_SSSE3;
  }
#endif
@@ -307,14 +301,8 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
+      YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
    }
  }
#endif
@@ -385,14 +373,8 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
    UYVYToYRow = UYVYToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
-      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUV422Row = UYVYToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          UYVYToYRow = UYVYToYRow_SSE2;
-        }
-      }
+      UYVYToUV422Row = UYVYToUV422Row_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
    }
  }
#endif
@@ -504,9 +486,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
  }

#if defined(HAS_ARGBMIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
  }
#endif
@@ -820,17 +800,16 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
      I422ToBGRARow = I422ToBGRARow_NEON;
    }
  }
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
+#endif
+#if defined(HAS_I422TOBGRAROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I422ToBGRARow = I422ToBGRARow_SSSE3;
-      }
+      I422ToBGRARow = I422ToBGRARow_SSSE3;
    }
  }
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -890,14 +869,12 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
      I422ToABGRRow = I422ToABGRRow_NEON;
    }
  }
-#elif defined(HAS_I422TOABGRROW_SSSE3)
+#endif
+#if defined(HAS_I422TOABGRROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I422ToABGRRow = I422ToABGRRow_SSSE3;
-      }
+      I422ToABGRRow = I422ToABGRRow_SSSE3;
    }
  }
#endif
@@ -952,14 +929,12 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
      I422ToRGBARow = I422ToRGBARow_NEON;
    }
  }
-#elif defined(HAS_I422TORGBAROW_SSSE3)
+#endif
+#if defined(HAS_I422TORGBAROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-        I422ToRGBARow = I422ToRGBARow_SSSE3;
-      }
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
    }
  }
#endif
@@ -1002,7 +977,8 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
    }
  }
-#elif defined(HAS_NV12TORGB565ROW_NEON)
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -1050,7 +1026,8 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
      NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
    }
  }
-#elif defined(HAS_NV21TORGB565ROW_NEON)
+#endif
+#if defined(HAS_NV21TORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
@@ -1084,9 +1061,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
    dst_stride_y = 0;
  }
#if defined(HAS_SETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
    SetRow = SetRow_NEON;
  }
#endif
@@ -1150,8 +1125,7 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
    dst_stride_argb = 0;
  }
#if defined(HAS_SETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
    ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
    return 0;
  }
@@ -1202,9 +1176,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
    src_stride_argb = dst_stride_argb = 0;
  }
#if defined(HAS_ARGBATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
@@ -1317,12 +1289,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
    src_stride_argb = dst_stride_argb = 0;
  }
#if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    ARGBGrayRow = ARGBGrayRow_SSSE3;
  }
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
    ARGBGrayRow = ARGBGrayRow_NEON;
  }
@@ -1355,11 +1326,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
    dst_stride_argb = 0;
  }
#if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    ARGBGrayRow = ARGBGrayRow_SSSE3;
  }
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
    ARGBGrayRow = ARGBGrayRow_NEON;
  }
@@ -1388,11 +1359,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
    dst_stride_argb = 0;
  }
#if defined(HAS_ARGBSEPIAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
  }
-#elif defined(HAS_ARGBSEPIAROW_NEON)
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
    ARGBSepiaRow = ARGBSepiaRow_NEON;
  }
@@ -1430,11 +1401,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
    src_stride_argb = dst_stride_argb = 0;
  }
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
  }
-#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
  }
@@ -1573,11 +1544,11 @@ int ARGBQuantize(uint8* dst_argb, int
dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBQUANTIZEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBQuantizeRow = ARGBQuantizeRow_SSE2; } -#elif defined(HAS_ARGBQUANTIZEROW_NEON) +#endif +#if defined(HAS_ARGBQUANTIZEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_NEON; } @@ -1748,12 +1719,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBSHADEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBShadeRow = ARGBShadeRow_SSE2; } -#elif defined(HAS_ARGBSHADEROW_NEON) +#endif +#if defined(HAS_ARGBSHADEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_NEON; } @@ -1882,11 +1852,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; - } + ARGBShuffleRow = ARGBShuffleRow_SSSE3; } } #endif @@ -1942,8 +1908,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } // ARGBToBayer used to select G channel from ARGB. #if defined(HAS_ARGBTOBAYERGGROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerGGRow_SSE2; @@ -1951,8 +1916,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; @@ -2043,8 +2007,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelRow = SobelRow_SSE2; } #endif @@ -2065,8 +2028,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelToPlaneRow = SobelToPlaneRow_SSE2; } #endif @@ -2088,8 +2050,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelXYRow_C; 
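// In outline, the dispatch works the same everywhere in this file: each
// row op starts bound to its portable C kernel and is upgraded only when
// TestCpuFlag() reports the feature and the width fits the vector kernel.
// With the pointer/stride alignment tests gone, the SSE2 path below is
// taken for any width that is a multiple of 16, whatever the buffer
// alignment.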
#if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelXYRow = SobelXYRow_SSE2; } #endif @@ -2213,10 +2174,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } #endif @@ -2259,10 +2217,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } #endif diff --git a/source/rotate.cc b/source/rotate.cc index 34b6666..8218609 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -42,11 +42,7 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_MIRRORROW_NEON -void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -#define HAS_MIRRORROW_UV_NEON -void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSE_WX8_NEON void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); @@ -55,23 +51,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width); -//following symbol is temporally enable for aarch64, until all neon optimized -//functions have been ported to aarch64 -#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__aarch64__) || defined(LIBYUV_NEON)) -// #define HAS_MIRRORROW_NEON -// void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -// #define HAS_MIRRORROW_UV_NEON -// void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); -#define HAS_TRANSPOSE_WX8_NEON -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -#define HAS_TRANSPOSE_UVWX8_NEON -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width); -#endif // defined(__ARM_NEON__) +#endif #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ defined(__mips__) && \ @@ -312,7 +292,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ret } } -#elif !defined(LIBYUV_DISABLE_X86) && \ +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, @@ -530,7 +511,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "ret \n" #endif ); -#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ +#endif +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ defined(__x86_64__) // 64 bit 
version has enough registers to do 16x8 to 8x16 at a time. #define HAS_TRANSPOSE_WX8_FAST_SSSE3 @@ -1024,11 +1006,13 @@ void TransposeUV(const uint8* src, int src_stride, if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; } -#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) +#endif +#if defined(HAS_TRANSPOSE_UVWX8_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { TransposeUVWx8 = TransposeUVWx8_SSE2; } -#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) +#endif +#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; @@ -1096,11 +1080,13 @@ void RotateUV180(const uint8* src, int src_stride, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorRowUV = MirrorUVRow_NEON; } -#elif defined(HAS_MIRRORROW_UV_SSSE3) +#endif +#if defined(HAS_MIRRORROW_UV_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { MirrorRowUV = MirrorUVRow_SSSE3; } -#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) +#endif +#if defined(HAS_MIRRORUVROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { MirrorRowUV = MirrorUVRow_MIPS_DSPR2; diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index a8d7fc2..b05977e 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -50,13 +50,12 @@ static void ARGBTranspose(const uint8* src, int src_stride, void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest. - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; } -#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest. - IS_ALIGNED(src, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; } #endif @@ -103,9 +102,7 @@ void ARGBRotate180(const uint8* src, int src_stride, ARGBMirrorRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_ARGBMIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { ARGBMirrorRow = ARGBMirrorRow_SSSE3; } #endif @@ -130,9 +127,7 @@ void ARGBRotate180(const uint8* src, int src_stride, } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) { CopyRow = CopyRow_SSE2; } #endif diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index d54378e..92358af 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -27,6 +27,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. asm volatile ( // loops are on blocks of 8. 
loop will stop when // counter gets to or below 0. starting the counter @@ -237,10 +238,10 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, : "+r"(src_temp), // %0 "+r"(src), // %1 "+r"(dst), // %2 - "+r"(width) // %3 + "+r"(width64) // %3 : "r"(&kVTbl4x4Transpose), // %4 - "r"((ptrdiff_t)src_stride), // %5 - "r"((ptrdiff_t)dst_stride) // %6 + "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" ); @@ -255,6 +256,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_b, int dst_stride_b, int width) { const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter @@ -522,7 +524,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "+r"(src), // %1 "+r"(dst_a), // %2 "+r"(dst_b), // %3 - "+r"(width) // %4 + "+r"(width64) // %4 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 diff --git a/source/row_any.cc b/source/row_any.cc index 31ab08a..9d8a5e5 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -35,19 +35,19 @@ extern "C" { } #ifdef HAS_I422TOARGBROW_SSSE3 -YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C, 1, 4, 7) #endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I444TOARGBROW_SSSE3 -YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C, 0, 4, 7) -YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C, 2, 4, 7) -YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C, 1, 4, 7) -YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C, 1, 4, 7) -YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C, 1, 4, 7) // I422ToRGB565Row_SSSE3 is unaligned. 
YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C, @@ -102,9 +102,9 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) } #ifdef HAS_NV12TOARGBROW_SSSE3 -NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, +NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4) -NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, +NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4) #endif // HAS_NV12TOARGBROW_SSSE3 #ifdef HAS_NV12TOARGBROW_NEON @@ -145,15 +145,15 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, 3, 4, 2) #endif #if defined(HAS_I400TOARGBROW_SSE2) -RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, +RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 7, 1, 4) #endif #if defined(HAS_YTOARGBROW_SSE2) RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 7, 1, 4) -RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, +RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, 15, 2, 4) -RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C, +RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C, 15, 2, 4) // These require alignment on ARGB, so C is used for remainder. RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C, @@ -231,17 +231,17 @@ YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) #endif #ifdef HAS_ARGBTOYROW_SSSE3 -YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 4, 1, 16) #endif #ifdef HAS_BGRATOYROW_SSSE3 -YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16) -YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16) +YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 4, 1, 16) +YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 4, 1, 16) +YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 4, 1, 16) +YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 2, 1, 16) +YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 2, 1, 16) #endif #ifdef HAS_ARGBTOYJROW_SSSE3 -YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16) +YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 4, 1, 16) #endif #ifdef HAS_ARGBTOYROW_NEON YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) @@ -349,14 +349,15 @@ UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) #endif #ifdef HAS_ARGBTOUVROW_SSSE3 -UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15) -UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C, - 4, 15) -UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15) -UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15) -UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15) -UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15) -UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) +UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C, 4, 15) 
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15) #endif #ifdef HAS_ARGBTOUVROW_NEON UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) @@ -408,7 +409,7 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) } #ifdef HAS_ARGBTOUV444ROW_SSSE3 -UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3, +UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, ARGBToUV444Row_C, 4, 15, 0) #endif #ifdef HAS_YUY2TOUV422ROW_AVX2 @@ -417,12 +418,14 @@ UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, UYVYToUV422Row_C, 2, 31, 1) #endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3, +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, ARGBToUV422Row_C, 4, 15, 1) -UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, +#endif +#ifdef HAS_YUY2TOUV422ROW_SSE2 +UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, YUY2ToUV422Row_C, 2, 15, 1) -UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, +UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, UYVYToUV422Row_C, 2, 15, 1) #endif #ifdef HAS_YUY2TOUV422ROW_NEON @@ -451,7 +454,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, } #ifdef HAS_SPLITUVROW_SSE2 -SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15) +SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15) #endif #ifdef HAS_SPLITUVROW_AVX2 SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) @@ -460,7 +463,7 @@ SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15) #endif #ifdef HAS_SPLITUVROW_MIPS_DSPR2 -SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, +SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, SplitUVRow_C, 15) #endif #undef SPLITUVROWANY @@ -477,7 +480,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, } #ifdef HAS_MERGEUVROW_SSE2 -MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15) +MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31) @@ -548,7 +551,7 @@ YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, ARGBShuffleRow_C, 4, 4, 3) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 -YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, +YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, ARGBShuffleRow_C, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 diff --git a/source/row_common.cc b/source/row_common.cc index 40a8261..afc74c0 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2137,19 +2137,6 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, free_aligned_buffer_64(row_y); } -void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. 
- align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width); - YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width); - I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); -} - void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { @@ -2163,19 +2150,6 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, free_aligned_buffer_64(row_y); } -void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width); - UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width); - I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); -} - #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(LIBYUV_DISABLE_X86) diff --git a/source/row_mips.cc b/source/row_mips.cc index da7183b..d713321 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -447,89 +447,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ); } -void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, - uint8* dst_v, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - ".p2align 2 \n" - "1: \n" - "addiu $t4, $t4, -1 \n" - "lwr $t0, 0(%[src_uv]) \n" - "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lwr $t1, 4(%[src_uv]) \n" - "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lwr $t2, 8(%[src_uv]) \n" - "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lwr $t3, 12(%[src_uv]) \n" - "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lwr $t5, 16(%[src_uv]) \n" - "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lwr $t6, 20(%[src_uv]) \n" - "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10 - "lwr $t7, 24(%[src_uv]) \n" - "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12 - "lwr $t8, 28(%[src_uv]) \n" - "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14 - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 - "addiu %[src_uv], %[src_uv], 32 \n" - "swr $t9, 0(%[dst_v]) \n" - "swl $t9, 3(%[dst_v]) \n" - "swr $t0, 0(%[dst_u]) \n" - "swl $t0, 3(%[dst_u]) \n" - "swr $t1, 4(%[dst_v]) \n" - "swl $t1, 7(%[dst_v]) \n" - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" - "swr $t3, 8(%[dst_v]) \n" - "swl $t3, 11(%[dst_v]) \n" - "swr $t5, 8(%[dst_u]) \n" - "swl $t5, 11(%[dst_u]) \n" - "swr $t6, 12(%[dst_v]) \n" - "swl $t6, 15(%[dst_v]) \n" - "swr $t7, 12(%[dst_u]) \n" - "swl $t7, 15(%[dst_u]) \n" - "addiu %[dst_u], %[dst_u], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_v], %[dst_v], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu 
%[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [width] "+r" (width), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6", "t7", "t8", "t9" - ); -} - void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { __asm__ __volatile__ ( ".set push \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index 36111c1..9e514dd 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -221,7 +221,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x8,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -252,37 +252,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, - int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" @@ -318,17 +287,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 @@ -359,17 +328,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_raw), // %0 @@ -418,8 +387,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 
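// From here on, the row kernels' loads and stores switch from movdqa
// (which faults unless the address is 16-byte aligned) to movdqu (any
// address). On recent x86 cores movdqu on data that happens to be aligned
// costs little, which is what makes deleting the parallel _Unaligned_
// implementations practical.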
"punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -475,8 +444,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -519,8 +488,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) - MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -631,7 +600,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" @@ -672,7 +641,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xf,%%xmm7 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -712,7 +681,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" @@ -744,43 +713,6 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -820,44 +752,6 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" - 
"movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -912,15 +806,19 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -962,7 +860,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif ); } +#endif // HAS_ARGBTOUVROW_SSSE3 +#ifdef HAS_ARGBTOUVJROW_SSSE3 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. 
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { @@ -979,156 +879,19 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" 
- "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToUJ), // %0 - "m"(kARGBToVJ), // %1 - "m"(kAddUVJ128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1161,7 +924,7 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) + : "r"((intptr_t)(src_stride_argb)) // %4 : "memory", "cc" #if defined(__native_client__) && defined(__x86_64__) , "r14" @@ -1171,7 +934,9 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif ); } +#endif // HAS_ARGBTOUVJROW_SSSE3 +#ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -1187,71 +952,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : - : 
"memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6" -#endif - ); -} - -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, - uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1301,7 +1001,9 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, #endif ); } +#endif // HAS_ARGBTOUV444ROW_SSSE3 +#ifdef HAS_ARGBTOUV422ROW_SSSE3 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -1317,67 +1019,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1423,6 +1064,7 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, #endif ); } +#endif // HAS_ARGBTOUV422ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( @@ -1430,43 +1072,6 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - 
"movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1513,85 +1118,19 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kBGRAToU), // %0 - "m"(kBGRAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea 
" MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1640,43 +1179,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1714,43 +1216,6 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1797,85 +1262,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - 
"pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kABGRToU), // %0 - "m"(kABGRToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1933,85 +1332,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)) - : 
"memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kRGBAToU), // %0 - "m"(kRGBAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -2043,7 +1376,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)) // %4 + : "r"((intptr_t)(src_stride_rgba)) : "memory", "cc" #if defined(__native_client__) && defined(__x86_64__) , "r14" @@ -2053,7 +1386,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, #endif ); } -#endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3 #define UB 127 /* min(63,(int8)(2.018 * 64)) */ @@ -2199,8 +1531,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2354,8 +1686,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2393,8 +1725,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2430,8 +1762,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 
\n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2464,191 +1796,6 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" - // Does not use r14. -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV411 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu 
%%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READNV12 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" - // Does not use r14. -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READNV12 - YVUTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" @@ -2686,8 +1833,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2725,8 +1872,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "sub $0x8,%[width] \n" "jg 1b \n" @@ -2765,125 +1912,6 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" - "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" 
-#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_bgra, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" - "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" - "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" - "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_abgr, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" - "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" @@ -2939,8 +1967,8 @@ void YToARGBRow_SSE2(const uint8* y_buf, "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "por %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" @@ -2970,7 +1998,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { "lea " MEMLEA(-0x10,0) ",%0 \n" LABELALIGN "1: \n" - MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 + MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 
\n" "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" @@ -3104,45 +2132,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3183,38 +2172,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2" -#endif - ); -} - -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, - uint8* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -3285,7 +2242,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { : : "memory", "cc" #if defined(__SSE2__) - , "ymm0", "ymm1" + , "xmm0", "xmm1" #endif ); } @@ -3307,7 +2264,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) { #endif // HAS_COPYROW_X86 #ifdef HAS_COPYROW_ERMS -// Unaligned Multiple of 1. +// Multiple of 1. 
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { size_t width_tmp = (size_t)(width); asm volatile ( @@ -3405,16 +2362,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "punpcklbw %%xmm2,%%xmm2 \n" "punpckhwd %%xmm2,%%xmm3 \n" "punpcklwd %%xmm2,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm4 \n" - "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" "pand %%xmm0,%%xmm2 \n" "pand %%xmm0,%%xmm3 \n" "pand %%xmm1,%%xmm4 \n" "pand %%xmm1,%%xmm5 \n" "por %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -3498,128 +2455,15 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { "psrlw $0x8,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , 
"xmm0", "xmm1", "xmm5" -#endif - ); -} - -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -3632,9 +2476,8 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, ); } -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, - int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3678,8 +2521,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, ); } -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3722,117 +2565,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw 
$0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - asm volatile ( - LABELALIGN - "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3854,8 +2586,8 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, ); } -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3899,8 +2631,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, ); } -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -4014,7 +2746,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" @@ -4132,16 +2864,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" @@ -4151,7 +2883,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" "jmp 49f \n" @@ -4178,7 +2910,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" @@ -4237,17 +2969,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { // 4 pixel loop. 
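The 4-pixel attenuate loop that follows broadcasts each pixel's alpha word and multiplies with pmulhuw. Since punpcklbw x,x turns byte v into word v*257, the product approximates v*a/255. Scalar sketch of that arithmetic (ARGBAttenuateRowC here is illustrative, not the C fallback shipped in the library):

#include <stdint.h>

// Each color channel is scaled by alpha via ((v*257) * (a*257)) >> 24,
// which tracks v * a / 255 to within a fraction of a bit; the pand/por
// masking in the SSE2 loop keeps the original alpha byte unchanged.
void ARGBAttenuateRowC(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb[4 * i + 3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = src_argb[4 * i + c];
      dst_argb[4 * i + c] = (uint8_t)(((v * 257u) * (a * 257u)) >> 24);
    }
    dst_argb[4 * i + 3] = (uint8_t)a;
  }
}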
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm1,%%xmm1 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "pand %%xmm4,%%xmm2 \n" @@ -4256,7 +2988,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4389,16 +3121,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" @@ -4411,8 +3143,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4455,30 +3187,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // 8 pixel loop. 
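ARGBGrayRow above and the sepia loop below share one shape: pmaddubsw against a per-channel weight vector, phaddw, then a 7-bit shift. A scalar sketch — the kGray* weights are stand-ins that sum to 128, not values copied from row.h, and the +64 assumes the xmm5 constant loaded at entry is a rounding bias:

#include <stdint.h>

// Fixed-point dot product with weights summing to 128, rounded and
// shifted down 7 bits; the result is written to B, G and R alike.
enum { kGrayB = 15, kGrayG = 75, kGrayR = 38 };  // illustrative, sum = 128

void ARGBGrayRowC(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const uint8_t* p = src_argb + 4 * i;
    uint8_t y =
        (uint8_t)((p[0] * kGrayB + p[1] * kGrayG + p[2] * kGrayR + 64) >> 7);
    dst_argb[4 * i + 0] = y;
    dst_argb[4 * i + 1] = y;
    dst_argb[4 * i + 2] = y;
    dst_argb[4 * i + 3] = p[3];  // alpha passes through
  }
}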
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4488,8 +3220,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "sub $0x8,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 @@ -4520,12 +3252,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddsw %%xmm7,%%xmm0 \n" @@ -4535,13 +3267,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm6,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm7 \n" "phaddsw %%xmm7,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm7 \n" "phaddsw %%xmm7,%%xmm6 \n" @@ -4554,8 +3286,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "punpcklwd %%xmm1,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm6 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" @@ -4593,14 +3325,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. 
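The quantize loop that follows posterizes each channel: a pmulhuw by a fixed-point reciprocal, then rescale and offset, with alpha masked back in. Scalar sketch, assuming scale holds a 16.16 reciprocal of interval_size as the caller computes it:

#include <stdint.h>

// (v * scale >> 16) * interval_size + interval_offset per color channel;
// the xmm6/xmm7 masking in the SSE2 loop leaves alpha untouched.
void ARGBQuantizeRowC(uint8_t* dst_argb, int scale, int interval_size,
                      int interval_offset, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = dst_argb[4 * i + c];
      dst_argb[4 * i + c] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
  }
}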
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n" "pmullw %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm7 \n" "pmullw %%xmm3,%%xmm1 \n" "pand %%xmm6,%%xmm7 \n" "paddw %%xmm4,%%xmm0 \n" @@ -4608,7 +3340,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "sub $0x4,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" "lea " MEMLEA(0x10,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 @@ -4637,7 +3369,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, // 4 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -4648,7 +3380,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 @@ -4901,8 +3633,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -4919,10 +3651,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm1," MEMACCESS(2) " \n" - "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm1," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -4953,12 +3685,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -4992,8 +3724,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. 
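Of the three Sobel combiners, SobelToPlaneRow above is the simplest: a saturating add of the X and Y magnitude planes (the paddusb in its loop). SobelRow wraps the same sum in gray ARGB, and SobelXYRow below packs X, Y and their sum into ARGB channels. Scalar sketch of the core:

#include <stdint.h>

// Per-pixel saturating add of the two Sobel magnitude planes.
void SobelToPlaneRowC(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                      uint8_t* dst_y, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_y[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}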
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "paddusb %%xmm1,%%xmm2 \n" @@ -5010,10 +3742,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" "sub $0x10,%3 \n" - "movdqa %%xmm6," MEMACCESS(2) " \n" - "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm6," MEMACCESS(2) " \n" + "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 @@ -5060,22 +3792,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "punpcklwd %%xmm1,%%xmm4 \n" "punpckhwd %%xmm1,%%xmm5 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(2) ",%%xmm2 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" + "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n" "paddd %%xmm4,%%xmm0 \n" - "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n" "paddd %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" "lea " MEMLEA(0x40,2) ",%2 \n" "paddd %%xmm0,%%xmm5 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "sub $0x4,%3 \n" "jge 40b \n" @@ -5140,10 +3872,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel small loop \n" LABELALIGN "4: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" BUNDLEALIGN MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 @@ -5174,10 +3906,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel loop \n" LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" BUNDLEALIGN MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 @@ -5221,7 +3953,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop \n" LABELALIGN "10: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 "lea " MEMLEA(0x10,0) ",%0 
\n" "psubd " MEMACCESS(1) ",%%xmm0 \n" @@ -5581,238 +4313,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 -#ifdef HAS_INTERPOLATEROW_SSSE3 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqu %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm0) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif - ); -} -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - // General purpose row blend. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "psubw %%xmm0,%%xmm2 \n" - "psubw %%xmm1,%%xmm3 \n" - "paddw %%xmm2,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "pmulhw %%xmm5,%%xmm2 \n" - "pmulhw %%xmm5,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. 
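The 25/50/75 fast paths above replace that multiply with one or two pavgb rounds: pavgb(a, b) is (a + b + 1) >> 1, so averaging against the same row twice yields a 3:1 weighting. Sketch:

#include <stdint.h>

// pavgb semantics and the two-step 75/25 blend built from it.
static inline uint8_t AvgRound(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

static inline uint8_t Blend75(uint8_t row0, uint8_t row1) {
  // avg(row0, avg(row0, row1)) ~= 0.75 * row0 + 0.25 * row1
  return AvgRound(row0, AvgRound(row0, row1));
}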
- LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} -#endif // HAS_INTERPOLATEROW_SSE2 - #ifdef HAS_ARGBTOBAYERROW_SSSE3 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { @@ -5822,8 +4322,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, "pshufd $0x0,%%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" @@ -5852,8 +4352,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, "psrld $0x18,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x8,%%xmm0 \n" "psrld $0x8,%%xmm1 \n" @@ -5882,34 +4382,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { - asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" + "movdqu " MEMACCESS(3) ",%%xmm5 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" diff --git a/source/row_win.cc b/source/row_win.cc index 1cf0b9a..969d78c 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -89,8 +89,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); - xmm1 = _mm_load_si128(&xmm0); - xmm2 = _mm_load_si128(&xmm0); + xmm1 = _mm_loadu_si128(&xmm0); + xmm2 = _mm_loadu_si128(&xmm0); xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); @@ -112,60 +112,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, xmm2 = _mm_packus_epi16(xmm2, xmm2); xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - xmm1 = _mm_load_si128(&xmm0); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); - - _mm_store_si128((__m128i *)dst_argb, xmm0); - _mm_store_si128((__m128i *)(dst_argb + 16), xmm1); - - y_buf += 8; - u_buf += 4; - dst_argb += 32; - width -= 8; - } -} - -// 
Unaligned destination version. -void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3; - const __m128i xmm5 = _mm_set1_epi8(-1); - const __m128i xmm4 = _mm_setzero_si128(); - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; - - while (width > 0) { - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); - xmm1 = _mm_load_si128(&xmm0); - xmm2 = _mm_load_si128(&xmm0); - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); - xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); - xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); - xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); - xmm3 = _mm_loadl_epi64((__m128i*)y_buf); - xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); - xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); - xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); - xmm0 = _mm_adds_epi16(xmm0, xmm3); - xmm1 = _mm_adds_epi16(xmm1, xmm3); - xmm2 = _mm_adds_epi16(xmm2, xmm3); - xmm0 = _mm_srai_epi16(xmm0, 6); - xmm1 = _mm_srai_epi16(xmm1, 6); - xmm2 = _mm_srai_epi16(xmm2, 6); - xmm0 = _mm_packus_epi16(xmm0, xmm0); - xmm1 = _mm_packus_epi16(xmm1, xmm1); - xmm2 = _mm_packus_epi16(xmm2, xmm2); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - xmm1 = _mm_load_si128(&xmm0); + xmm1 = _mm_loadu_si128(&xmm0); xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); @@ -178,6 +125,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, width -= 8; } } + // 32 bit #else // defined(_M_X64) @@ -326,35 +274,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { punpckhwd xmm1, xmm1 por xmm0, xmm5 por xmm1, xmm5 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, - int pix) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm5 - por xmm1, xmm5 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -386,17 +305,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] jg convertloop ret @@ -426,17 +345,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, 
xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] jg convertloop ret @@ -491,8 +410,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -545,8 +464,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -585,8 +504,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -688,7 +607,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G pslld xmm0, 8 // R @@ -728,7 +647,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G movdqa xmm3, xmm0 // R @@ -766,7 +685,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 pand xmm0, xmm3 // low nibble pand xmm1, xmm4 // high nibble @@ -795,10 +714,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -811,7 +730,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { packuswb xmm0, xmm2 paddb xmm0, xmm5 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ -830,10 +749,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -847,7 +766,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg convertloop ret @@ 
@@ -937,75 +856,6 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
#endif  // HAS_ARGBTOYJROW_AVX2

__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kARGBToY
-
-    align      4
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm4, kARGBToYJ
-    movdqa     xmm5, kAddYJ64
-
-    align      4
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    paddw      xmm0, xmm5
-    paddw      xmm2, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
@@ -1016,40 +866,6 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kBGRAToY
-
-    align      4
-  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
@@ -1084,40 +900,6 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kABGRToY
-
-    align      4
-  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
@@ -1152,40 +934,6 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kRGBAToY
-
-    align      4
-  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
@@ -1228,14 +976,19 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
@@ -1294,14 +1047,19 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
@@ -1410,147 +1168,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif  // HAS_ARGBTOUVROW_AVX2

__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
-  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5  // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0        // U
-    movhps     qword ptr [edx + edi], xmm0  // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                  uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToUJ
-    movdqa     xmm6, kARGBToVJ
-    movdqa     xmm5, kAddUVJ128
-    sub        edi, edx             // stride from u to v
-
-    align      4
-  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
-    paddw      xmm1, xmm5
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0        // U
-    movhps     qword ptr [edx + edi], xmm0  // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
@@ -1567,64 +1184,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
    align      4
  convertloop:
    /* convert to U and V */
-    movdqa     xmm0, [eax]       // U
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm7
-    pmaddubsw  xmm1, xmm7
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm3, xmm7
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psraw      xmm0, 8
-    psraw      xmm2, 8
-    packsswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-
-    movdqa     xmm0, [eax]       // V
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm6
-    pmaddubsw  xmm1, xmm6
-    pmaddubsw  xmm2, xmm6
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psraw      xmm0, 8
-    psraw      xmm2, 8
-    packsswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    lea        eax, [eax + 64]
-    movdqa     [edx + edi], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
-                                    uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
-    mov        edi, [esp + 4 + 12]  // dst_v
-    mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
-  convertloop:
-    /* convert to U and V */
    movdqu     xmm0, [eax]       // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
@@ -1683,65 +1242,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5  // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0        // U
-    movhps     qword ptr [edx + edi], xmm0  // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
-                                    uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
-    mov        edi, [esp + 4 + 12]  // dst_v
-    mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
-  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
@@ -1803,84 +1303,19 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5  // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0        // U
-    movhps     qword ptr [edx + edi], xmm0  // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kBGRAToU
-    movdqa     xmm6, kBGRAToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
-  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
+    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
+
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
@@ -1939,84 +1374,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5  // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0        // U
-    movhps     qword ptr [edx + edi], xmm0  // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kABGRToU
-    movdqa     xmm6, kABGRToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
-  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
+    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
+
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
@@ -2075,84 +1445,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5  // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0        // U
-    movhps     qword ptr [edx + edi], xmm0  // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kRGBAToU
-    movdqa     xmm6, kRGBAToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
-  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
+    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
+
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
@@ -2191,6 +1496,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
    ret
  }
}
+
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I422TOARGBROW_AVX2
@@ -2423,8 +1729,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
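The ...ToUVRow hunks above also restructure the subsample step rather than just renaming loads. The reason (editorial note, not stated in the diff itself): `pavgb` with a 128-bit memory operand requires that operand to be 16-byte aligned, so once the source may be unaligned, each row must first be loaded through a spare register with `movdqu` before averaging. A minimal intrinsics sketch of that kept pattern, assuming only SSE2:

    // Illustrative only -- mirrors the movdqu/movdqu/pavgb triple the diff keeps:
    //   movdqu xmm0, [eax]
    //   movdqu xmm4, [eax + esi]
    //   pavgb  xmm0, xmm4
    #include <emmintrin.h>
    #include <stdint.h>

    static inline __m128i AverageRows16(const uint8_t* row0, const uint8_t* row1) {
      __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(row0));
      __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(row1));
      return _mm_avg_epu8(a, b);  // per byte: (a + b + 1) >> 1
    }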
@@ -2529,7 +1835,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
  }
}

-// 8 pixels, dest unaligned.
+// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
@@ -2633,8 +1939,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -2678,8 +1984,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -2718,8 +2024,8 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -2756,214 +2062,6 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels, unaligned.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READYUV444
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       ebx
-    push       esi
-    push       edi
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // argb
-    mov        ecx, [esp + 12 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READYUV411  // modifies EBX
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* uv_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // UV
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READNV12
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* uv_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // VU
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READNV12
-    YVUTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
@@ -3004,47 +2102,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
-    movdqa     [edx], xmm5
-    movdqa     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_bgra,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // bgra
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into BGRA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm0           // GB
-    punpcklbw  xmm5, xmm2           // AR
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
    movdqu     [edx], xmm5
    movdqu     [edx + 16], xmm0
    lea        edx,  [edx + 32]
@@ -3086,47 +2143,6 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
    movdqa     xmm1, xmm2
    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_abgr,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // abgr
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm2, xmm1           // RG
-    punpcklbw  xmm0, xmm5           // BA
-    movdqa     xmm1, xmm2
-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
@@ -3168,47 +2184,6 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
-    movdqa     [edx], xmm5
-    movdqa     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_rgba,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgba
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pxor       xmm4, xmm4
-
-    align      4
-  convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into RGBA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm2           // GR
-    punpcklbw  xmm5, xmm0           // AB
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
-    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
    movdqu     [edx], xmm5
    movdqu     [edx + 16], xmm0
    lea        edx,  [edx + 32]
@@ -3260,8 +2235,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
    por        xmm0, xmm4
    por        xmm1, xmm4
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -3331,8 +2306,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
@@ -3468,43 +2441,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm0
-    movdqa     xmm3, xmm1
-    pand       xmm0, xmm5   // even bytes
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    psrlw      xmm2, 8      // odd bytes
-    psrlw      xmm3, 8
-    packuswb   xmm2, xmm3
-    movdqa     [edx], xmm0
-    movdqa     [edx + edi], xmm2
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
-  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
@@ -3526,6 +2462,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    ret
  }
}
+
#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_SPLITUVROW_AVX2
@@ -3581,36 +2518,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]      // read 16 U's
-    movdqa     xmm1, [eax + edx]  // and 16 V's
-    lea        eax,  [eax + 16]
-    movdqa     xmm2, xmm0
-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
-    movdqa     [edi], xmm0
-    movdqa     [edi + 16], xmm2
-    lea        edi, [edi + 32]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
-    sub        edx, eax
-
-    align      4
-  convertloop:
    movdqu     xmm0, [eax]      // read 16 U's
    movdqu     xmm1, [eax + edx]  // and 16 V's
    lea        eax,  [eax + 16]
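SplitUVRow_SSE2 and MergeUVRow_SSE2 above likewise lose their `_Unaligned_` twins, keeping only the movdqu bodies. For orientation, a scalar reference for what SplitUVRow computes (editorial sketch in the spirit of libyuv's C fallback, not copied from this commit):

    // Illustrative reference: deinterleave a packed UVUVUV... row into
    // separate U and V planes, the operation the SSE2 kernel vectorizes
    // 16 pixels at a time with pand/psrlw/packuswb.
    #include <stdint.h>

    static void SplitUVRow_Ref(const uint8_t* src_uv,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[2 * x + 0];  // even bytes are U
        dst_v[x] = src_uv[2 * x + 1];  // odd bytes are V
      }
    }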
@@ -3713,7 +2620,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
}
#endif  // HAS_COPYROW_AVX

-// Unaligned Multiple of 1.
+// Multiple of 1.
__declspec(naked) __declspec(align(16))
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
@@ -3730,7 +2637,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
}

#ifdef HAS_COPYROW_X86
-// Unaligned Multiple of 4.
+// Multiple of 4.
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
@@ -3763,19 +2670,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
    align      4
  convertloop:
-    movdqa     xmm2, [eax]
-    movdqa     xmm3, [eax + 16]
+    movdqu     xmm2, [eax]
+    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
-    movdqa     xmm4, [edx]
-    movdqa     xmm5, [edx + 16]
+    movdqu     xmm4, [edx]
+    movdqu     xmm5, [edx + 16]
    pand       xmm2, xmm0
    pand       xmm3, xmm0
    pand       xmm4, xmm1
    pand       xmm5, xmm1
    por        xmm2, xmm4
    por        xmm3, xmm5
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm3
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -3835,16 +2742,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
    punpcklbw  xmm2, xmm2
    punpckhwd  xmm3, xmm2
    punpcklwd  xmm2, xmm2
-    movdqa     xmm4, [edx]
-    movdqa     xmm5, [edx + 16]
+    movdqu     xmm4, [edx]
+    movdqu     xmm5, [edx + 16]
    pand       xmm2, xmm0
    pand       xmm3, xmm0
    pand       xmm4, xmm1
    pand       xmm5, xmm1
    por        xmm2, xmm4
    por        xmm3, xmm5
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm3
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
@@ -4173,113 +3080,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      4
-  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
@@ -4295,8 +3095,8 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
}

__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix) {
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
@@ -4339,8 +3139,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
}

__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
@@ -4385,111 +3185,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
-
-    align      4
-  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
@@ -4505,8 +3200,8 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
}

__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
@@ -4549,8 +3244,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
}

__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
@@ -4666,7 +3361,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    pand       xmm1, xmm5      // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1      // + src argb
    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertloop4
@@ -4782,16 +3477,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,

    // 4 pixel loop.
  convertloop4:
-    movdqa     xmm3, [eax]      // src argb
+    movdqu     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
-    movdqa     xmm2, [esi]      // _r_b
+    movdqu     xmm2, [esi]      // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqa     xmm1, [esi]      // _a_g
+    movdqu     xmm1, [esi]      // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
@@ -4801,7 +3496,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    pand       xmm1, xmm5      // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1      // + src argb
    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertloop4
    jmp        convertloop4b
@@ -4827,7 +3522,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    pand       xmm1, xmm5      // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1      // + src argb
    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertuloop4
@@ -4883,17 +3578,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
    align      4
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
    punpcklbw  xmm0, xmm0       // first 2
    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm0, xmm2       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
    punpckhbw  xmm1, xmm1       // next 2 pixels
    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqa     xmm2, [eax]      // alphas
+    movdqu     xmm2, [eax]      // alphas
    lea        eax, [eax + 16]
    psrlw      xmm0, 8
    pand       xmm2, xmm4
@@ -4902,7 +3597,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
    pand       xmm0, xmm5      // keep original alphas
    por        xmm0, xmm2
    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
@@ -5177,16 +3872,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]  // G
+    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // Add .5 for rounding.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0   // 8 G bytes
-    movdqa     xmm2, [eax]  // A
-    movdqa     xmm3, [eax + 16]
+    movdqu     xmm2, [eax]  // A
+    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm2, 24
    psrld      xmm3, 24
@@ -5199,8 +3894,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
    punpcklwd  xmm0, xmm3   // GGGA first 4
    punpckhwd  xmm1, xmm3   // GGGA next 4
    sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         convertloop
    ret
@@ -5237,30 +3932,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // B
-    movdqa     xmm6, [eax + 16]
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0   // 8 B values
-    movdqa     xmm5, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm5, [eax]  // G
+    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5   // 8 G values
    punpcklbw  xmm0, xmm5   // 8 BG values
-    movdqa     xmm5, [eax]  // R
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm5, [eax]  // R
+    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5   // 8 R values
-    movdqa     xmm6, [eax]  // A
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
@@ -5270,8 +3965,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
    punpcklwd  xmm0, xmm5   // BGRA first 4
    punpckhwd  xmm1, xmm5   // BGRA next 4
    sub        ecx, 8
-    movdqa     [eax], xmm0
-    movdqa     [eax + 16], xmm1
+    movdqu     [eax], xmm0
+    movdqu     [eax + 16], xmm1
    lea        eax, [eax + 32]
    jg         convertloop
    ret
@@ -5300,12 +3995,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // B
-    movdqa     xmm7, [eax + 16]
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm7, xmm2
-    movdqa     xmm6, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm6, [eax]  // G
+    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm6, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm7   // B
@@ -5315,13 +4010,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
    packuswb   xmm0, xmm0   // 8 B values
    packuswb   xmm6, xmm6   // 8 G values
    punpcklbw  xmm0, xmm6   // 8 BG values
-    movdqa     xmm1, [eax]  // R
-    movdqa     xmm7, [eax + 16]
+    movdqu     xmm1, [eax]  // R
+    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm7, xmm4
    phaddsw    xmm1, xmm7   // R
-    movdqa     xmm6, [eax]  // A
-    movdqa     xmm7, [eax + 16]
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm6, xmm5
    pmaddubsw  xmm7, xmm5
    phaddsw    xmm6, xmm7   // A
@@ -5334,8 +4029,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
    punpcklwd  xmm0, xmm1   // BGRA first 4
    punpckhwd  xmm6, xmm1   // BGRA next 4
    sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm6
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm6
    lea        eax, [eax + 32]
    lea        edx, [edx + 32]
    jg         convertloop
@@ -5368,14 +4063,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
    punpcklbw  xmm0, xmm5   // first 2 pixels
    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
-    movdqa     xmm1, [eax]  // read 4 pixels
+    movdqu     xmm1, [eax]  // read 4 pixels
    punpckhbw  xmm1, xmm5   // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3   // * interval_size
-    movdqa     xmm7, [eax]  // read 4 pixels
+    movdqu     xmm7, [eax]  // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6   // mask alpha
    paddw      xmm0, xmm4   // + interval_size / 2
@@ -5383,7 +4078,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
    packuswb   xmm0, xmm1
    por        xmm0, xmm7
    sub        ecx, 4
-    movdqa     [eax], xmm0
+    movdqu     [eax], xmm0
    lea        eax, [eax + 16]
    jg         convertloop
    ret
@@ -5407,7 +4102,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0       // first 2
@@ -5418,7 +4113,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
@@ -5775,8 +4470,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1              // sobel = sobelx + sobely
    movdqa     xmm2, xmm0              // GG
@@ -5793,10 +4488,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    por        xmm3, xmm5              // GGGA
    por        xmm0, xmm5
    sub        ecx, 16
-    movdqa     [edx], xmm1
-    movdqa     [edx + 16], xmm2
-    movdqa     [edx + 32], xmm3
-    movdqa     [edx + 48], xmm0
+    movdqu     [edx], xmm1
+    movdqu     [edx + 16], xmm2
+    movdqu     [edx + 32], xmm3
+    movdqu     [edx + 48], xmm0
    lea        edx, [edx + 64]
    jg         convertloop
@@ -5821,12 +4516,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1              // sobel = sobelx + sobely
    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
@@ -5856,8 +4551,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    align      4
  convertloop:
-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    paddusb    xmm2, xmm1              // sobel = sobelx + sobely
@@ -5874,10 +4569,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    punpcklwd  xmm7, xmm0              // Next 4
    punpckhwd  xmm1, xmm0              // Last 4
    sub        ecx, 16
-    movdqa     [edx], xmm6
-    movdqa     [edx + 16], xmm4
-    movdqa     [edx + 32], xmm7
-    movdqa     [edx + 48], xmm1
+    movdqu     [edx], xmm6
+    movdqu     [edx + 16], xmm4
+    movdqu     [edx + 32], xmm7
+    movdqu     [edx + 48], xmm1
    lea        edx, [edx + 64]
    jg         convertloop
@@ -5933,10 +4628,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
    align      4
  s4:
    // top left
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
@@ -5976,10 +4671,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
    align      4
  l4:
    // top left
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
@@ -6028,7 +4723,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
    // 1 pixel loop
    align      4
  l1:
-    movdqa     xmm0, [eax]
+    movdqu     xmm0, [eax]
    psubd      xmm0, [eax + edx * 4]
    lea        eax, [eax + 16]
    psubd      xmm0, [esi]
@@ -6084,26 +4779,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
    punpckhwd  xmm5, xmm1

    paddd      xmm0, xmm2
-    movdqa     xmm2, [esi]  // previous row above.
+    movdqu     xmm2, [esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
-    movdqa     xmm3, [esi + 16]
+    movdqu     xmm3, [esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
-    movdqa     xmm4, [esi + 32]
+    movdqu     xmm4, [esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
-    movdqa     xmm5, [esi + 48]
+    movdqu     xmm5, [esi + 48]
    lea        esi, [esi + 64]
    paddd      xmm5, xmm0

-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm3
-    movdqa     [edx + 32], xmm4
-    movdqa     [edx + 48], xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    movdqu     [edx + 32], xmm4
+    movdqu     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
@@ -6552,8 +5247,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
    align      4
  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
@@ -6580,8 +5275,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
    align      4
  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm0, 8    // Move green to bottom.
    psrld      xmm1, 8
@@ -6605,33 +5300,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // shuffler
-    movdqa     xmm5, [ecx]
-    mov        ecx, [esp + 16]   // pix
-
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    pshufb     xmm0, xmm5
-    pshufb     xmm1, xmm5
-    sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    jg         wloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                    const uint8* shuffler, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // shuffler
-    movdqa     xmm5, [ecx]
+    movdqu     xmm5, [ecx]
    mov        ecx, [esp + 16]   // pix

    align      4
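The scale.cc hunks that follow make a second, independent change: `#elif` chains in the CPU-dispatch code become separate `#if` blocks, and the 16-byte pointer/stride preconditions are dropped now that the row functions tolerate unaligned memory. The point of the `#elif` -> `#if` conversion is that when several SIMD paths are compiled into one binary, every runtime `TestCpuFlag` check gets evaluated instead of only the first compiled-in branch. A self-contained, stubbed sketch of the pattern (editorial illustration; the real code uses libyuv's TestCpuFlag/IS_ALIGNED, and the stub flag values here are invented):

    // Illustrative only. With "#elif defined(HAS_..._SSE2)", defining the NEON
    // macro would compile out the SSE2 runtime check entirely; independent
    // "#if" blocks let each compiled-in path test its own CPU flag.
    #include <cstdint>
    #include <cstdio>

    #define HAS_SCALEROWDOWN2_NEON  // pretend both paths were compiled in
    #define HAS_SCALEROWDOWN2_SSE2
    static void ScaleRowDown2_C(const uint8_t*, uint8_t*, int)    { std::puts("C"); }
    static void ScaleRowDown2_NEON(const uint8_t*, uint8_t*, int) { std::puts("NEON"); }
    static void ScaleRowDown2_SSE2(const uint8_t*, uint8_t*, int) { std::puts("SSE2"); }
    static bool HasNeon() { return false; }  // stub: running on x86
    static bool HasSse2() { return true; }   // stub: SSE2 available

    int main() {
      void (*ScaleRowDown2)(const uint8_t*, uint8_t*, int) = ScaleRowDown2_C;
    #if defined(HAS_SCALEROWDOWN2_NEON)
      if (HasNeon()) ScaleRowDown2 = ScaleRowDown2_NEON;
    #endif
    #if defined(HAS_SCALEROWDOWN2_SSE2)  // under "#elif" this check vanished
      if (HasSse2()) ScaleRowDown2 = ScaleRowDown2_SSE2;
    #endif
      ScaleRowDown2(nullptr, nullptr, 0);  // prints "SSE2"
    }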
diff --git a/source/scale.cc b/source/scale.cc
index f37da7e..09b8a2c 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -57,13 +57,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
    ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
  }
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
        (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
        ScaleRowDown2Box_SSE2);
  }
-#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -105,13 +107,15 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
    ScaleRowDown2 = filtering ?
        ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
  }
-#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
        ScaleRowDown2Box_16_SSE2);
  }
-#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -153,13 +157,13 @@ static void ScalePlaneDown4(int src_width, int src_height,
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
    ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
  }
-#elif defined(HAS_SCALEROWDOWN4_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
    ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
  }
-#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -197,14 +201,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
    ScaleRowDown4 = filtering ?
        ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
  }
-#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
    ScaleRowDown4 = filtering ?
        ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
  }
-#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -256,8 +260,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
  }
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
    if (!filtering) {
      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
@@ -336,8 +339,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
  }
#endif
#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
    if (!filtering) {
      ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
      ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
@@ -430,9 +432,9 @@ static void ScalePlaneDown38(int src_width, int src_height,
      ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
    }
  }
-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
    if (!filtering) {
      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
@@ -441,7 +443,8 @@ static void ScalePlaneDown38(int src_width, int src_height,
      ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
    }
  }
-#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -507,9 +510,9 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
    }
  }
-#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
    if (!filtering) {
      ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
      ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
@@ -518,7 +521,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
    }
  }
-#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -743,11 +747,11 @@ static void ScalePlaneBox(int src_width, int src_height,
      uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
+  if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
-      IS_ALIGNED(src_width, 16) &&
+      && IS_ALIGNED(src_width, 16)
#endif
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+      ) {
    ScaleAddRows = ScaleAddRows_SSE2;
  }
#endif
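In the ScalePlaneBox hunk above, only the optional AVOID_OVERREAD width check survives; the `IS_ALIGNED(src_ptr, 16)`/`IS_ALIGNED(src_stride, 16)` preconditions go away because the SSE2 kernels no longer require aligned memory. For reference, IS_ALIGNED as commonly defined (editorial sketch; libyuv's own macro is equivalent in effect) is just a power-of-two mask test, now applied only to element counts such as dst_width and src_width:

    // Illustrative only -- not copied from libyuv's headers.
    #include <stdint.h>
    #define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

    static_assert(IS_ALIGNED(32, 16) && !IS_ALIGNED(33, 16),
                  "true exactly when p is a multiple of the power-of-two a");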
src_height) = ScaleAddRows_16_C; #if defined(HAS_SCALEADDROWS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) #ifdef AVOID_OVERREAD - IS_ALIGNED(src_width, 16) && + && IS_ALIGNED(src_width, 16) #endif - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ) { ScaleAddRows = ScaleAddRows_16_SSE2; } #endif @@ -1111,9 +1115,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleFilterCols = ScaleColsUp2_SSE2; } #endif @@ -1244,9 +1246,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_16_C; #if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif @@ -1327,9 +1327,7 @@ static void ScalePlaneSimple(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleCols = ScaleColsUp2_SSE2; } #endif @@ -1362,9 +1360,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_16_C; #if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleCols = ScaleColsUp2_16_SSE2; } #endif diff --git a/source/scale_argb.cc b/source/scale_argb.cc index b6d5129..a798cad 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -53,16 +53,14 @@ static void ScaleARGBDown2(int src_width, int src_height, } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : ScaleARGBRowDown2Box_SSE2); } -#elif defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : ScaleARGBRowDown2_NEON; } @@ -98,14 +96,12 @@ static void ScaleARGBDown4Box(int src_width, int src_height, assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. 
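The scale.cc hunks above all make the same two changes: #elif chains linking the SIMD back ends become independent #if/#endif blocks, and the IS_ALIGNED pointer and stride preconditions are dropped. With #elif, only the first back end that was compiled in could ever be selected at runtime; with separate blocks, every compiled-in path gets its own TestCpuFlag check. A minimal, self-contained sketch of that dispatch pattern (HAS_ROWFUNC_*, RowFunc_* and PickRowFunc are illustrative names, not libyuv symbols; the "SIMD" functions here are plain C stand-ins so the sketch compiles anywhere):

    #include <stdint.h>

    #define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

    // Stand-ins for the real kernels: point-sample 2x horizontal downscale.
    // src must hold 2 * w bytes.
    static void RowFunc_C(const uint8_t* src, uint8_t* dst, int w) {
      for (int i = 0; i < w; ++i) dst[i] = src[i * 2];
    }
    #define HAS_ROWFUNC_NEON
    #define HAS_ROWFUNC_SSE2
    static void RowFunc_NEON(const uint8_t* s, uint8_t* d, int w) { RowFunc_C(s, d, w); }
    static void RowFunc_SSE2(const uint8_t* s, uint8_t* d, int w) { RowFunc_C(s, d, w); }

    typedef void (*RowFn)(const uint8_t*, uint8_t*, int);

    RowFn PickRowFunc(int has_neon, int has_sse2, int dst_width) {
      RowFn fn = RowFunc_C;                 // scalar fallback always works
    #if defined(HAS_ROWFUNC_NEON)           // independent block ends in #endif...
      if (has_neon && IS_ALIGNED(dst_width, 8)) fn = RowFunc_NEON;
    #endif
    #if defined(HAS_ROWFUNC_SSE2)           // ...so this check still runs even
      if (has_sse2 && IS_ALIGNED(dst_width, 16)) fn = RowFunc_SSE2;
    #endif                                  // when the NEON block is compiled in.
      return fn;
    }

Because later blocks overwrite earlier choices, each candidate that fails its runtime check simply leaves the previous selection in place, and the scalar C version remains the fallback when nothing qualifies.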
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index b6d5129..a798cad 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -53,16 +53,14 @@ static void ScaleARGBDown2(int src_width, int src_height,
   }
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
     ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
         (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
         ScaleARGBRowDown2Box_SSE2);
   }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
         ScaleARGBRowDown2_NEON;
   }
@@ -98,14 +96,12 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
   assert(dx == 65536 * 4);  // Test scale factor of 4.
   assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
     ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
   }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
     ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
   }
 #endif
@@ -139,14 +135,13 @@ static void ScaleARGBDownEven(int src_width, int src_height,
   assert(IS_ALIGNED(src_height, 2));
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
     ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
         ScaleARGBRowDownEven_SSE2;
   }
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) {
     ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
         ScaleARGBRowDownEven_NEON;
   }
@@ -334,9 +329,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
@@ -510,9 +503,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
@@ -619,9 +610,7 @@ static void ScaleARGBSimple(int src_width, int src_height,
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
    }
 #endif
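scale_argb.cc repeats the same pattern, and shows why the IS_ALIGNED(ptr, 16) and stride checks can go at all: once an SSE2 row kernel reads and writes with unaligned instructions (movdqu rather than movdqa), any address is legal, and only the dst_width multiple still matters for the vector loop. A hedged sketch of such an unaligned-tolerant SSE2 row kernel, not libyuv's actual code; it assumes width_bytes is a multiple of 16 and both rows have that many bytes:

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <stdint.h>

    // Average two rows byte-wise; no alignment requirement on any pointer.
    void AverageRows_SSE2(const uint8_t* row0, const uint8_t* row1,
                          uint8_t* dst, int width_bytes) {
      for (int i = 0; i < width_bytes; i += 16) {
        __m128i a = _mm_loadu_si128((const __m128i*)(row0 + i));  // movdqu load
        __m128i b = _mm_loadu_si128((const __m128i*)(row1 + i));  // movdqu load
        // pavgb: rounded average, (a + b + 1) >> 1 per byte.
        _mm_storeu_si128((__m128i*)(dst + i), _mm_avg_epu8(a, b));
      }
    }

On modern x86 cores movdqu on an actually-aligned address costs the same as movdqa, so dropping the guards trades nothing on the fast path while letting the SIMD kernels run on arbitrary buffers.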
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index e31a6c9..933abd4 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -20,7 +20,6 @@ extern "C" {
 // This module is for GCC Neon armv8 64 bit.
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-#ifdef HAS_SCALEROWDOWN2_NEON
 // Read 32x1 throw away even pixels, and write 16x1.
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width) {
@@ -40,9 +39,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   : "v0", "v1"  // Clobber List
   );
 }
-#endif //HAS_SCALEROWDOWN2_NEON
-#ifdef HAS_SCALEROWDOWN2_NEON
 // Read 32x2 average down and write 16x1.
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
@@ -72,9 +69,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   : "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
-#endif //HAS_SCALEROWDOWN2_NEON
-#ifdef HAS_SCALEROWDOWN4_NEON
 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   asm volatile (
@@ -92,9 +87,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   : "v0", "v1", "v2", "v3", "memory", "cc"
   );
 }
-#endif //HAS_SCALEROWDOWN4_NEON
-#ifdef HAS_SCALEROWDOWN4_NEON
 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   const uint8* src_ptr1 = src_ptr + src_stride;
@@ -130,9 +123,7 @@ asm volatile (
   : "v0", "v1", "v2", "v3", "memory", "cc"
   );
 }
-#endif //HAS_SCALEROWDOWN4_NEON
-#ifdef HAS_SCALEROWDOWN34_NEON
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 // to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
@@ -155,9 +146,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
   : "v0", "v1", "v2", "v3", "memory", "cc"
   );
 }
-#endif //HAS_SCALEROWDOWN34_NEON
-#ifdef HAS_SCALEROWDOWN34_NEON
 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
@@ -217,9 +206,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
     "v20", "memory", "cc"
   );
 }
-#endif //ScaleRowDown34_0_Box_NEON
-#ifdef HAS_SCALEROWDOWN34_NEON
 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
@@ -262,9 +249,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
   );
 }
-#endif //HAS_SCALEROWDOWN34_NEON
-#ifdef HAS_SCALEROWDOWN38_NEON
 static uvec8 kShuf38 =
     { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
 static uvec8 kShuf38_2 =
@@ -301,9 +286,6 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
   );
 }
-#endif //HAS_SCALEROWDOWN38_NEON
-
-#ifdef HAS_SCALEROWDOWN38_NEON
 // 32x3 -> 12x1
 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
@@ -432,9 +414,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
     "v30", "v31", "memory", "cc"
   );
 }
-#endif //HAS_SCALEROWDOWN38_NEON
-#ifdef HAS_SCALEROWDOWN38_NEON
 // 32x2 -> 12x1
 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
@@ -456,7 +436,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
     MEMACCESS(0)
     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
     MEMACCESS(3)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
     "subs       %3, %3, #12                             \n"
     // Shuffle the input data around to get align the data
@@ -541,7 +521,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
     "v18", "v19", "v30", "v31", "memory", "cc"
   );
 }
-#endif //HAS_SCALEROWDOWN38_NEON
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
@@ -643,7 +622,6 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
   );
 }
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst, int dst_width) {
   asm volatile (
@@ -666,9 +644,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst, int dst_width) {
   asm volatile (
@@ -703,9 +679,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
   );
 }
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
@@ -731,9 +705,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
   : "memory", "cc", "v0"
   );
 }
-#endif //HAS_SCALEARGBROWDOWNEVEN_NEON
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
 // TODO, might be worth another optimization pass in future.
@@ -786,7 +758,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
   : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   );
 }
-#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #ifdef __cplusplus
diff --git a/source/scale_posix.cc b/source/scale_posix.cc
index 6320e50..92e3354 100644
--- a/source/scale_posix.cc
+++ b/source/scale_posix.cc
@@ -526,8 +526,9 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     LABELALIGN
   "1:                                          \n"
     "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
-    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
+    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
    "lea        " MEMLEA(0x10,0) ",%0           \n"
+    "pavgb      %%xmm1,%%xmm0                  \n"
    "movdqa     %%xmm0,%%xmm1                   \n"
    "pshufb     %%xmm2,%%xmm1                   \n"
    "movdqa     %%xmm0,%%xmm6                   \n"
diff --git a/source/scale_win.cc b/source/scale_win.cc
index 091587e..8370ef4 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -589,8 +589,9 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     align      4
   xloop:
     movdqu     xmm0, [eax]           // average 2 rows into xmm0
-    pavgb      xmm0, [eax + esi]
+    movdqu     xmm1, [eax + esi]
     lea        eax,  [eax + 16]
+    pavgb      xmm0, xmm1
     movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
     pshufb     xmm1, xmm2
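The matching scale_posix.cc and scale_win.cc hunks fix the one instruction that still silently required alignment: pavgb with a 128-bit memory operand is an SSE form whose memory operand must be 16-byte aligned, so once the callers stopped guaranteeing aligned buffers it could fault. The fix loads the second row through movdqu into a register and averages register-to-register, which has no alignment requirement. A standalone sketch of the corrected sequence, assuming GCC-style inline asm on an x86 target with SSE2 (the function name and scope are illustrative):

    #include <stdint.h>

    // Average one 16-byte chunk of two rows into dst; p, q, dst may be
    // arbitrarily aligned because every memory access goes through movdqu.
    void AveragePair16(const uint8_t* p, const uint8_t* q, uint8_t* dst) {
      asm volatile (
        "movdqu     (%0),%%xmm0                \n"  // row 0, any alignment
        "movdqu     (%1),%%xmm1                \n"  // row 1, any alignment
        "pavgb      %%xmm1,%%xmm0              \n"  // register form: no
        "movdqu     %%xmm0,(%2)                \n"  // alignment constraint
        :
        : "r"(p), "r"(q), "r"(dst)
        : "memory", "xmm0", "xmm1"
      );
    }

Also worth noting in the scale_neon64.cc hunk above: the 32x2 box filter's second ld4 was reading through operand %3, which the very next instruction (subs %3, %3, #12) shows to be the loop counter; the fix points it at %2, the second-row pointer.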
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index a50b33f..fd82ed1 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -430,8 +430,8 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
   align_buffer_64(src_y, kWidth * kHeight + OFF); \
   align_buffer_64(src_u, kSizeUV + OFF); \
   align_buffer_64(src_v, kSizeUV + OFF); \
-  align_buffer_64(dst_argb_c, kStrideB * kHeight); \
-  align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
+  align_buffer_64(dst_argb_c, kStrideB * kHeight + OFF); \
+  align_buffer_64(dst_argb_opt, kStrideB * kHeight + OFF); \
   srandom(time(NULL)); \
   for (int i = 0; i < kWidth * kHeight; ++i) { \
     src_y[i + OFF] = (random() & 0xff); \
@@ -440,20 +440,20 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
     src_u[i + OFF] = (random() & 0xff); \
     src_v[i + OFF] = (random() & 0xff); \
   } \
-  memset(dst_argb_c, 1, kStrideB * kHeight); \
-  memset(dst_argb_opt, 101, kStrideB * kHeight); \
+  memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+  memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
   MaskCpuFlags(0); \
   FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
                         src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
                         src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
-                        dst_argb_c, kStrideB, \
+                        dst_argb_c + OFF, kStrideB, \
                         kWidth, NEG kHeight); \
   MaskCpuFlags(-1); \
   for (int i = 0; i < benchmark_iterations_; ++i) { \
     FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
                           src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
                           src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
-                          dst_argb_opt, kStrideB, \
+                          dst_argb_opt + OFF, kStrideB, \
                           kWidth, NEG kHeight); \
   } \
   int max_diff = 0; \
@@ -462,10 +462,10 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
   align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \
   memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \
   memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \
-  FMT_B##To##FMT_C(dst_argb_c, kStrideB, \
+  FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \
                    dst_argb32_c, kWidth * BPP_C , \
                    kWidth, kHeight); \
-  FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \
+  FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \
                    dst_argb32_opt, kWidth * BPP_C , \
                    kWidth, kHeight); \
   for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 6a2bc79..bbeb4f8 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -223,6 +223,7 @@ TEST_FACTOR(2, 1 / 2, 1 / 2)
 TEST_FACTOR(4, 1 / 4, 1 / 4)
 TEST_FACTOR(8, 1 / 8, 1 / 8)
 TEST_FACTOR(3by4, 3 / 4, 3 / 4)
+TEST_FACTOR(3by8, 3 / 8, 3 / 8)
 #undef TEST_FACTOR1
 #undef TEST_FACTOR
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 00f0707..5d08365 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -288,6 +288,7 @@ TEST_FACTOR(2, 1 / 2, 1 / 2)
 TEST_FACTOR(4, 1 / 4, 1 / 4)
 TEST_FACTOR(8, 1 / 8, 1 / 8)
 TEST_FACTOR(3by4, 3 / 4, 3 / 4)
+TEST_FACTOR(3by8, 3 / 8, 3 / 8)
 #undef TEST_FACTOR1
 #undef TEST_FACTOR
diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h
index cfce548..0151796 100644
--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@@ -26,11 +26,14 @@ static __inline int Abs(int v) {
   return v >= 0 ? v : -v;
 }
+#define OFFBY 0
+
 #define align_buffer_page_end(var, size) \
   uint8* var; \
   uint8* var##_mem; \
-  var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \
-  var = var##_mem + (-(size) & 4095);
+  var##_mem = reinterpret_cast<uint8*>(malloc((((size) + 4095) & ~4095) + \
+      OFFBY)); \
+  var = var##_mem + (-(size) & 4095) + OFFBY;
 #define free_aligned_buffer_page_end(var) \
   free(var##_mem); \
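The unit_test changes exercise exactly the relaxed requirements: convert_test.cc now applies the same OFF displacement to destination buffers as to sources, the scale tests add a 3/8 factor, and unit_test.h introduces an OFFBY knob (0 here) that can misalign every page-end buffer. A C-style sketch of what align_buffer_page_end computes, with kOffBy standing in for OFFBY (the function form is illustrative; the real code is the macro shown above):

    #include <stdint.h>
    #include <stdlib.h>

    enum { kOffBy = 0 };  // set to e.g. 1 to deliberately misalign test buffers

    // Allocate whole pages and place the buffer so it ends at a page boundary:
    // any read past 'size' bytes runs off the allocation and is likely to
    // fault, which is how over-reads in SIMD kernels get caught.
    static uint8_t* AllocPageEnd(size_t size, uint8_t** mem_out) {
      size_t rounded = (size + 4095) & ~(size_t)4095;       // round up to pages
      uint8_t* mem = (uint8_t*)malloc(rounded + kOffBy);
      *mem_out = mem;                                       // caller frees this
      return mem + ((0 - size) & 4095) + kOffBy;            // end at page edge
    }

Shifting destinations by OFF and starts by OFFBY means the C and SIMD paths are compared on buffers with no special alignment, which is precisely the guarantee the dropped IS_ALIGNED checks no longer provide.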