diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-07-12 16:58:33 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-07-12 16:58:33 +0000 |
commit | 3ffef2f45795a505468f8bfe0f7000c8276033ea (patch) | |
tree | c9445be70dbfa7de3e9a3d43358d10e5582e5097 | |
parent | a9d9cdc6db5e02f60d4676b391b4599204d38278 (diff) | |
parent | eee6aacd0da152622c837c25d87908eebece5f28 (diff) | |
download | libgav1-android13-mainline-go-uwb-release.tar.gz |
Snap for 8821814 from eee6aacd0da152622c837c25d87908eebece5f28 to mainline-go-uwb-release (refs: aml_go_uwb_330912000, android13-mainline-go-uwb-release)
Change-Id: I0ab584bd5d845b0072708bc6991954abdc7fd2af
85 files changed, 6117 insertions, 3927 deletions
@@ -79,6 +79,7 @@ cc_library_static { "libgav1/src/dsp/arm/intrapred_smooth_neon.cc", "libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc", "libgav1/src/dsp/arm/inverse_transform_neon.cc", + "libgav1/src/dsp/arm/loop_filter_10bit_neon.cc", "libgav1/src/dsp/arm/loop_filter_neon.cc", "libgav1/src/dsp/arm/loop_restoration_10bit_neon.cc", "libgav1/src/dsp/arm/loop_restoration_neon.cc", diff --git a/README.version b/README.version index 53d5b62..13d4b14 100644 --- a/README.version +++ b/README.version @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/codecs/libgav1 -Version: v0.17.0 +Version: v0.18.0 BugComponent: 324837 Local Modifications: None diff --git a/libgav1/CMakeLists.txt b/libgav1/CMakeLists.txt index 4029de1..52b1b32 100644 --- a/libgav1/CMakeLists.txt +++ b/libgav1/CMakeLists.txt @@ -48,6 +48,8 @@ libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations." VALUE ON) libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING "Enables sse4.1 optimizations." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_EXAMPLES HELPSTRING "Enables examples." VALUE + ON) libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON) libgav1_option( NAME LIBGAV1_VERBOSE HELPSTRING @@ -101,6 +103,12 @@ libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY) # Controls use of std::mutex and absl::Mutex in ThreadPool. libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX) +if((DEFINED + LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX) + OR NOT (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX OR ANDROID OR IOS)) + set(use_absl_threading TRUE) +endif() if(LIBGAV1_VERBOSE) libgav1_dump_cmake_flag_variables() @@ -124,18 +132,22 @@ endif() libgav1_set_test_flags() set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp") -if(NOT EXISTS "${libgav1_abseil}") - message( - FATAL_ERROR - "Abseil not found. 
This dependency is required by the" - " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is" - " not defined. To continue, download the Abseil repository to" - " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n" - " clone \\\n" - " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp") +if(EXISTS "${libgav1_abseil}") + set(ABSL_PROPAGATE_CXX_STD ON) + add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" + EXCLUDE_FROM_ALL) +else() + if(use_absl_threading OR LIBGAV1_ENABLE_EXAMPLES OR LIBGAV1_ENABLE_TESTS) + message( + FATAL_ERROR + "Abseil not found. This dependency is required by the" + " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is" + " not defined. To continue, download the Abseil repository to" + " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n" + " clone \\\n" + " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp") + endif() endif() -set(ABSL_PROPAGATE_CXX_STD ON) -add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL) libgav1_reset_target_lists() libgav1_add_dsp_targets() diff --git a/libgav1/README.md b/libgav1/README.md index 6744291..04c6a94 100644 --- a/libgav1/README.md +++ b/libgav1/README.md @@ -1,7 +1,7 @@ # libgav1 -- an AV1 decoder -libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More -information on the AV1 video format can be found at +libgav1 is a Main profile (0), High profile (1) & Professional profile (2) +compliant AV1 decoder. More information on the AV1 video format can be found at [aomedia.org](https://aomedia.org). 
[TOC] diff --git a/libgav1/cmake/libgav1_build_definitions.cmake b/libgav1/cmake/libgav1_build_definitions.cmake index 0d00bb6..3885dcd 100644 --- a/libgav1/cmake/libgav1_build_definitions.cmake +++ b/libgav1/cmake/libgav1_build_definitions.cmake @@ -142,8 +142,10 @@ macro(libgav1_set_build_definitions) if(NOT LIBGAV1_MAX_BITDEPTH) set(LIBGAV1_MAX_BITDEPTH 10) - elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10) - libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.") + elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 + AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10 + AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 12) + libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.") endif() list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}") diff --git a/libgav1/cmake/libgav1_install.cmake b/libgav1/cmake/libgav1_install.cmake index b7f6006..e2c79b9 100644 --- a/libgav1/cmake/libgav1_install.cmake +++ b/libgav1/cmake/libgav1_install.cmake @@ -48,8 +48,10 @@ macro(libgav1_setup_install_target) FILES ${libgav1_api_includes} DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1") - install(TARGETS gav1_decode DESTINATION - "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + if(LIBGAV1_ENABLE_EXAMPLES) + install(TARGETS gav1_decode DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + endif() install(TARGETS libgav1_static DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") if(BUILD_SHARED_LIBS) diff --git a/libgav1/cmake/toolchains/android.cmake b/libgav1/cmake/toolchains/android.cmake index 492957b..b550397 100644 --- a/libgav1/cmake/toolchains/android.cmake +++ b/libgav1/cmake/toolchains/android.cmake @@ -30,9 +30,9 @@ if(NOT ANDROID_ABI) set(ANDROID_ABI arm64-v8a) endif() -# Force arm mode for 32-bit targets (instead of the default thumb) to improve -# performance. -if(NOT ANDROID_ARM_MODE) +# Force arm mode for 32-bit arm targets (instead of the default thumb) to +# improve performance. 
+if(ANDROID_ABI MATCHES "^armeabi" AND NOT ANDROID_ARM_MODE) set(ANDROID_ARM_MODE arm) endif() diff --git a/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake index 7448f54..7d58ce1 100644 --- a/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake +++ b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake @@ -27,10 +27,13 @@ endif() if(NOT CMAKE_C_COMPILER) set(CMAKE_C_COMPILER ${CROSS}gcc) endif() -set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm") +# Note: -march=armv7-a+fp is an alternative to -mfpu with newer versions of +# gcc: +# https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=dff2abcbee65dbb4b7ca3ade0f7622ffdc0af391 +set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3") if(NOT CMAKE_CXX_COMPILER) set(CMAKE_CXX_COMPILER ${CROSS}g++) endif() -set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm") +set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3") set(CMAKE_SYSTEM_PROCESSOR "armv7") set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon") diff --git a/libgav1/examples/libgav1_examples.cmake b/libgav1/examples/libgav1_examples.cmake index 1f949f3..a3ec156 100644 --- a/libgav1/examples/libgav1_examples.cmake +++ b/libgav1/examples/libgav1_examples.cmake @@ -17,6 +17,13 @@ if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_) endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1) +if(NOT LIBGAV1_ENABLE_EXAMPLES) + macro(libgav1_add_examples_targets) + + endmacro() + return() +endif() + set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc" "${libgav1_examples}/file_reader.h" "${libgav1_examples}/file_reader_constants.cc" diff --git a/libgav1/src/buffer_pool.cc b/libgav1/src/buffer_pool.cc index c1a5606..582f13c 100644 --- a/libgav1/src/buffer_pool.cc +++ b/libgav1/src/buffer_pool.cc @@ -156,19 +156,15 @@ bool BufferPool::OnFrameBufferSizeChanged(int bitdepth, } RefCountedBufferPtr BufferPool::GetFreeBuffer() { - // In frame parallel mode, the GetFreeBuffer() calls from 
ObuParser all happen - // from the same thread serially, but the GetFreeBuffer() call in - // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same - // time. So this function has to be thread safe. - // TODO(b/142583029): Investigate if the GetFreeBuffer() call in - // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function - // need not be thread safe. std::unique_lock<std::mutex> lock(mutex_); for (auto buffer : buffers_) { if (!buffer->in_use_) { buffer->in_use_ = true; buffer->progress_row_ = -1; buffer->frame_state_ = kFrameStateUnknown; + buffer->hdr_cll_set_ = false; + buffer->hdr_mdcv_set_ = false; + buffer->itut_t35_set_ = false; lock.unlock(); return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool); } diff --git a/libgav1/src/buffer_pool.h b/libgav1/src/buffer_pool.h index d9eba6d..d4e50e0 100644 --- a/libgav1/src/buffer_pool.h +++ b/libgav1/src/buffer_pool.h @@ -33,6 +33,7 @@ #include "src/symbol_decoder_context.h" #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" +#include "src/utils/dynamic_buffer.h" #include "src/utils/reference_info.h" #include "src/utils/segmentation.h" #include "src/utils/segmentation_map.h" @@ -134,6 +135,36 @@ class RefCountedBuffer : public MaxAlignedAllocable { int temporal_id() const { return temporal_id_; } void set_temporal_id(int value) { temporal_id_ = value; } + ObuMetadataHdrCll hdr_cll() const { return hdr_cll_; } + void set_hdr_cll(const ObuMetadataHdrCll& hdr_cll) { + hdr_cll_set_ = true; + hdr_cll_ = hdr_cll; + } + bool hdr_cll_set() const { return hdr_cll_set_; } + + ObuMetadataHdrMdcv hdr_mdcv() const { return hdr_mdcv_; } + void set_hdr_mdcv(const ObuMetadataHdrMdcv& hdr_mdcv) { + hdr_mdcv_set_ = true; + hdr_mdcv_ = hdr_mdcv; + } + bool hdr_mdcv_set() const { return hdr_mdcv_set_; } + + ObuMetadataItutT35 itut_t35() const { return itut_t35_; } + bool set_itut_t35(const ObuMetadataItutT35& itut_t35, + const uint8_t* const payload) { + 
itut_t35_ = itut_t35; + if (itut_t35.payload_size > 0) { + if (!itut_t35_payload_.Resize(itut_t35.payload_size)) return false; + memcpy(itut_t35_payload_.get(), payload, itut_t35.payload_size); + itut_t35_.payload_bytes = itut_t35_payload_.get(); + } else { + itut_t35_.payload_bytes = nullptr; + } + itut_t35_set_ = true; + return true; + } + bool itut_t35_set() const { return itut_t35_set_; } + SegmentationMap* segmentation_map() { return &segmentation_map_; } const SegmentationMap* segmentation_map() const { return &segmentation_map_; } @@ -317,6 +348,14 @@ class RefCountedBuffer : public MaxAlignedAllocable { int spatial_id_ = 0; int temporal_id_ = 0; + ObuMetadataHdrCll hdr_cll_ = {}; + bool hdr_cll_set_ = false; // Set to true when set_hdr_cll() is called. + ObuMetadataHdrMdcv hdr_mdcv_ = {}; + bool hdr_mdcv_set_ = false; // Set to true when set_hdr_mdcv() is called. + ObuMetadataItutT35 itut_t35_ = {}; + DynamicBuffer<uint8_t> itut_t35_payload_; + bool itut_t35_set_ = false; // Set to true when set_itut_t35() is called. + // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array. 
SegmentationMap segmentation_map_; diff --git a/libgav1/src/decoder_impl.cc b/libgav1/src/decoder_impl.cc index dbb9e81..e8de64a 100644 --- a/libgav1/src/decoder_impl.cc +++ b/libgav1/src/decoder_impl.cc @@ -1171,6 +1171,24 @@ StatusCode DecoderImpl::CopyFrameToOutputBuffer( buffer_.spatial_id = frame->spatial_id(); buffer_.temporal_id = frame->temporal_id(); buffer_.buffer_private_data = frame->buffer_private_data(); + if (frame->hdr_cll_set()) { + buffer_.has_hdr_cll = 1; + buffer_.hdr_cll = frame->hdr_cll(); + } else { + buffer_.has_hdr_cll = 0; + } + if (frame->hdr_mdcv_set()) { + buffer_.has_hdr_mdcv = 1; + buffer_.hdr_mdcv = frame->hdr_mdcv(); + } else { + buffer_.has_hdr_mdcv = 0; + } + if (frame->itut_t35_set()) { + buffer_.has_itut_t35 = 1; + buffer_.itut_t35 = frame->itut_t35(); + } else { + buffer_.has_itut_t35 = 0; + } output_frame_ = frame; return kStatusOk; } @@ -1602,7 +1620,7 @@ StatusCode DecoderImpl::ApplyFilmGrain( (*film_grain_frame)->buffer()->stride(kPlaneV)); const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU); #if LIBGAV1_MAX_BITDEPTH >= 10 - if (displayable_frame->buffer()->bitdepth() > 8) { + if (displayable_frame->buffer()->bitdepth() == 10) { FilmGrain<10> film_grain(displayable_frame->film_grain_params(), displayable_frame->buffer()->is_monochrome(), color_matrix_is_identity, @@ -1625,6 +1643,30 @@ StatusCode DecoderImpl::ApplyFilmGrain( return kStatusOk; } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 + if (displayable_frame->buffer()->bitdepth() == 12) { + FilmGrain<12> film_grain(displayable_frame->film_grain_params(), + displayable_frame->buffer()->is_monochrome(), + color_matrix_is_identity, + displayable_frame->buffer()->subsampling_x(), + displayable_frame->buffer()->subsampling_y(), + displayable_frame->upscaled_width(), + displayable_frame->frame_height(), thread_pool); + if (!film_grain.AddNoise( + displayable_frame->buffer()->data(kPlaneY), + 
displayable_frame->buffer()->stride(kPlaneY), + displayable_frame->buffer()->data(kPlaneU), + displayable_frame->buffer()->data(kPlaneV), input_stride_uv, + (*film_grain_frame)->buffer()->data(kPlaneY), + (*film_grain_frame)->buffer()->stride(kPlaneY), + (*film_grain_frame)->buffer()->data(kPlaneU), + (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) { + LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed."); + return kStatusOutOfMemory; + } + return kStatusOk; + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 FilmGrain<8> film_grain(displayable_frame->film_grain_params(), displayable_frame->buffer()->is_monochrome(), color_matrix_is_identity, diff --git a/libgav1/src/decoder_impl.h b/libgav1/src/decoder_impl.h index b52ecdf..b75417d 100644 --- a/libgav1/src/decoder_impl.h +++ b/libgav1/src/decoder_impl.h @@ -141,8 +141,9 @@ class DecoderImpl : public Allocable { int64_t user_private_data, void* buffer_private_data); StatusCode DequeueFrame(const DecoderBuffer** out_ptr); static constexpr int GetMaxBitdepth() { - static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10, - "LIBGAV1_MAX_BITDEPTH must be 8 or 10."); + static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10 || + LIBGAV1_MAX_BITDEPTH == 12, + "LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12."); return LIBGAV1_MAX_BITDEPTH; } diff --git a/libgav1/src/dsp/arm/common_neon.h b/libgav1/src/dsp/arm/common_neon.h index 9c46525..c0af2c1 100644 --- a/libgav1/src/dsp/arm/common_neon.h +++ b/libgav1/src/dsp/arm/common_neon.h @@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source, return dst; } +inline uint16x8_t MaskOverreadsQ(const uint16x8_t source, + const ptrdiff_t over_read_in_bytes) { + return vreinterpretq_u16_u8( + MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes)); +} + inline uint8x8_t Load1MsanU8(const uint8_t* const source, const ptrdiff_t over_read_in_bytes) { return MaskOverreads(vld1_u8(source), over_read_in_bytes); @@ -325,20 +331,6 @@ 
inline uint16x8_t Load1QMsanU16(const uint16_t* const source, vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes)); } -inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source, - const ptrdiff_t over_read_in_bytes) { - // Relative source index of elements (2 bytes each): - // dst.val[0]: 00 02 04 06 08 10 12 14 - // dst.val[1]: 01 03 05 07 09 11 13 15 - uint16x8x2_t dst = vld2q_u16(source); - dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ( - vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1)); - dst.val[1] = vreinterpretq_u16_u8( - MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]), - (over_read_in_bytes >> 1) + (over_read_in_bytes % 4))); - return dst; -} - inline uint32x4_t Load1QMsanU32(const uint32_t* const source, const ptrdiff_t over_read_in_bytes) { return vreinterpretq_u32_u8(MaskOverreadsQ( @@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) { vst1q_u16(static_cast<uint16_t*>(buf), val); } +inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) { +#if LIBGAV1_MSAN + // The memory shadow is incorrect for vst4q_u16, only marking the first 16 + // bytes of the destination as initialized. To avoid missing truly + // uninitialized memory, check the input vectors first, before marking the + // whole 64 bytes initialized. If any input vector contains unused values, it + // should pass through MaskOverreadsQ first. + __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0])); + __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1])); + __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2])); + __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3])); + vst4q_s16(static_cast<int16_t*>(buf), src); + __msan_unpoison(buf, sizeof(int16x8x4_t)); +#else + vst4q_s16(static_cast<int16_t*>(buf), src); +#endif // LIBGAV1_MSAN +} + //------------------------------------------------------------------------------ // Pointer helpers. 
@@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) { //------------------------------------------------------------------------------ // Saturation helpers. -inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) { +inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low, + const int16x4_t high) { return vmin_s16(vmax_s16(val, low), high); } @@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low, return vminq_s16(vmaxq_s16(val, low), high); } -inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) { +inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) { const int16x8_t low = vdupq_n_s16(0); const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1); @@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); } // Output: // b0.val[0]: 00 01 02 03 16 17 18 19 // b0.val[1]: 04 05 06 07 20 21 22 23 -inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) { +inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) { int16x8x2_t b0; b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); @@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) { return b0; } -inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { +inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) { uint16x8x2_t b0; b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); @@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { // 10 11 12 13 // 20 21 22 23 // 30 31 32 33 +// Output: +// 00 10 20 30 +// 01 11 21 31 +// 02 12 22 32 +// 03 13 23 33 inline void Transpose4x4(uint16x4_t a[4]) { // b: // 00 10 02 12 diff --git a/libgav1/src/dsp/arm/convolve_10bit_neon.cc b/libgav1/src/dsp/arm/convolve_10bit_neon.cc index b7205df..389f029 100644 --- 
a/libgav1/src/dsp/arm/convolve_10bit_neon.cc +++ b/libgav1/src/dsp/arm/convolve_10bit_neon.cc @@ -45,12 +45,12 @@ namespace { // Pixel output range: [ 0, 1023] // Compound output range: [ 3988, 61532] -template <int filter_index> +template <int num_taps> int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x8_t*>(src); int32x4x2_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -65,7 +65,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -84,7 +84,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -106,12 +106,12 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, return sum; } -template <int filter_index> +template <int num_taps> int32x4_t SumOnePassTaps(const uint16x4_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x4_t*>(src); int32x4_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. 
sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -119,7 +119,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[3], taps[3]); sum = vmlal_s16(sum, ssrc[4], taps[4]); sum = vmlal_s16(sum, ssrc[5], taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -129,7 +129,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[5], taps[5]); sum = vmlal_s16(sum, ssrc[6], taps[6]); sum = vmlal_s16(sum, ssrc[7], taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -143,7 +143,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, return sum; } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -162,15 +162,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(s + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -179,17 +179,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = 
vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); + } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } const int16x4_t d0 = @@ -213,15 +213,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(src + x + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -230,17 +230,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); 
+ } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); @@ -276,7 +276,7 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -291,14 +291,14 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, int32x4_t v_sum; const uint16x8_t src_long = vld1q_u16(src); v_src[0] = vget_low_u16(src_long); - if (filter_index == 3) { + if (num_taps == 2) { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); } else { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2)); v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound || is_2d) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); @@ -321,7 +321,7 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_2d> +template <int num_taps, bool is_2d> void 
FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -336,7 +336,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride)); const int16x8x2_t input = vzipq_s16(input0, input1); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)), @@ -387,7 +387,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, assert(height % 2 == 1); const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src)); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]); @@ -406,17 +406,17 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, } } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int width, const int height, const int16x4_t* const v_tap) { - assert(width < 8 || filter_index <= 3); + assert(width < 8 || num_taps != 4); // Don't simplify the redundant if conditions with the template parameters, // which helps the compiler generate compact code. 
- if (width >= 8 && filter_index <= 3) { - FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>( + if (width >= 8 && num_taps != 4) { + FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, width, height, v_tap); return; } @@ -424,17 +424,17 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. assert(width <= 4); - assert(filter_index >= 3 && filter_index <= 5); - if (filter_index >= 3 && filter_index <= 5) { + assert(num_taps == 2 || num_taps == 4); + if (num_taps == 2 || num_taps == 4) { if (width == 4) { - FilterHorizontalWidth4<filter_index, is_compound, is_2d>( + FilterHorizontalWidth4<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, height, v_tap); return; } assert(width == 2); if (!is_compound) { - FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, - pred_stride, height, v_tap); + FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest, + pred_stride, height, v_tap); } } } @@ -455,22 +455,17 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } if (filter_index == 2) { // 8 tap. - FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride, + FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 1) { // 6 tap. - FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst, + } else if (filter_index < 2) { // 6 tap. + FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 0) { // 6 tap. - FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst, - dst_stride, width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. + } else if ((filter_index & 0x4) != 0) { // 4 tap. 
+ // ((filter_index == 4) | (filter_index == 5)) FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. - FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst, - dst_stride, width, height, v_tap); } else { // 2 tap. - FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst, + FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst, dst_stride, width, height, v_tap); } } @@ -510,13 +505,12 @@ void ConvolveCompoundHorizontal_NEON( filter_index); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* const dst16 = static_cast<uint16_t*>(dst); @@ -555,7 +549,7 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, srcs[next_row] = vld1q_u16(src_x); src_x += src_stride; - const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); const int16x4_t d0 = @@ -593,13 +587,12 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, } while (x < width); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; 
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -633,8 +626,8 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, srcs[num_taps] = vld1_u16(src); src += src_stride; - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); - const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); + const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps); if (is_compound) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); const int16x4_t d1 = @@ -673,13 +666,12 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index> +template <int num_taps> void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -718,7 +710,7 @@ void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, src += src_stride; srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2); - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); const uint16x4_t d0 = vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); Store2<0>(dst16, d0); @@ -1180,13 +1172,13 @@ void ConvolveVertical_NEON( if (filter_index == 0) { // 6 tap. 
if (width == 2) { - FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1196,33 +1188,33 @@ void ConvolveVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. if (width == 2) { - FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 2) { - FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<8>(src, src_stride, dest, dest_stride, width, height, taps); } } else if (filter_index == 3) { // 2 tap. 
if (width == 2) { - FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else if (width == 4) { - FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else { - FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps + 3); } } else { @@ -1240,13 +1232,13 @@ void ConvolveVertical_NEON( // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 2) { - FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else if (width == 4) { - FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else { - FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps + 2); } } @@ -1274,10 +1266,10 @@ void ConvolveCompoundVertical_NEON( if (filter_index == 0) { // 6 tap. if (width == 4) { - FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1287,26 +1279,26 @@ void ConvolveCompoundVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. 
if (width == 4) { - FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 4) { - FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps); } else { - FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 4) { - FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 3); } else { - FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 3); } } else { @@ -1323,10 +1315,10 @@ void ConvolveCompoundVertical_NEON( // treating it as though it has 4. 
if (filter_index == 1) src += src_stride; if (width == 4) { - FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 2); } else { - FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 2); } } @@ -1980,7 +1972,7 @@ inline void ConvolveKernelHorizontal2Tap( PermuteSrcVals(src_bytes, src_lookup[1])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2034,13 +2026,12 @@ inline void ConvolveKernelHorizontal2Tap( const uint16x4_t src_high[2] = {vget_high_u16(src[0]), vget_high_u16(src[1])}; - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. 
src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2123,7 +2114,7 @@ inline void ConvolveKernelHorizontalPositive4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2202,7 +2193,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2297,13 +2288,12 @@ inline void ConvolveKernelHorizontalSigned6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. 
src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2401,13 +2391,12 @@ inline void ConvolveKernelHorizontalMixed6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2505,13 +2494,12 @@ inline void ConvolveKernelHorizontalSigned8Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. 
src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc index 7d287c8..6087276 100644 --- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc +++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc @@ -36,44 +36,48 @@ constexpr int kInterPostRoundBit = 4; namespace low_bitdepth { namespace { -inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, +inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0, const int16x8_t pred1, - const int16x4_t weights[2]) { - // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. - const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0)); - const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0)); - const int32x4_t blended_lo = - vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1)); - const int32x4_t blended_hi = - vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1)); - - return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4), - vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4)); + const int16x8_t weight) { + // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0 + // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >> + // 8(=kInterPostRoundBit + 4) + // The formula is manipulated to avoid lengthening to 32 bits. + // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1 + // = (p0 - p1) * w0 + 16 * p1 + // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808. 
+ const int16x8_t diff = vsubq_s16(pred0, pred1); + // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4) + const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight); + // ((p0 - p1) * w0 >> 4) + p1 + const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1); + // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4 + return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit); } -template <int width, int height> +template <int width> inline void DistanceWeightedBlendSmall_NEON( const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2], - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT prediction_1, const int height, + const int16x8_t weight, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); constexpr int step = 16 / width; - for (int y = 0; y < height; y += step) { + int y = height; + do { const int16x8_t src_00 = vld1q_s16(prediction_0); const int16x8_t src_10 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; - const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights); + const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight); const int16x8_t src_01 = vld1q_s16(prediction_0); const int16x8_t src_11 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; - const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights); + const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight); - const uint8x8_t result0 = vqmovun_s16(res0); - const uint8x8_t result1 = vqmovun_s16(res1); if (width == 4) { StoreLo4(dst, result0); dst += dest_stride; @@ -90,12 +94,13 @@ inline void DistanceWeightedBlendSmall_NEON( vst1_u8(dst, result1); dst += dest_stride; } - } + y -= step; + } while (y != 0); } inline void DistanceWeightedBlendLarge_NEON( const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT 
prediction_1, const int16x4_t weights[2], + const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); @@ -106,16 +111,15 @@ inline void DistanceWeightedBlendLarge_NEON( do { const int16x8_t src0_lo = vld1q_s16(prediction_0 + x); const int16x8_t src1_lo = vld1q_s16(prediction_1 + x); - const int16x8_t res_lo = - ComputeWeightedAverage8(src0_lo, src1_lo, weights); + const uint8x8_t res_lo = + ComputeWeightedAverage8(src0_lo, src1_lo, weight); const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8); const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8); - const int16x8_t res_hi = - ComputeWeightedAverage8(src0_hi, src1_hi, weights); + const uint8x8_t res_hi = + ComputeWeightedAverage8(src0_hi, src1_hi, weight); - const uint8x16_t result = - vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi)); + const uint8x16_t result = vcombine_u8(res_lo, res_hi); vst1q_u8(dst + x, result); x += 16; } while (x < width); @@ -128,52 +132,25 @@ inline void DistanceWeightedBlendLarge_NEON( inline void DistanceWeightedBlend_NEON( const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, const int height, + const uint8_t /*weight_1*/, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)}; - // TODO(johannkoenig): Investigate the branching. May be fine to call with a - // variable height. + // Upscale the weight for vqdmulh. 
+ const int16x8_t weight = vdupq_n_s16(weight_0 << 11); if (width == 4) { - if (height == 4) { - DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest, - dest_stride); - } else if (height == 8) { - DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest, - dest_stride); - } else { - assert(height == 16); - DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest, - dest_stride); - } + DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest, + dest_stride); return; } if (width == 8) { - switch (height) { - case 4: - DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest, - dest_stride); - return; - case 8: - DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest, - dest_stride); - return; - case 16: - DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest, - dest_stride); - return; - default: - assert(height == 32); - DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest, - dest_stride); - - return; - } + DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest, + dest_stride); + return; } - DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest, + DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest, dest_stride); } diff --git a/libgav1/src/dsp/arm/film_grain_neon.cc b/libgav1/src/dsp/arm/film_grain_neon.cc index 0b1b481..76e1151 100644 --- a/libgav1/src/dsp/arm/film_grain_neon.cc +++ b/libgav1/src/dsp/arm/film_grain_neon.cc @@ -18,23 +18,21 @@ #if LIBGAV1_ENABLE_NEON #include <arm_neon.h> -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> #include <cstring> -#include <new> #include "src/dsp/arm/common_neon.h" -#include "src/dsp/arm/film_grain_neon.h" -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" +#include "src/utils/array_2d.h" #include "src/utils/common.h" #include 
"src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" #include "src/utils/memory.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -52,10 +50,8 @@ inline int16x8_t GetSignedSource8(const uint8_t* src) { return ZeroExtend(vld1_u8(src)); } -inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) { - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. - return ZeroExtend(Load1MsanU8(src, 0)); +inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) { + return ZeroExtend(Load1MsanU8(src, 8 - valid_range)); } inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) { @@ -69,11 +65,8 @@ inline int16x8_t GetSignedSource8(const uint16_t* src) { return vreinterpretq_s16_u16(vld1q_u16(src)); } -inline int16x8_t GetSignedSource8Msan(const uint16_t* src, - int /*valid_range*/) { - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. - return vreinterpretq_s16_u16(Load1QMsanU16(src, 0)); +inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) { + return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range)); } inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) { @@ -198,17 +191,13 @@ inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) { } inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma, - int subsampling_x, int /*valid_range*/) { + int subsampling_x, int valid_range) { if (subsampling_x != 0) { - // TODO(b/194217060): restore |valid_range| usage after correcting call - // sites causing test vector failures. - const uint8x16_t src = Load1QMsanU8(luma, 0); - + const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range); + // MemorySanitizer registers vpaddlq_u8 as a use of the memory. 
return vrshrq_n_u16(vpaddlq_u8(src), 1); } - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. - return vmovl_u8(Load1MsanU8(luma, 0)); + return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range); } #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -252,16 +241,13 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma, } inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma, - int subsampling_x, int /*valid_range*/) { + int subsampling_x, int valid_range) { if (subsampling_x != 0) { - // TODO(b/194217060): restore |valid_range| usage after correcting call - // sites causing test vector failures. - const uint16x8x2_t src = Load2QMsanU16(luma, 0); - return vrhaddq_u16(src.val[0], src.val[1]); + const uint16x8x2_t src = vld2q_u16(luma); + const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]); + return MaskOverreadsQ(result, 16 - valid_range); } - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. 
- return Load1QMsanU16(luma, 0); + return Load1QMsanU16(luma, 16 - valid_range); } #endif // LIBGAV1_MAX_BITDEPTH >= 10 @@ -614,8 +600,7 @@ void InitializeScalingLookupTable_NEON(int num_points, } static_assert(sizeof(scaling_lut[0]) == 2, ""); Memset(scaling_lut, point_scaling[0], - std::max(static_cast<int>(point_value[0]), 1) - << (bitdepth - kBitdepth8)); + (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8)); const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000)); const int32x4_t rounding = vdupq_n_s32(32768); for (int i = 0; i < num_points - 1; ++i) { @@ -666,7 +651,7 @@ void InitializeScalingLookupTable_NEON(int num_points, const int16x8x4_t result = { start, vaddq_s16(start, vrshrq_n_s16(delta, 2)), vaddq_s16(start, delta2), vaddq_s16(start, delta3)}; - vst4q_s16(&scaling_lut[x_base], result); + Store4QMsanS16(&scaling_lut[x_base], result); } else { vst1q_s16(&scaling_lut[x_base], full_interp); } @@ -696,13 +681,29 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low, } template <int bitdepth, typename Pixel> -inline int16x8_t GetScalingFactors( - const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { +inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], + const Pixel* source) { int16_t start_vals[8]; static_assert(bitdepth <= kBitdepth10, "NEON Film Grain is not yet implemented for 12bpp."); +#if LIBGAV1_MSAN + memset(start_vals, 0, sizeof(start_vals)); +#endif for (int i = 0; i < 8; ++i) { - assert(source[i] < kScalingLookupTableSize << (bitdepth - 2)); + assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); + start_vals[i] = scaling_lut[source[i]]; + } + return vld1q_s16(start_vals); +} + +template <int bitdepth, typename Pixel> +inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], + const Pixel* source, const int valid_range) { + int16_t start_vals[8]; + static_assert(bitdepth <= kBitdepth10, + "NEON Film Grain is not yet implemented for 12bpp."); + 
for (int i = 0; i < valid_range; ++i) { + assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); start_vals[i] = scaling_lut[source[i]]; } return vld1q_s16(start_vals); @@ -743,10 +744,11 @@ void BlendNoiseWithImageLuma_NEON( const int16x8_t scaling_shift_vect = vdupq_n_s16( (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift); + const int safe_width = width & ~15; int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_width; x += 8) { // This operation on the unsigned input is safe in 8bpp because the vector // is widened before it is reinterpreted. const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]); @@ -767,8 +769,8 @@ void BlendNoiseWithImageLuma_NEON( // This operation on the unsigned input is safe in 8bpp because the vector // is widened before it is reinterpreted. const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]); - const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>( - scaling_lut_y, &in_y_row[std::min(x, width)]); + const int16x8_t scaling1 = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect); @@ -778,8 +780,41 @@ void BlendNoiseWithImageLuma_NEON( // function for just that case, though the gain would be very small. 
StoreUnsigned8(&out_y_row[x], vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling))); - x += 8; - } while (x < width); + } + + if (x < width) { + assert(width - x < 16); + if (x < width - 8) { + const int16x8_t orig = GetSignedSource8(&in_y_row[x]); + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); + int16x8_t noise = + GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); + const int16x8_t combined = vaddq_s16(orig, noise); + // In 8bpp, when params_.clip_to_restricted_range == false, we can + // replace clipping with vqmovun_s16, but it's not likely to be worth + // copying the function for just that case, though the gain would be + // very small. + StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined, floor, ceiling))); + x += 8; + } + const int valid_range_pixels = width - x; + const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]); + const int16x8_t orig = + GetSignedSource8Msan(&in_y_row[x], valid_range_bytes); + const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>( + scaling_lut_y, &in_y_row[x], valid_range_pixels); + int16x8_t noise = + GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); + + const int16x8_t combined = vaddq_s16(orig, noise); + StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined, floor, ceiling))); + } in_y_row += source_stride_y; out_y_row += dest_stride_y; } while (++y < height); @@ -787,13 +822,9 @@ void BlendNoiseWithImageLuma_NEON( template <int bitdepth, typename GrainType, typename Pixel> inline int16x8_t BlendChromaValsWithCfl( - const Pixel* LIBGAV1_RESTRICT average_luma_buffer, - const int16_t* LIBGAV1_RESTRICT scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor, const GrainType* LIBGAV1_RESTRICT noise_image_cursor, - const int16x8_t scaling_shift_vect) { - const int16x8_t 
scaling = - GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); + const int16x8_t scaling, const int16x8_t scaling_shift_vect) { const int16x8_t orig = GetSignedSource8(chroma_cursor); int16x8_t noise = GetSignedSource8(noise_image_cursor); noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); @@ -812,7 +843,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( const int16x8_t floor = vdupq_n_s16(min_value); const int16x8_t ceiling = vdupq_n_s16(max_chroma); Pixel luma_buffer[16]; - memset(luma_buffer, 0, sizeof(luma_buffer)); // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe // for 16 bit signed integers. In higher bitdepths, however, we have to // expand to 32 to protect the sign bit. @@ -831,40 +861,45 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const uint16x8_t average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); StoreUnsigned8(average_luma_buffer, average_luma); + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); const int16x8_t blended = BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( - average_luma_buffer, scaling_lut, &in_chroma_row[x], - &(noise_image[y + start_height][x]), scaling_shift_vect); + &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling, + scaling_shift_vect); // In 8bpp, when params_.clip_to_restricted_range == false, we can replace // clipping with vqmovun_s16, but it's not likely to be worth copying the // function for just that case. 
StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; + const int valid_range_chroma_pixels = chroma_width - x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; const uint16x8_t average_luma = GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1); StoreUnsigned8(average_luma_buffer, average_luma); + const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>( + scaling_lut, average_luma_buffer, valid_range_chroma_pixels); const int16x8_t blended = BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( - average_luma_buffer, scaling_lut, &in_chroma_row[x], - &(noise_image[y + start_height][x]), scaling_shift_vect); + &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling, + scaling_shift_vect); // In 8bpp, when params_.clip_to_restricted_range == false, we can replace // clipping with vqmovun_s16, but it's not likely to be worth copying the // function for just that case. 
@@ -915,7 +950,8 @@ inline int16x8_t BlendChromaValsNoCfl( const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, const int8_t* LIBGAV1_RESTRICT noise_image_cursor, const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, - const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) { + const int16x8_t& offset, int luma_multiplier, int chroma_multiplier, + bool restrict_scaling_lookup, int valid_range_pixels = 0) { uint8_t merged_buffer[8]; const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier); const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier); @@ -925,8 +961,12 @@ inline int16x8_t BlendChromaValsNoCfl( // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required. const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4); vst1_u8(merged_buffer, merged); + const int16x8_t scaling = - GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); + restrict_scaling_lookup + ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer, + valid_range_pixels) + : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); int16x8_t noise = GetSignedSource8(noise_image_cursor); noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect); return vaddq_s16(orig, noise); @@ -952,34 +992,28 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( const int chroma_width = (width + subsampling_x) >> subsampling_x; const int safe_chroma_width = chroma_width & ~7; uint8_t luma_buffer[16]; -#if LIBGAV1_MSAN - // Quiet msan warnings. 
- memset(luma_buffer, 0, sizeof(luma_buffer)); -#endif const int16x8_t offset = vdupq_n_s16(chroma_offset << 5); start_height >>= subsampling_y; int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; - const int valid_range = width - luma_x; + const int valid_range_chroma_pixels = chroma_width - x; const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]); - const int16x8_t average_luma = vreinterpretq_s16_u16( - GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range)); + const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( + &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/false); // In 8bpp, when params_.clip_to_restricted_range == false, we can // replace clipping with vqmovun_s16, but the gain would be small. StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { // Begin right edge iteration. 
Same as the normal iterations, but the @@ -988,19 +1022,20 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; - const int valid_range_chroma_bytes = - (chroma_width - x) * sizeof(in_chroma_row[0]); + const int valid_range_chroma_pixels = chroma_width - x; const int16x8_t orig_chroma = - GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); + GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels); const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/true, + valid_range_chroma_pixels); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); // End of right edge iteration. 
@@ -1267,7 +1302,8 @@ inline int16x8_t BlendChromaValsNoCfl( const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, const int16_t* LIBGAV1_RESTRICT noise_image_cursor, const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, - const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) { + const int32x4_t& offset, int luma_multiplier, int chroma_multiplier, + bool restrict_scaling_lookup, int valid_range_pixels = 0) { uint16_t merged_buffer[8]; const int32x4_t weighted_luma_low = vmull_n_s16(vget_low_s16(average_luma), luma_multiplier); @@ -1287,7 +1323,11 @@ inline int16x8_t BlendChromaValsNoCfl( vst1q_u16(merged_buffer, vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel)); const int16x8_t scaling = - GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer); + restrict_scaling_lookup + ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer, + valid_range_pixels) + : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, + merged_buffer); const int16x8_t noise = GetSignedSource8(noise_image_cursor); const int16x8_t scaled_noise = ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect); @@ -1311,11 +1351,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int chroma_width = (width + subsampling_x) >> subsampling_x; const int safe_chroma_width = chroma_width & ~7; uint16_t luma_buffer[16]; -#if LIBGAV1_MSAN - // TODO(b/194217060): This can be removed if the range calculations below are - // fixed. - memset(luma_buffer, 0, sizeof(luma_buffer)); -#endif // Offset is added before downshifting in order to take advantage of // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp. 
const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2)); @@ -1324,7 +1359,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const int16x8_t average_luma = vreinterpretq_s16_u16( GetAverageLuma(&in_y_row[luma_x], subsampling_x)); @@ -1332,12 +1367,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/false); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { // Begin right edge iteration. Same as the normal iterations, but the @@ -1346,19 +1379,22 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; + const int valid_range_chroma_pixels = chroma_width - x; const int valid_range_chroma_bytes = (chroma_width - x) * sizeof(in_chroma_row[0]); const int16x8_t orig_chroma = GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, 
/*restrict_scaling_lookup=*/true, + valid_range_chroma_pixels); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); // End of right edge iteration. @@ -1442,10 +1478,8 @@ void Init10bpp() { dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON<kBitdepth10>; - // TODO(b/194442742): reenable this function after segfault under armv7 ASan - // is fixed. - // dsp->film_grain.blend_noise_luma = - // BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>; + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>; dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON; dsp->film_grain.blend_noise_chroma[1] = BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>; diff --git a/libgav1/src/dsp/arm/film_grain_neon.h b/libgav1/src/dsp/arm/film_grain_neon.h index 3ba2eef..09596e2 100644 --- a/libgav1/src/dsp/arm/film_grain_neon.h +++ b/libgav1/src/dsp/arm/film_grain_neon.h @@ -39,9 +39,7 @@ void FilmGrainInit_NEON(); #define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON #define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON -// TODO(b/194442742): reenable this function after segfault under armv7 ASan is -// fixed. 
-// #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON diff --git a/libgav1/src/dsp/arm/intrapred_directional_neon.cc b/libgav1/src/dsp/arm/intrapred_directional_neon.cc index 3cad4a6..e9bdcf0 100644 --- a/libgav1/src/dsp/arm/intrapred_directional_neon.cc +++ b/libgav1/src/dsp/arm/intrapred_directional_neon.cc @@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH( } while (++y < height); } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - -// 7.11.2.4 (8) 90 < angle > 180 -// The strategy for these functions (4xH and 8+xH) is to know how many blocks -// can be processed with just pixels from |top_ptr|, then handle mixed blocks, -// then handle only blocks that take from |left_ptr|. Additionally, a fast -// index-shuffle approach is used for pred values from |left_column| in sections -// that permit it. +// 7.11.2.4 (8) 90 < angle > 180 +// The strategy for these functions (4xH and 8+xH) is to know how many blocks +// can be processed with just pixels from |top_ptr|, then handle mixed blocks, +// then handle only blocks that take from |left_ptr|. Additionally, a fast +// index-shuffle approach is used for pred values from |left_column| in +// sections that permit it. 
inline void DirectionalZone2_4xH( uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, @@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH( assert(xstep >= 3); const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4); - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - // TODO(johannkoenig): Revisit this for |width| == 4. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH( // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. if (min_top_only_x > 0) { - // Round down to the nearest multiple of 8. - // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep, upsampled_top); @@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH( // All rows from |min_left_only_y| down for this set of columns only need // |left_column| to compute. const int min_left_only_y = std::min((4 << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. 
- const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); int xstep_bounds = xstep_bounds_base + xstep_y; int top_x = -xstep - xstep_y; // +8 increment is OK because if height is 4 this only goes once. - for (; y < left_shuffle_stop_y; + for (; y < min_left_only_y; y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { DirectionalZone2FromLeftCol_WxH<4>( dst, stride, min_height, @@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH( upsample_top_shift); } - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<4>( - dst, stride, min_height, - left_column + ((y - left_base_increment) << upsample_left_shift), - base_left_y, -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row, - xstep_bounds, top_x, xstep, - upsample_top_shift); - } // Loop over y for left_only rows. + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); for (; y < height; y += 8, dst += stride8) { DirectionalZone3_WxH<4>( dst, stride, min_height, @@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH( } } -// Process a multiple of 8 |width|. 
-inline void DirectionalZone2_8( +template <bool shuffle_left_column> +inline void DirectionalZone2_8xH( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, - const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, - const int height, const int xstep, const int ystep, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y, const bool upsampled_top, const bool upsampled_left) { const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); - // Helper vector. - const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop incrementers for moving by block (8x8). This function handles blocks // with height 4 as well. They are calculated in one pass so these variables // do not get used. const ptrdiff_t stride8 = stride << 3; const int xstep8 = xstep << 3; - const int ystep8 = ystep << 3; - // Process Wx4 blocks. + // Cover 8x4 case. const int min_height = (height == 4) ? 4 : 8; - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute and can therefore call the Zone1 functions. This assumes |xstep| is - // at least 3. - assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x; + // Round down to the nearest multiple of 8 (or 4, if height is 4). 
+ const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); + DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep, + upsampled_top); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsample_left_shift); + } else { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } + + DirectionalZone1Blend_WxH<8>( + dst_x, stride, min_height, top_row + (x << upsample_top_shift), + xstep_bounds, top_x, xstep, upsample_top_shift); + } + + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } +} + +// Process a multiple of 8 |width|. 
+inline void DirectionalZone2_WxH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep, + const bool upsampled_top, const bool upsampled_left) { + const int ystep8 = ystep << 3; // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -677,90 +696,43 @@ inline void DirectionalZone2_8( // left_y vector omits the portion which is covered under the left_column // offset. Following values need the full ystep as a relative offset. const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); + const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); + // For ystep > 90, at least two sets of 8 columns can be fully computed from + // top_row only. + const int min_top_only_x = std::min((height * xstep) >> 6, width); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); + // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. int x = 0; - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. 
|d| represents the number of pixels that can - // fit in one contiguous vector when stepping by |ystep|. For a given x - // position, the left column values can be obtained by VTBL as long as the - // values at row[x + d] and beyond come from the top row. However, this does - // not guarantee that the vector will also contain all of the values needed - // from top row. - const int d = 16 / ((ystep >> 6) + 1); + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height, + xstep, ystep, x, left_offset, xstep_bounds_base, + left_y, upsampled_top, upsampled_left); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - const int max_shuffle_height = - std::min(((x + d) << 6) / xstep, height) & ~7; - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep, - upsampled_top); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. 
- const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - } + DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep, + ystep, x, left_offset, xstep_bounds_base, left_y, + upsampled_top, upsampled_left); } - // TODO(johannkoenig): May be able to remove this branch. 
if (x < width) { + const int upsample_top_shift = static_cast<int>(upsampled_top); DirectionalZone1_WxH(dst + x, stride, width - x, height, top_row + (x << upsample_top_shift), -xstep, upsampled_top); @@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON( DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep, upsampled_top, upsampled_left); } else { - DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep, - ystep, upsampled_top, upsampled_left); + DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep, + ystep, upsampled_top, upsampled_left); } } @@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, return vrshrq_n_u16(sum, 5 /*log2(32)*/); } +// Blend two values based on weights that sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16x8_t a_weight, + const uint16x8_t b_weight) { + const uint16x8_t a_product = vmulq_u16(a, a_weight); + const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + // Each element of |dest| contains values associated with one weight value. inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* LIBGAV1_RESTRICT const source, @@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest, } } +// For Wx4 blocks, load the source for 2 columns. The source for the second +// column is held in the high half of each vector. 
+inline void LoadEdgeVals2x4(uint16x8x2_t* dest, + const uint16_t* LIBGAV1_RESTRICT const source_low, + const uint16_t* LIBGAV1_RESTRICT const source_high, + const bool upsampled) { + if (upsampled) { + const uint16x4x2_t low = vld2_u16(source_low); + const uint16x4x2_t high = vld2_u16(source_high); + dest->val[0] = vcombine_u16(low.val[0], high.val[0]); + dest->val[1] = vcombine_u16(low.val[1], high.val[1]); + } else { + dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high)); + dest->val[1] = + vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1)); + } +} + template <bool upsampled> inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, @@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst, } template <bool upsampled> +inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const uint16x8_t inverter = vdupq_n_u16(32); + + uint16x8x2_t sampled_left_col; + // Compute two columns at a time, then transpose for storage. + uint16x8_t result[4]; + + // The low half of pre-transpose vectors contains columns 0 through 3. + int left_y_low = base_left_y + ystep; + int left_offset_low = left_y_low >> index_scale_bits; + int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + // The high half of pre-transpose vectors contains columns 4 through 7. 
+ int left_y_high = left_y_low + (ystep << 2); + int left_offset_high = left_y_high >> index_scale_bits; + int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + uint16x8_t weights_0 = + vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + uint16x8_t weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_high += ystep; + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = 
(LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + Transpose4x8(result); + Store8(dst, result[0]); + dst += stride; + Store8(dst, result[1]); + dst += stride; + Store8(dst, result[2]); + dst += stride; + Store8(dst, result[3]); +} + +template <bool upsampled> +inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. + uint16x8_t result[4]; + + int left_y = base_left_y + ystep; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + uint16x8x2_t sampled_left_col; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y 
+= ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose4x8(result); + Store4(dst, vget_low_u16(result[0])); + dst += stride; + Store4(dst, vget_low_u16(result[1])); + dst += stride; + Store4(dst, vget_low_u16(result[2])); + dst += stride; + Store4(dst, vget_low_u16(result[3])); + dst += stride; + Store4(dst, vget_high_u16(result[0])); + dst += stride; + Store4(dst, vget_high_u16(result[1])); + dst += stride; + Store4(dst, vget_high_u16(result[2])); + dst += stride; + Store4(dst, vget_high_u16(result[3])); +} + +template <bool upsampled> inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { + assert(height == 8 || height == 16); const int upsample_shift = static_cast<int>(upsampled); - int y = 0; - do { - DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift), + DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep); + if (height == 16) { + dest += stride << 3; + DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift), ystep); - dest += 4 * stride; - y += 4; - } while (y < height); + } } template <bool upsampled> @@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int width, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { - int x = 0; - int base_left_y = 0; - do { - // TODO(petersonab): Establish 8x4 transpose to reserve this function for - // 8x4 and 16x4. 
- DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep, - base_left_y); - base_left_y += 4 * ystep; - x += 4; - } while (x < width); + assert(width <= 16); + if (width == 4) { + DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep); + return; + } + DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep); + if (width == 16) { + const int base_left_y = ystep << 3; + DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left, + ystep, base_left_y); + } } template <bool upsampled> @@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON( } while (y != 0); return; } - if (width == 4) { + if (height == 4) { if (upsampled_left) { - DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); } else { - DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); } - } else if (height == 4) { + } else if (width == 4) { if (upsampled_left) { - DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); } else { - DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); } } else { if (upsampled_left) { @@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, return vrshr_n_u16(sum, 5 /*log2(32)*/); } -// Blend two values based on weight pairs that each sum to 32. -inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, - const uint16x8_t a_weight, - const uint16x8_t b_weight) { - const uint16x8_t a_product = vmulq_u16(a, a_weight); - const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); - - return vrshrq_n_u16(sum, 5 /*log2(32)*/); -} - // Because the source values "move backwards" as the row index increases, the // indices derived from ystep are generally negative in localized functions. 
// This is accommodated by making sure the relative indices are within [-15, 0] @@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH( } while (++y < height); } -inline void DirectionalZone2FromLeftCol_8xH( - uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, +inline void DirectionalZone2FromLeftCol_8x8( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, const bool upsampled) { const int upsample_shift = static_cast<int>(upsampled); @@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH( vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1)); const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0); - int y = 0; - do { + for (int y = 0; y < 8; ++y) { uint16x8_t src_left, src_right; LoadStepwise( left_column - kPositiveIndexOffsetPixels + (y << upsample_shift), @@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH( Store8(dst, val); dst += stride; - } while (++y < height); + } } template <bool upsampled> @@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH( } template <bool upsampled> -inline void DirectionalZone1Blend_8xH( - uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, +inline void DirectionalZone1Blend_8x8( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, const int xstep) { const int upsample_shift = static_cast<int>(upsampled); @@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH( const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7}; uint16x8x2_t top_vals; - int y = height; - do { + for (int y = 0; y < 8; ++y) { const uint16_t* const src = top_row + (top_x >> scale_bits_x); LoadEdgeVals(&top_vals, src, upsampled); @@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH( dest += stride; zone_bounds += xstep; top_x -= xstep; - } while (--y != 0); + } } -// The height at which a load of 16 bytes 
will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices -// that do not correspond to angle derivatives are left at zero. -// Notably, in cases with upsampling, the shuffle-invalid height is always -// greater than the prediction height (which is 8 at maximum). -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - // 7.11.2.4 (8) 90 < angle > 180 // The strategy for these functions (4xH and 8+xH) is to know how many blocks // can be processed with just pixels from |top_ptr|, then handle mixed blocks, @@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH( // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. - // Round down to the nearest multiple of 8. - // TODO(petersonab): Check if rounding to the nearest 4 is okay. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst), stride >> 1, max_top_only_y, top_row, -xstep); @@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH( xstep_bounds, top_x, xstep); } - // Loop over y for left-only rows. - for (; y < height; y += 8, dst += stride8) { - // Angle expected by Zone3 is flipped about the 180 degree vector, which - // is the x-axis. + // Left-only section. 
|height| - |y| is assumed equivalent to: + // (y == 0) && (height == 4) + if (height - y == 4) { + DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep); + return; + } + if (y < height) { DirectionalZone3_4xH<upsampled_left>( - dst, stride, min_height, left_column + (y << upsample_left_shift), + dst, stride, height - y, left_column + (y << upsample_left_shift), -ystep); } } @@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4( } } +template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left> +inline void DirectionalZone2_8xH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top_row, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y) { + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + + // Loop incrementers for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x * sizeof(uint16_t); + // Round down to the nearest multiple of 8. 
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; + DirectionalZone1_WxH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_8x8( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsampled_left); + } else { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } + + DirectionalZone1Blend_8x8<upsampled_top>( + dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x, + xstep); + } + + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } +} + // Process a multiple of 8 |width|. 
template <bool upsampled_top, bool upsampled_left> -inline void DirectionalZone2_8( +inline void DirectionalZone2_NEON( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, const uint16_t* LIBGAV1_RESTRICT const left_column, const int width, @@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8( dst, stride, top_row, left_column, width, xstep, ystep); return; } - const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); // Helper vector. const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop increments for moving by block (8x8). This function handles blocks - // with height 4 as well. They are calculated in one pass so these variables - // do not get used. - const ptrdiff_t stride8 = stride << 3; - const int xstep8 = xstep << 3; const int ystep8 = ystep << 3; // All columns from |min_top_only_x| to the right will only need |top_row| to // compute and can therefore call the Zone1 functions. This assumes |xstep| is // at least 3. assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); + const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 
0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8( int16x8_t left_y = vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep); - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. int x = 0; + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x * sizeof(uint16_t); - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<upsampled_top>( - reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. 
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_8xH( - dst_x, stride, 8, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - } + DirectionalZone2_8xH<true, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); } // Reached |min_top_only_x|. 
if (x < width) { @@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON( } if (upsampled_top) { if (upsampled_left) { - DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } else if (upsampled_left) { - DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } diff --git a/libgav1/src/dsp/arm/intrapred_neon.cc b/libgav1/src/dsp/arm/intrapred_neon.cc index cd47a22..d1adbdf 100644 --- a/libgav1/src/dsp/arm/intrapred_neon.cc +++ b/libgav1/src/dsp/arm/intrapred_neon.cc @@ -407,13 +407,9 @@ inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist, const uint16x8_t top_left_dist_low, const uint16x8_t top_left_dist_high) { - // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of - // using movl(x_dist). - const uint8x8_t x_le_top_left_low = - vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low)); - const uint8x8_t x_le_top_left_high = - vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high)); - return vcombine_u8(x_le_top_left_low, x_le_top_left_high); + const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low), + vqmovn_u16(top_left_dist_high)); + return vcleq_u8(x_dist, top_left_dist); } // Select the closest values and collect them. 
diff --git a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc index bcda131..d6c1450 100644 --- a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc +++ b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc @@ -31,7 +31,6 @@ namespace libgav1 { namespace dsp { - namespace low_bitdepth { namespace { @@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; -inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, - const uint16x4_t weighted_left, - const uint16x4_t weighted_bl, - const uint16x4_t weighted_tr) { - const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left); - const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr); - const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1); - return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1); +// 256 - v = vneg_s8(v) +inline uint8x8_t NegateS8(const uint8x8_t v) { + return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v))); } template <int height> -inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 4; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t 
weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v))); - const uint16x4_t weighted_bl = - vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v)); - - const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v)); - const uint16x4_t weighted_left = - vget_low_u16(vmull_u8(weights_x_v, left_v)); - const uint16x4_t weighted_tr = - vget_low_u16(vmull_u8(scaled_weights_x, top_right_v)); - const uint16x4_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); - - StoreLo4(dst, vmovn_u16(vcombine_u16(result, result))); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_bl, weights_y_v, top_v); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x_v, left_v); + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale); + + StoreLo4(dst, result); dst += stride; } } -inline uint8x8_t CalculatePred(const uint16x8_t weighted_top, - const uint16x8_t weighted_left, - const uint16x8_t weighted_bl, - const uint16x8_t weighted_tr) { - // Maximum value: 0xFF00 - const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl); - // Maximum value: 0xFF00 - const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr); - const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1); - return vrshrn_n_u16(pred_2, kSmoothWeightScale); +inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl, + const uint16x8_t weighted_left_tr) { + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, 
weighted_left_tr); + return vrshrn_n_u16(avg, kSmoothWeightScale); +} + +inline uint8x8_t CalculateWeightsAndPred( + const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr, + const uint8x8_t bottom_left, const uint8x8_t weights_x, + const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) { + const uint16x8_t weighted_top = vmull_u8(weights_y, top); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left); + const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left); + return CalculatePred(weighted_top_bl, weighted_left_tr); } template <int height> -inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 8; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - - const uint16x8_t weighted_top = 
vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint8x8_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); + CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v, + weights_x_v, scaled_weights_y, weights_y_v); vst1_u8(dst, result); dst += stride; @@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred( const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, const uint8x8_t weights_y, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); + const uint16x8_t weighted_top_bl_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint8x8_t result_low = CalculatePred( - weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t result_low = + CalculatePred(weighted_top_bl_low, weighted_left_tr_low); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); + const uint16x8_t weighted_top_bl_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint8x8_t result_high = CalculatePred( - weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); + const 
uint8x8_t result_high = + CalculatePred(weighted_top_bl_high, weighted_left_tr_high); return vcombine_u8(result_low, result_high); } +// 256 - v = vneg_s8(v) +inline uint8x16_t NegateS8(const uint8x16_t v) { + return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v))); +} + template <int width, int height> -inline void Smooth16PlusxN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; @@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); - // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop. - // This currently has a performance slope similar to Paeth so it does not - // appear to be register bound for arm64. 
uint8x16_t weights_x_v[4]; weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4); if (width > 16) { @@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0]))); + scaled_weights_x[0] = NegateS8(weights_x_v[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1]))); + scaled_weights_x[1] = NegateS8(weights_x_v[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3]))); + scaled_weights_x[2] = NegateS8(weights_x_v[2]); + scaled_weights_x[3] = NegateS8(weights_x_v[3]); } } for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v, @@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON( } template <int width, int height> -inline void SmoothVertical4Or8xN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t bottom_left = left[height - 1]; @@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON( for (int y = 0; y < height; ++y) { const uint8x8_t weights_y_v = 
vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON( inline uint8x16_t CalculateVerticalWeightsAndPred( const uint8x16_t top, const uint8x8_t weights_y, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); - const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl); - const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl); + const uint16x8_t pred_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); + const uint16x8_t pred_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); const uint8x8_t pred_scaled_high = vrshrn_n_u16(pred_high, kSmoothWeightScale); @@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred( } template <int width, int height> -inline void SmoothVertical16PlusxN_NEON( +void SmoothVertical16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON( for (int y = 0; y < 
height; ++y) { const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); const uint8x16_t pred_0 = @@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON( } template <int width, int height> -inline void SmoothHorizontal4Or8xN_NEON( +void SmoothHorizontal4Or8xN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); // Over-reads for 4xN but still within the array. const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); - - const uint16x8_t weighted_left = vmull_u8(weights_x, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); - const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x, left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred( const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x) { const 
uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low); - const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t pred_scaled_low = + vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); const uint8x8_t pred_scaled_high = - vrshrn_n_u16(pred_high, kSmoothWeightScale); + vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale); return vcombine_u8(pred_scaled_low, pred_scaled_high); } template <int width, int height> -inline void SmoothHorizontal16PlusxN_NEON( +void SmoothHorizontal16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0]))); + scaled_weights_x[0] = NegateS8(weights_x[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1]))); + scaled_weights_x[1] = NegateS8(weights_x[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3]))); + scaled_weights_x[2] = 
NegateS8(weights_x[2]); + scaled_weights_x[3] = NegateS8(weights_x[3]); } } @@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; +// 256 - v = vneg_s8(v) +inline uint16x4_t NegateS8(const uint16x4_t v) { + return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); +} + template <int height> -inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t top_v = vld1_u16(top); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v); - - // Weighted top right doesn't change with each row. + const uint16x4_t scaled_weights_x = NegateS8(weights_x_v); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // Common code between 8xH and [16|32|64]xH. 
inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, - const uint32x4_t& weighted_corners_low, - const uint32x4_t& weighted_corners_high, - const uint16x4x2_t& top_vals, - const uint16x4x2_t& weights_x, const uint16_t left_y, + const uint32x4_t weighted_corners_low, + const uint32x4_t weighted_corners_high, + const uint16x4x2_t top_vals, + const uint16x4x2_t weights_x, const uint16_t left_y, const uint16_t weight_y) { // Each variable in the running summation is named for the last item to be // accumulated. @@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, } template <int height> -inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4), vld1_u16(kSmoothWeights + 8)}; - // Weighted top right doesn't change with each row. const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. 
const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_corners_low = @@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // For width 16 and above. template <int width, int height> -inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; @@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); // Precompute weighted values that don't vary with |y|. uint32x4_t weighted_tr_low[width >> 3]; uint32x4_t weighted_tr_high[width >> 3]; for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right); const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right); } const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. 
const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); auto* dst_x = reinterpret_cast<uint16_t*>(dst); @@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, } template <int height> -inline void SmoothVertical4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON( } template <int height> -inline void SmoothVertical8xH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON( for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON( // For width 16 and above. 
template <int width, int height> -inline void SmoothVerticalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON( const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON( } template <int height> -inline void SmoothHorizontal4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON( auto* dst = static_cast<uint8_t*>(dest); const uint16x4_t weights_x = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x); + const uint16x4_t scaled_weights_x = NegateS8(weights_x); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON( } template <int height> -inline void 
SmoothHorizontal8xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON( vld1_u16(kSmoothWeights + 8)}; const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); @@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON( // For width 16 and above. 
template <int width, int height> -inline void SmoothHorizontalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); - uint16x4_t weights_x_low[width >> 3]; uint16x4_t weights_x_high[width >> 3]; uint32x4_t weighted_tr_low[width >> 3]; @@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON( for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right); weights_x_high[i] = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right); } for (int y = 0; y < height; ++y) { @@ -1141,6 +1113,7 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = SmoothHorizontalWxH_NEON<64, 64>; } + } // namespace } // namespace high_bitdepth #endif // LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc index 617accc..e6f0d9d 100644 --- a/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc +++ b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc @@ -282,9 +282,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, 
bool is_row, const int32x4_t max = vdupq_n_s32((1 << range) - 1); int32x4_t s[4], x[4]; - LoadSrc<4>(dst, step, 0, x); if (is_row) { - Transpose4x4(x, x); + assert(step == 4); + int32x4x4_t y = vld4q_s32(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; + } else { + LoadSrc<4>(dst, step, 0, x); } // stage 1. @@ -301,9 +304,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row, for (auto& i : s) { i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); } - Transpose4x4(s, s); + int32x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = s[i]; + vst4q_s32(dst, y); + } else { + StoreDst<4>(dst, step, 0, s); } - StoreDst<4>(dst, step, 0, s); } template <ButterflyRotationFunc butterfly_rotation, @@ -937,9 +943,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, int32x4_t s[8]; int32x4_t x[4]; - LoadSrc<4>(dst, step, 0, x); if (is_row) { - Transpose4x4(x, x); + assert(step == 4); + int32x4x4_t y = vld4q_s32(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; + } else { + LoadSrc<4>(dst, step, 0, x); } // stage 1. 
@@ -981,9 +990,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift))); x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift))); x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift))); - Transpose4x4(x, x); + int32x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = x[i]; + vst4q_s32(dst, y); + } else { + StoreDst<4>(dst, step, 0, x); } - StoreDst<4>(dst, step, 0, x); } alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344, diff --git a/libgav1/src/dsp/arm/inverse_transform_neon.cc b/libgav1/src/dsp/arm/inverse_transform_neon.cc index 1c2e111..452f14a 100644 --- a/libgav1/src/dsp/arm/inverse_transform_neon.cc +++ b/libgav1/src/dsp/arm/inverse_transform_neon.cc @@ -41,50 +41,6 @@ namespace { //------------------------------------------------------------------------------ -// TODO(slavarnway): Move transpose functions to transpose_neon.h or -// common_neon.h. - -LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4], - int16x8_t out[4]) { - // Swap 16 bit elements. 
Goes from: - // a0: 00 01 02 03 - // a1: 10 11 12 13 - // a2: 20 21 22 23 - // a3: 30 31 32 33 - // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - const int16x4_t a0 = vget_low_s16(in[0]); - const int16x4_t a1 = vget_low_s16(in[1]); - const int16x4_t a2 = vget_low_s16(in[2]); - const int16x4_t a3 = vget_low_s16(in[3]); - - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - - // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 04 14 24 34 - // c0.val[1]: 02 12 22 32 06 16 26 36 - // c1.val[0]: 01 11 21 31 05 15 25 35 - // c1.val[1]: 03 13 23 33 07 17 27 37 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - - const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]); - const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]); - const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]); - const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]); - - out[0] = vcombine_s16(d0, d0); - out[1] = vcombine_s16(d1, d1); - out[2] = vcombine_s16(d2, d2); - out[3] = vcombine_s16(d3, d3); -} - // Note this is only used in the final stage of Dct32/64 and Adst16 as the in // place version causes additional stack usage with clang. 
LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8], @@ -580,16 +536,19 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) { if (stage_is_rectangular) { if (transpose) { - int16x8_t input[8]; - LoadSrc<8, 8>(dst, step, 0, input); - Transpose4x8To8x4(input, x); + assert(step == 4); + int16x8x4_t y = vld4q_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; } else { LoadSrc<16, 4>(dst, step, 0, x); } } else { - LoadSrc<8, 4>(dst, step, 0, x); if (transpose) { - Transpose4x4(x, x); + assert(step == 4); + int16x4x4_t y = vld4_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]); + } else { + LoadSrc<8, 4>(dst, step, 0, x); } } @@ -604,17 +563,20 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) { if (stage_is_rectangular) { if (transpose) { - int16x8_t output[8]; - Transpose8x4To4x8(s, output); - StoreDst<8, 8>(dst, step, 0, output); + int16x8x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = s[i]; + vst4q_s16(dst, y); } else { StoreDst<16, 4>(dst, step, 0, s); } } else { if (transpose) { - Transpose4x4(s, s); + int16x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]); + vst4_s16(dst, y); + } else { + StoreDst<8, 4>(dst, step, 0, s); } - StoreDst<8, 4>(dst, step, 0, s); } } @@ -1204,45 +1166,41 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) { //------------------------------------------------------------------------------ // Asymmetric Discrete Sine Transforms (ADST). 
-template <bool stage_is_rectangular> + LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool transpose) { auto* const dst = static_cast<int16_t*>(dest); - int32x4_t s[8]; - int16x8_t x[4]; + int32x4_t s[7]; + int16x4_t x[4]; - if (stage_is_rectangular) { - if (transpose) { - int16x8_t input[8]; - LoadSrc<8, 8>(dst, step, 0, input); - Transpose4x8To8x4(input, x); - } else { - LoadSrc<16, 4>(dst, step, 0, x); - } + if (transpose) { + assert(step == 4); + int16x4x4_t y = vld4_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; } else { - LoadSrc<8, 4>(dst, step, 0, x); - if (transpose) { - Transpose4x4(x, x); - } + x[0] = vld1_s16(dst); + x[1] = vld1_s16(dst + 1 * step); + x[2] = vld1_s16(dst + 2 * step); + x[3] = vld1_s16(dst + 3 * step); } // stage 1. - s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]); - s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]); + s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]); + s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]); // stage 2. - const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2])); - const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3])); + const int32x4_t a7 = vsubl_s16(x[0], x[2]); + const int32x4_t b7 = vaddw_s16(a7, x[3]); // stage 3. - s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]); - s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]); + s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]); + s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]); // s[0] = s[0] + s[3] - s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]); + s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]); // s[1] = s[1] - s[4] - s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]); + s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]); - s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]); + s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]); s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]); // stage 4. 
@@ -1259,24 +1217,20 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12); const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12); - x[0] = vcombine_s16(dst_0, dst_0); - x[1] = vcombine_s16(dst_1, dst_1); - x[2] = vcombine_s16(dst_2, dst_2); - x[3] = vcombine_s16(dst_3, dst_3); + x[0] = dst_0; + x[1] = dst_1; + x[2] = dst_2; + x[3] = dst_3; - if (stage_is_rectangular) { - if (transpose) { - int16x8_t output[8]; - Transpose8x4To4x8(x, output); - StoreDst<8, 8>(dst, step, 0, output); - } else { - StoreDst<16, 4>(dst, step, 0, x); - } + if (transpose) { + int16x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = x[i]; + vst4_s16(dst, y); } else { - if (transpose) { - Transpose4x4(x, x); - } - StoreDst<8, 4>(dst, step, 0, x); + vst1_s16(dst, x[0]); + vst1_s16(dst + 1 * step, x[1]); + vst1_s16(dst + 2 * step, x[2]); + vst1_s16(dst + 3 * step, x[3]); } } @@ -2705,7 +2659,7 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, int i = adjusted_tx_height; auto* data = src; do { - Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true); + Adst4_NEON(data, /*step=*/4, /*transpose=*/true); data += 16; i -= 4; } while (i != 0); @@ -2732,7 +2686,7 @@ void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, int i = tx_width; auto* data = src; do { - Adst4_NEON<false>(data, tx_width, /*transpose=*/false); + Adst4_NEON(data, tx_width, /*transpose=*/false); data += 4; i -= 4; } while (i != 0); diff --git a/libgav1/src/dsp/arm/loop_filter_10bit_neon.cc b/libgav1/src/dsp/arm/loop_filter_10bit_neon.cc new file mode 100644 index 0000000..a9dd98f --- /dev/null +++ b/libgav1/src/dsp/arm/loop_filter_10bit_neon.cc @@ -0,0 +1,1218 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/loop_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 + +#include <arm_neon.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) +inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) { + const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); + return vorr_u16(vget_low_u16(a), vget_high_u16(a)); +} + +// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh +inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0, + const uint16x4_t q0, const uint16x4_t q1, + const uint16_t outer_thresh) { + const uint16x4_t abd_p0q0 = vabd_u16(p0, q0); + const uint16x4_t abd_p1q1 = vabd_u16(p1, q1); + const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1); + const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1); + const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half); + return vcle_u16(sum, vdup_n_u16(outer_thresh)); +} + +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// OuterThreshold() +inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a)); + return vand_u16(inner_mask, outer_mask); +} 
+ +// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && +// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && +// OuterThreshold() +inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b)); + return vand_u16(inner_mask, outer_mask); +} + +// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh && +// OuterThreshold() +inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16x8_t abd_p2p3_q2q3, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3); + const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c)); + return vand_u16(inner_mask, outer_mask); +} + +// ----------------------------------------------------------------------------- +// FilterNMasks functions. + +inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1, + const uint16_t hev_thresh, const uint16x4_t outer_mask, + const uint16_t inner_thresh, + uint16x4_t* const hev_mask, + uint16x4_t* const needs_filter4_mask) { + const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + // This includes cases where NeedsFilter4() is not true and so Filter2() will + // not be applied. 
+ const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh); + + *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask); + + // Filter2() will only be applied if both NeedsFilter4() and Hev() are true. + *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask); +} + +// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && +// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh +// |flat_thresh| == 4 for 10 bit decode. +inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p0p2_q0q2) { + constexpr int flat_thresh = 1 << 2; + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2); + const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh)); + return vand_u16(vget_low_u16(b), vget_high_u16(b)); +} + +inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, const uint16_t hev_thresh, + const uint16x4_t outer_mask, + const uint16_t inner_thresh, + uint16x4_t* const needs_filter6_mask, + uint16x4_t* const is_flat3_mask, + uint16x4_t* const hev_mask) { + const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); + *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2)); + *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), + inner_thresh, outer_mask); +} + +// IsFlat4 uses N=1, IsFlatOuter4 uses N=4. +// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh && +// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && +// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh +// |flat_thresh| == 4 for 10 bit decode. 
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0, + const uint16x8_t abd_pn1p0_qn1q0, + const uint16x8_t abd_pn2p0_qn2q0) { + constexpr int flat_thresh = 1 << 2; + const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0); + const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0); + const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh)); + return vand_u16(vget_low_u16(c), vget_high_u16(c)); +} + +inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2, + const uint16x8_t p1q1, const uint16x8_t p0q0, + const uint16_t hev_thresh, const uint16x4_t outer_mask, + const uint16_t inner_thresh, + uint16x4_t* const needs_filter8_mask, + uint16x4_t* const is_flat4_mask, + uint16x4_t* const hev_mask) { + const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); + const uint16x4_t is_flat4 = + IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3)); + *needs_filter8_mask = + NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3), + inner_thresh, outer_mask); + // |is_flat4_mask| is used to decide where to use the result of Filter8. + // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false, + // overriding the question of whether to use Filter8. Because Filter4 doesn't + // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the + // source value. To be correct, the mask must account for this override. + *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask); +} + +// ----------------------------------------------------------------------------- +// FilterN functions. + +// Calculate Filter4() or Filter2() based on |hev_mask|. 
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, + const uint16x8_t p1q1, const uint16x4_t hev_mask, + uint16x8_t* const p1q1_result, + uint16x8_t* const p0q0_result) { + const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4); + // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); + // q0mp0 means "q0 minus p0". + const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1)); + const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); + + // If this is for Filter2() then include |p1mq1|. Otherwise zero it. + const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/))); + const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1); + const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); + const int16x4_t p1mq1_saturated = + Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel); + const int16x4_t hev_option = + vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated); + + const int16x4_t a = vadd_s16(q0mp0_3, hev_option); + + // Need to figure out what's going on here because there are some unnecessary + // tricks to accommodate 8x8 as smallest 8bpp vector + + // We can not shift with rounding because the clamp comes *before* the + // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = + // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; + const int16x4_t plus_four = + Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel); + const int16x4_t plus_three = + Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel); + const int16x4_t a1 = vshr_n_s16(plus_four, 3); + const int16x4_t a2 = vshr_n_s16(plus_three, 3); + + // a3 = (a1 + 1) >> 1; + const int16x4_t a3 = vrshr_n_s16(a1, 1); + + const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3)); + const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3); + + // Need to shift the second term or we end up with a2_ma2. 
+ const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1)); + const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1); + *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10); + *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10); +} + +void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + + const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), + vld1_u16(dst_q0), vld1_u16(dst_q1)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); + const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); + Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, + &needs_filter4_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. 
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + // Offset by 2 uint16_t values to load from first p1 position. + auto* dst = static_cast<uint8_t*>(dest) - 4; + auto* dst_p1 = reinterpret_cast<uint16_t*>(dst); + auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride); + auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2); + auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3); + + uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1)}; + Transpose4x4(src); + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); + const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); + Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, + &needs_filter4_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. 
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + Transpose4x4(output); + + vst1_u16(dst_p1, output[0]); + vst1_u16(dst_p0, output[1]); + vst1_u16(dst_q0, output[2]); + vst1_u16(dst_q1, output[3]); +} + +inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p1 and q1 output from opposite directions. + // The formula is regrouped to allow 3 doubling operations to be combined. + // + // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 + // ^^^^^^^^ + // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) + // ^^^^^^^^ + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^^^^^^ + uint16x8_t sum = vaddq_u16(p2q2, p1q1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p0q0); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^ + sum = vshlq_n_u16(sum, 1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ ^^^^^^ + // Should dual issue with the left shift. 
+ const uint16x8_t q0p0 = Transpose64(p0q0); + const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); + sum = vaddq_u16(sum, outer_sum); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - (2 * p2) + q0 + q1 + // q0 = q1 - (2 * q2) + p0 + p1 + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + sum = vsubq_u16(sum, p2q2_double); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + + const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1), + vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2)}; + + // Adjust thresholds to bitdepth. 
+ outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or + // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 + // output is not used. + uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // Filter6() does not apply, but Filter4() applies to one or more values. 
+ p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + // Left side of the filter window. + auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // Overread by 2 values. These overreads become the high halves of src_raw[2] + // and src_raw[3] after transpose. + uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + Transpose4x8(src_raw); + // p2, p1, p0, q0, q1, q2 + const uint16x4_t src[6] = { + vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), + vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), + vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), + }; + + // Adjust thresholds to bitdepth. 
+ outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or + // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 + // output is not used. + uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // Filter6() does not apply, but Filter4() applies to one or more values. 
+ p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + Transpose4x4(output); + + // dst_n starts at p2, so adjust to p1. + vst1_u16(dst_0 + 1, output[0]); + vst1_u16(dst_1 + 1, output[1]); + vst1_u16(dst_2 + 1, output[2]); + vst1_u16(dst_3 + 1, output[3]); +} + +inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, + const uint16x8_t p1q1, const uint16x8_t p0q0, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p2 and q2 output from opposite directions. + // The formula is regrouped to allow 2 doubling operations to be combined. + // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 + // ^^^^^^^^ + // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) + // ^^^^^^^^ + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^ + uint16x8_t sum = vshlq_n_u16(p23q23, 1); + + // Add two other terms to make dual issue with shift more likely. 
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^^^ + sum = vaddq_u16(sum, p01q01); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p3q3); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + const uint16x8_t q0p0 = Transpose64(p0q0); + sum = vaddq_u16(sum, q0p0); + + *p2q2_output = vrshrq_n_u16(sum, 3); + + // Convert to p1 and q1 output: + // p1 = p2 - p3 - p2 + p1 + q1 + // q1 = q2 - q3 - q2 + q1 + p1 + sum = vsubq_u16(sum, p23q23); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - p3 - p1 + p0 + q2 + // q0 = q1 - q3 - q1 + q0 + p2 + sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); + const uint16x8_t q2p2 = Transpose64(p2q2); + sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + const uint16x4_t src[8] = { + vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), + vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)}; + + // Adjust thresholds to 
bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); + const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); + const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); + const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() does not apply, but Filter4() applies to one or more values. 
+ p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); +} + +inline uint16x8_t ReverseLowHalf(const uint16x8_t a) { + return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); +} + +void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. + // To get desired pairs after transpose, one half should be reversed. + uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + + // src[0] = p0q0 + // src[1] = p1q1 + // src[2] = p2q2 + // src[3] = p3q3 + LoopFilterTranspose4x8(src); + + // Adjust thresholds to bitdepth. 
+ outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = OuterThreshold( + vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), + vget_high_u16(src[1]), outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = src[0]; + const uint16x8_t p1q1 = src[1]; + const uint16x8_t p2q2 = src[2]; + const uint16x8_t p3q3 = src[3]; + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() does not apply, but Filter4() applies to one or more values. 
+ p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3}; + // After transpose, |output| will contain rows of the form: + // p0 p1 p2 p3 q0 q1 q2 q3 + Transpose4x8(output); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, ReverseLowHalf(output[0])); + vst1q_u16(dst_1, ReverseLowHalf(output[1])); + vst1q_u16(dst_2, ReverseLowHalf(output[2])); + vst1q_u16(dst_3, ReverseLowHalf(output[3])); +} + +inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5, + const uint16x8_t p4q4, const uint16x8_t p3q3, + const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t* const p5q5_output, + uint16x8_t* const p4q4_output, + uint16x8_t* const p3q3_output, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p5 and q5 output from opposite directions. 
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^^^^^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^^^^^^^^^^^^ + uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); + sum = vaddq_u16(sum, p6q6_x7); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^ + const uint16x8_t q0p0 = Transpose64(p0q0); + sum = vaddq_u16(sum, q0p0); + + *p5q5_output = vrshrq_n_u16(sum, 4); + + // Convert to p4 and q4 output: + // p4 = p5 - (2 * p6) + p3 + q1 + // q4 = q5 - (2 * q6) + q3 + p1 + sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); + + *p4q4_output = vrshrq_n_u16(sum, 4); + + // Convert to p3 and q3 output: + // p3 = p4 - p6 - p5 + p2 + q2 + // q3 = q4 - q6 - q5 + q2 + p2 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); + const uint16x8_t q2p2 = Transpose64(p2q2); + sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); + + *p3q3_output = vrshrq_n_u16(sum, 4); + + // Convert to p2 and q2 output: + // p2 = p3 - p6 - p4 + p1 + q3 + // q2 = q3 - q6 - q4 + q1 + p3 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); + const uint16x8_t q3p3 = Transpose64(p3q3); + sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); + + 
*p2q2_output = vrshrq_n_u16(sum, 4); + + // Convert to p1 and q1 output: + // p1 = p2 - p6 - p3 + p0 + q4 + // q1 = q2 - q6 - q3 + q0 + p4 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); + const uint16x8_t q4p4 = Transpose64(p4q4); + sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); + + *p1q1_output = vrshrq_n_u16(sum, 4); + + // Convert to p0 and q0 output: + // p0 = p1 - p6 - p2 + q0 + q5 + // q0 = q1 - q6 - q2 + p0 + p5 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); + const uint16x8_t q5p5 = Transpose64(p5q5); + sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); + + *p0q0_output = vrshrq_n_u16(sum, 4); +} + +void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride); + auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride); + auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride); + auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride); + auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride); + auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride); + + const uint16x4_t src[14] = { + vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), + vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), + 
vld1_u16(dst_q5), vld1_u16(dst_q6)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); + const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); + const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); + const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); + const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); + const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); + // Mask to choose between the outputs of Filter8 and Filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // Filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6))); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. 
+ const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() and Filter14() do not apply, but Filter4() applies to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // Filter14() does not apply, but Filter8() and Filter4() apply to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. 
+ const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + + vst1_u16(dst_p5, vget_low_u16(p5q5_output)); + vst1_u16(dst_p4, vget_low_u16(p4q4_output)); + vst1_u16(dst_p3, vget_low_u16(p3q3_output)); + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); + vst1_u16(dst_q3, vget_high_u16(p3q3_output)); + vst1_u16(dst_q4, vget_high_u16(p4q4_output)); + vst1_u16(dst_q5, vget_high_u16(p5q5_output)); +} + +inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) { + uint16x8x2_t acdb; +#if defined(__aarch64__) + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + 
vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); +#else + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), + vreinterpretq_u64_u16(ab), 1)); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), + vreinterpretq_u64_u16(ab), 0)); +#endif // defined(__aarch64__) + return acdb; +} + +void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // Low halves: p7 p6 p5 p4 + // High halves: p3 p2 p1 p0 + uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + // p7 will be the low half of src_p[0]. Not used until the end. + Transpose4x8(src_p); + + // Low halves: q0 q1 q2 q3 + // High halves: q4 q5 q6 q7 + uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), + vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)}; + // q7 will be the high half of src_q[3]. Not used until the end. + Transpose4x8(src_q); + + // Adjust thresholds to bitdepth. 
+ outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = OuterThreshold( + vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), + vget_low_u16(src_q[1]), outer_thresh); + const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); + const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); + const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); + const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + const uint16x8_t p4q4 = + vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); + const uint16x8_t p5q5 = + vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); + const uint16x8_t p6q6 = + vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); + const uint16x8_t p7q7 = + vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); + // Mask to choose between the outputs of Filter8 and Filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // Filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6))); + // Copy the masks to the high bits for packed comparisons later. 
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() and Filter14() do not apply, but Filter4() applies to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // Filter14() does not apply, but Filter8() and Filter4() apply to one or + // more values. 
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. + const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + // To get the correctly ordered rows from the transpose, we need: + // p7p3 p6p2 p5p1 p4p0 + // q0q4 q1q5 q2q6 q3q7 + const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output); + const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output); + const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output); + const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output); + uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0], + 
p5p1_q1q5.val[0], p4p0_q0q4.val[0]}; + Transpose4x8(output_p); + uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1], + p6p2_q2q6.val[1], p7p3_q3q7.val[1]}; + Transpose4x8(output_q); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, output_p[0]); + vst1q_u16(dst_0 + 8, output_q[0]); + vst1q_u16(dst_1, output_p[1]); + vst1q_u16(dst_1 + 8, output_q[1]); + vst1q_u16(dst_2, output_p[2]); + vst1q_u16(dst_2 + 8, output_q[2]); + vst1q_u16(dst_3, output_p[3]); + vst1q_u16(dst_3 + 8, output_q[3]); +} + +} // namespace + +void LoopFilterInit10bpp_NEON() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Horizontal4_NEON; + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Horizontal6_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Horizontal8_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Horizontal14_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Vertical14_NEON; +} + +} // namespace dsp +} // namespace libgav1 + +#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10) +namespace libgav1 { +namespace dsp { + +void LoopFilterInit10bpp_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/libgav1/src/dsp/arm/loop_filter_neon.cc b/libgav1/src/dsp/arm/loop_filter_neon.cc index 8c03928..a8b236d 100644 --- a/libgav1/src/dsp/arm/loop_filter_neon.cc +++ b/libgav1/src/dsp/arm/loop_filter_neon.cc @@ -29,7 +29,6 @@ namespace libgav1 { namespace dsp { -namespace low_bitdepth { namespace { // (abs(p1 - 
p0) > thresh) || (abs(q1 - q0) > thresh) @@ -149,10 +148,6 @@ void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter4_mask) == 0) { // None of the values will be filtered. return; @@ -209,10 +204,6 @@ void Vertical4_NEON(void* const dest, const ptrdiff_t stride, needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter4_mask) == 0) { // None of the values will be filtered. return; @@ -346,10 +337,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter6_mask) == 0) { // None of the values will be filtered. return; @@ -420,10 +407,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter6_mask) == 0) { // None of the values will be filtered. 
return; @@ -600,10 +583,6 @@ void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -679,10 +658,6 @@ void Vertical8_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -863,10 +838,6 @@ void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -1031,10 +1002,6 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. 
return; @@ -1158,7 +1125,9 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride, vst1q_u8(dst, output_3); } -void Init8bpp() { +} // namespace + +void LoopFilterInit_NEON() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = @@ -1178,1267 +1147,6 @@ void Init8bpp() { dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14_NEON; } -} // namespace -} // namespace low_bitdepth - -#if LIBGAV1_MAX_BITDEPTH >= 10 -namespace high_bitdepth { -namespace { - -// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) -inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) { - const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); - return vorr_u16(vget_low_u16(a), vget_high_u16(a)); -} - -// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh -inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0, - const uint16x4_t q0, const uint16x4_t q1, - const uint16_t outer_thresh) { - const uint16x4_t abd_p0q0 = vabd_u16(p0, q0); - const uint16x4_t abd_p1q1 = vabd_u16(p1, q1); - const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1); - const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1); - const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half); - return vcle_u16(sum, vdup_n_u16(outer_thresh)); -} - -// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && -// OuterThreshold() -inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a)); - return vand_u16(inner_mask, outer_mask); -} - -// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && -// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && -// OuterThreshold() -inline uint16x4_t NeedsFilter6(const 
uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p1p2_q1q2, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); - const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b)); - return vand_u16(inner_mask, outer_mask); -} - -// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && -// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && -// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh -// OuterThreshold() -inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p1p2_q1q2, - const uint16x8_t abd_p2p3_q2q3, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); - const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3); - const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c)); - return vand_u16(inner_mask, outer_mask); -} - -// ----------------------------------------------------------------------------- -// FilterNMasks functions. - -inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1, - const uint16_t hev_thresh, const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const hev_mask, - uint16x4_t* const needs_filter4_mask) { - const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - // This includes cases where NeedsFilter4() is not true and so Filter2() will - // not be applied. - const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh); - - *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask); - - // Filter2() will only be applied if both NeedsFilter4() and Hev() are true. 
- *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask); -} - -// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && -// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh -// |flat_thresh| == 4 for 10 bit decode. -inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p0p2_q0q2) { - constexpr int flat_thresh = 1 << 2; - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2); - const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh)); - return vand_u16(vget_low_u16(b), vget_high_u16(b)); -} - -inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, const uint16_t hev_thresh, - const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const needs_filter6_mask, - uint16x4_t* const is_flat3_mask, - uint16x4_t* const hev_mask) { - const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); - *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2)); - *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), - inner_thresh, outer_mask); -} - -// IsFlat4 uses N=1, IsFlatOuter4 uses N=4. -// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh && -// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && -// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh -// |flat_thresh| == 4 for 10 bit decode. 
-inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0, - const uint16x8_t abd_pn1p0_qn1q0, - const uint16x8_t abd_pn2p0_qn2q0) { - constexpr int flat_thresh = 1 << 2; - const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0); - const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0); - const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh)); - return vand_u16(vget_low_u16(c), vget_high_u16(c)); -} - -inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2, - const uint16x8_t p1q1, const uint16x8_t p0q0, - const uint16_t hev_thresh, const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const needs_filter8_mask, - uint16x4_t* const is_flat4_mask, - uint16x4_t* const hev_mask) { - const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); - const uint16x4_t is_flat4 = - IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3)); - *needs_filter8_mask = - NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3), - inner_thresh, outer_mask); - // |is_flat4_mask| is used to decide where to use the result of Filter8. - // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false, - // overriding the question of whether to use Filter8. Because Filter4 doesn't - // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the - // source value. To be correct, the mask must account for this override. - *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask); -} - -// ----------------------------------------------------------------------------- -// FilterN functions. - -// Calculate Filter4() or Filter2() based on |hev_mask|. 
-inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, - const uint16x8_t p1q1, const uint16x4_t hev_mask, - uint16x8_t* const p1q1_result, - uint16x8_t* const p0q0_result) { - const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4); - // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); - // q0mp0 means "q0 minus p0". - const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1)); - const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); - - // If this is for Filter2() then include |p1mq1|. Otherwise zero it. - const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/))); - const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1); - const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); - const int16x4_t p1mq1_saturated = - Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel); - const int16x4_t hev_option = - vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated); - - const int16x4_t a = vadd_s16(q0mp0_3, hev_option); - - // Need to figure out what's going on here because there are some unnecessary - // tricks to accommodate 8x8 as smallest 8bpp vector - - // We can not shift with rounding because the clamp comes *before* the - // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = - // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; - const int16x4_t plus_four = - Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel); - const int16x4_t plus_three = - Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel); - const int16x4_t a1 = vshr_n_s16(plus_four, 3); - const int16x4_t a2 = vshr_n_s16(plus_three, 3); - - // a3 = (a1 + 1) >> 1; - const int16x4_t a3 = vrshr_n_s16(a1, 1); - - const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3)); - const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3); - - // Need to shift the second term or we end up with a2_ma2. 
- const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1)); - const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1); - *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10); - *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10); -} - -void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - - const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1)}; - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); - const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); - Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, - &needs_filter4_mask); - -#if defined(__aarch64__) - // This provides a good speedup for the unit test, but may not come up often - // enough to warrant it. - if (vaddv_u16(needs_filter4_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - const uint64x1_t needs_filter4_mask64 = - vreinterpret_u64_u16(needs_filter4_mask); - if (vget_lane_u64(needs_filter4_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. 
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter4_mask_8 = - vcombine_u16(needs_filter4_mask, needs_filter4_mask); - - uint16x8_t f_p1q1; - uint16x8_t f_p0q0; - const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); - - // Already integrated the Hev mask when calculating the filtered values. - const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); - - // p1/q1 are unmodified if only Hev() is true. This works because it was and'd - // with |needs_filter4_mask| previously. - const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); - const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); -} - -void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - // Offset by 2 uint16_t values to load from first p1 position. - auto* dst = static_cast<uint8_t*>(dest) - 4; - auto* dst_p1 = reinterpret_cast<uint16_t*>(dst); - auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride); - auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2); - auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3); - - uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1)}; - Transpose4x4(src); - - // Adjust thresholds to bitdepth. 
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); - const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); - Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, - &needs_filter4_mask); - -#if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. - if (vaddv_u16(needs_filter4_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - const uint64x1_t needs_filter4_mask64 = - vreinterpret_u64_u16(needs_filter4_mask); - if (vget_lane_u64(needs_filter4_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter4_mask_8 = - vcombine_u16(needs_filter4_mask, needs_filter4_mask); - - uint16x8_t f_p1q1; - uint16x8_t f_p0q0; - const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); - - // Already integrated the Hev mask when calculating the filtered values. - const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); - - // p1/q1 are unmodified if only Hev() is true. This works because it was and'd - // with |needs_filter4_mask| previously. 
- const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); - const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - - uint16x4_t output[4] = { - vget_low_u16(p1q1_output), - vget_low_u16(p0q0_output), - vget_high_u16(p0q0_output), - vget_high_u16(p1q1_output), - }; - Transpose4x4(output); - - vst1_u16(dst_p1, output[0]); - vst1_u16(dst_p0, output[1]); - vst1_u16(dst_q0, output[2]); - vst1_u16(dst_q1, output[3]); -} - -inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p1 and q1 output from opposite directions. - // The formula is regrouped to allow 3 doubling operations to be combined. - // - // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 - // ^^^^^^^^ - // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) - // ^^^^^^^^ - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^^^^^^ - uint16x8_t sum = vaddq_u16(p2q2, p1q1); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^ - sum = vaddq_u16(sum, p0q0); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^ - sum = vshlq_n_u16(sum, 1); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^ ^^^^^^ - // Should dual issue with the left shift. 
- const uint16x8_t q0p0 = Transpose64(p0q0); - const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); - sum = vaddq_u16(sum, outer_sum); - - *p1q1_output = vrshrq_n_u16(sum, 3); - - // Convert to p0 and q0 output: - // p0 = p1 - (2 * p2) + q0 + q1 - // q0 = q1 - (2 * q2) + p0 + p1 - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - sum = vsubq_u16(sum, p2q2_double); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); - - *p0q0_output = vrshrq_n_u16(sum, 3); -} - -void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - - const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1), - vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2)}; - - // Adjust thresholds to bitdepth. 
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat3_mask; - const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); - const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); - const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); - Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat3_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or - // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 - // output is not used. 
- uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); -} - -void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - // Left side of the filter window. - auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // Overread by 2 values. These overreads become the high halves of src_raw[2] - // and src_raw[3] after transpose. - uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - Transpose4x8(src_raw); - // p2, p1, p0, q0, q1, q2 - const uint16x4_t src[6] = { - vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), - vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), - vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), - }; - - // Adjust thresholds to bitdepth. 
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat3_mask; - const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); - const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); - const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); - Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat3_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or - // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 - // output is not used. 
- uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - uint16x4_t output[4] = { - vget_low_u16(p1q1_output), - vget_low_u16(p0q0_output), - vget_high_u16(p0q0_output), - vget_high_u16(p1q1_output), - }; - Transpose4x4(output); - - // dst_n starts at p2, so adjust to p1. - vst1_u16(dst_0 + 1, output[0]); - vst1_u16(dst_1 + 1, output[1]); - vst1_u16(dst_2 + 1, output[2]); - vst1_u16(dst_3 + 1, output[3]); -} - -inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, - const uint16x8_t p1q1, const uint16x8_t p0q0, - uint16x8_t* const p2q2_output, - uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p2 and q2 output from opposite directions. - // The formula is regrouped to allow 2 doubling operations to be combined. - // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 - // ^^^^^^^^ - // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) - // ^^^^^^^^ - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^ - const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^ - uint16x8_t sum = vshlq_n_u16(p23q23, 1); - - // Add two other terms to make dual issue with shift more likely. 
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^ - const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^^^ - sum = vaddq_u16(sum, p01q01); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^ - sum = vaddq_u16(sum, p3q3); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^ - const uint16x8_t q0p0 = Transpose64(p0q0); - sum = vaddq_u16(sum, q0p0); - - *p2q2_output = vrshrq_n_u16(sum, 3); - - // Convert to p1 and q1 output: - // p1 = p2 - p3 - p2 + p1 + q1 - // q1 = q2 - q3 - q2 + q0 + p1 - sum = vsubq_u16(sum, p23q23); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); - - *p1q1_output = vrshrq_n_u16(sum, 3); - - // Convert to p0 and q0 output: - // p0 = p1 - p3 - p1 + p0 + q2 - // q0 = q1 - q3 - q1 + q0 + p2 - sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); - const uint16x8_t q2p2 = Transpose64(p2q2); - sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); - - *p0q0_output = vrshrq_n_u16(sum, 3); -} - -void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - const uint16x4_t src[8] = { - vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)}; - - // Adjust thresholds to 
bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); - const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); - const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); - const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. 
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() does not apply, but Filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); -} - -inline uint16x8_t ReverseLowHalf(const uint16x8_t a) { - return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); -} - -void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. - // To get desired pairs after transpose, one half should be reversed. 
- uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - - // src[0] = p0q0 - // src[1] = p1q1 - // src[2] = p2q2 - // src[3] = p3q3 - LoopFilterTranspose4x8(src); - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = OuterThreshold( - vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), - vget_high_u16(src[1]), outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = src[0]; - const uint16x8_t p1q1 = src[1]; - const uint16x8_t p2q2 = src[2]; - const uint16x8_t p3q3 = src[3]; - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. 
Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() does not apply, but Filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3}; - // After transpose, |output| will contain rows of the form: - // p0 p1 p2 p3 q0 q1 q2 q3 - Transpose4x8(output); - - // Reverse p values to produce original order: - // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, ReverseLowHalf(output[0])); - vst1q_u16(dst_1, ReverseLowHalf(output[1])); - vst1q_u16(dst_2, ReverseLowHalf(output[2])); - vst1q_u16(dst_3, ReverseLowHalf(output[3])); -} -inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5, - const uint16x8_t p4q4, const uint16x8_t p3q3, - const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, uint16x8_t* const p5q5_output, - uint16x8_t* const p4q4_output, - uint16x8_t* const p3q3_output, - uint16x8_t* const p2q2_output, - uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p5 and q5 output from opposite directions. 
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^^ - const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^^^^^^^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^^^^^^^^^^^^^ - uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); - sum = vaddq_u16(sum, p6q6_x7); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^ - sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^ - sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^ - const uint16x8_t q0p0 = Transpose64(p0q0); - sum = vaddq_u16(sum, q0p0); - - *p5q5_output = vrshrq_n_u16(sum, 4); - - // Convert to p4 and q4 output: - // p4 = p5 - (2 * p6) + p3 + q1 - // q4 = q5 - (2 * q6) + q3 + p1 - sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); - - *p4q4_output = vrshrq_n_u16(sum, 4); - - // Convert to p3 and q3 output: - // p3 = p4 - p6 - p5 + p2 + q2 - // q3 = q4 - q6 - q5 + q2 + p2 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); - const uint16x8_t q2p2 = Transpose64(p2q2); - sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); - - *p3q3_output = vrshrq_n_u16(sum, 4); - - // Convert to p2 and q2 output: - // p2 = p3 - p6 - p4 + p1 + q3 - // q2 = q3 - q6 - q4 + q1 + p3 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); - const uint16x8_t q3p3 = Transpose64(p3q3); - sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); - - 
*p2q2_output = vrshrq_n_u16(sum, 4); - - // Convert to p1 and q1 output: - // p1 = p2 - p6 - p3 + p0 + q4 - // q1 = q2 - q6 - q3 + q0 + p4 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); - const uint16x8_t q4p4 = Transpose64(p4q4); - sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); - - *p1q1_output = vrshrq_n_u16(sum, 4); - - // Convert to p0 and q0 output: - // p0 = p1 - p6 - p2 + q0 + q5 - // q0 = q1 - q6 - q2 + p0 + p5 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); - const uint16x8_t q5p5 = Transpose64(p5q5); - sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); - - *p0q0_output = vrshrq_n_u16(sum, 4); -} - -void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride); - auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride); - auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride); - auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride); - auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride); - auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride); - - const uint16x4_t src[14] = { - vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), - vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), - 
vld1_u16(dst_q5), vld1_u16(dst_q6)}; - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); - const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); - const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); - const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); - const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); - const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); - // Mask to choose between the outputs of Filter8 and Filter14. - // As with the derivation of |is_flat4_mask|, the question of whether to use - // Filter14 is only raised where |is_flat4_mask| is true. - const uint16x4_t is_flat4_outer_mask = vand_u16( - is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), - vabdq_u16(p0q0, p6q6))); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. 
- const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, - p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() and Filter14() do not apply, but Filter4() applies to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // Filter14() does not apply, but Filter8() and Filter4() apply to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } else { - // All filters may contribute values to final outputs. 
- const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - } - - vst1_u16(dst_p5, vget_low_u16(p5q5_output)); - vst1_u16(dst_p4, vget_low_u16(p4q4_output)); - vst1_u16(dst_p3, vget_low_u16(p3q3_output)); - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); - vst1_u16(dst_q3, vget_high_u16(p3q3_output)); - vst1_u16(dst_q4, vget_high_u16(p4q4_output)); - vst1_u16(dst_q5, vget_high_u16(p5q5_output)); -} - -inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) { - uint16x8x2_t acdb; -#if defined(__aarch64__) - // a[b] <- [c]d - acdb.val[0] = vreinterpretq_u16_u64( - vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); - // [a]b <- c[d] - acdb.val[1] = vreinterpretq_u16_u64( - 
vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); -#else - // a[b] <- [c]d - acdb.val[0] = vreinterpretq_u16_u64( - vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), - vreinterpretq_u64_u16(ab), 1)); - // [a]b <- c[d] - acdb.val[1] = vreinterpretq_u16_u64( - vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), - vreinterpretq_u64_u16(ab), 0)); -#endif // defined(__aarch64__) - return acdb; -} - -void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // Low halves: p7 p6 p5 p4 - // High halves: p3 p2 p1 p0 - uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - // p7 will be the low half of src_p[0]. Not used until the end. - Transpose4x8(src_p); - - // Low halves: q0 q1 q2 q3 - // High halves: q4 q5 q6 q7 - uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), - vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)}; - // q7 will be the high half of src_q[3]. Not used until the end. - Transpose4x8(src_q); - - // Adjust thresholds to bitdepth. 
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = OuterThreshold( - vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), - vget_low_u16(src_q[1]), outer_thresh); - const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); - const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); - const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); - const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - const uint16x8_t p4q4 = - vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); - const uint16x8_t p5q5 = - vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); - const uint16x8_t p6q6 = - vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); - const uint16x8_t p7q7 = - vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); - // Mask to choose between the outputs of Filter8 and Filter14. - // As with the derivation of |is_flat4_mask|, the question of whether to use - // Filter14 is only raised where |is_flat4_mask| is true. - const uint16x4_t is_flat4_outer_mask = vand_u16( - is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), - vabdq_u16(p0q0, p6q6))); - // Copy the masks to the high bits for packed comparisons later. 
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, - p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() and Filter14() do not apply, but Filter4() applies to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // Filter14() does not apply, but Filter8() and Filter4() apply to one or - // more values. 
- p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - } - // To get the correctly ordered rows from the transpose, we need: - // p7p3 p6p2 p5p1 p4p0 - // q0q4 q1q5 q2q6 q3q7 - const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output); - const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output); - const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output); - const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output); - uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0], - 
p5p1_q1q5.val[0], p4p0_q0q4.val[0]}; - Transpose4x8(output_p); - uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1], - p6p2_q2q6.val[1], p7p3_q3q7.val[1]}; - Transpose4x8(output_q); - - // Reverse p values to produce original order: - // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, output_p[0]); - vst1q_u16(dst_0 + 8, output_q[0]); - vst1q_u16(dst_1, output_p[1]); - vst1q_u16(dst_1 + 8, output_q[1]); - vst1q_u16(dst_2, output_p[2]); - vst1q_u16(dst_2 + 8, output_q[2]); - vst1q_u16(dst_3, output_p[3]); - vst1q_u16(dst_3 + 8, output_q[3]); -} - -void Init10bpp() { - Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); - assert(dsp != nullptr); - dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = - Horizontal4_NEON; - dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; - dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = - Horizontal6_NEON; - dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; - dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = - Horizontal8_NEON; - dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; - dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = - Horizontal14_NEON; - dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = - Vertical14_NEON; -} - -} // namespace -} // namespace high_bitdepth -#endif // LIBGAV1_MAX_BITDEPTH >= 10 - -void LoopFilterInit_NEON() { - low_bitdepth::Init8bpp(); -#if LIBGAV1_MAX_BITDEPTH >= 10 - high_bitdepth::Init10bpp(); -#endif -} } // namespace dsp } // namespace libgav1 diff --git a/libgav1/src/dsp/arm/loop_filter_neon.h b/libgav1/src/dsp/arm/loop_filter_neon.h index 540defc..531cd0d 100644 --- a/libgav1/src/dsp/arm/loop_filter_neon.h +++ b/libgav1/src/dsp/arm/loop_filter_neon.h @@ -26,6 +26,7 @@ namespace dsp { // Initializes Dsp::loop_filters, see the defines below for specifics. This // function is not thread-safe. 
void LoopFilterInit_NEON(); +void LoopFilterInit10bpp_NEON(); } // namespace dsp } // namespace libgav1 diff --git a/libgav1/src/dsp/arm/loop_restoration_neon.cc b/libgav1/src/dsp/arm/loop_restoration_neon.cc index 2db137f..cd8552e 100644 --- a/libgav1/src/dsp/arm/loop_restoration_neon.cc +++ b/libgav1/src/dsp/arm/loop_restoration_neon.cc @@ -1504,7 +1504,6 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0, const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], mas[2]; uint16x8_t sq[2][4], bs[3]; - // TODO(b/194217060): Future msan load. s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -1599,7 +1598,6 @@ inline void BoxSumFilterPreProcess( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], ma3[2][2], ma5[2]; uint16x8_t sq[2][4], b3[2][3], b5[3]; - // TODO(b/194217060): Future msan load. s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -1801,7 +1799,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint8_t* const dst) { uint8x16_t s[2], mas[2]; uint16x8_t sq[4], bs[4]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], @@ -1812,7 +1809,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint16x8_t ma[2]; uint8x16_t masx[3]; uint32x4x2_t b[2]; - // TODO(b/194217060): Future msan load. s[1] = vld1q_u8(src0 + x + 16); BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas, @@ -1856,7 +1852,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width; uint8x16_t s[2], mas[2]; uint16x8_t sq[4], bs[3]; - // TODO(b/194217060): Future msan load. 
s[0] = vld1q_u8(src0); BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]); @@ -1915,7 +1910,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], ma3[2][2], ma5[2]; uint16x8_t sq[2][4], b3[2][3], b5[3]; - // TODO(b/194217060): Future msan load. s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -2023,7 +2017,6 @@ inline void BoxFilterLastRow( uint8x16_t s[2], ma3[2], ma5[2]; uint16x8_t sq[4], ma[3], b3[3], b5[3]; uint32x4x2_t b[3]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, @@ -2033,7 +2026,6 @@ inline void BoxFilterLastRow( do { uint8x16_t ma3x[3], ma5x[3]; int16x8_t p[2]; - // TODO(b/194217060): Future msan load. s[1] = vld1q_u8(src0 + x + 16); BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3, diff --git a/libgav1/src/dsp/arm/mask_blend_neon.cc b/libgav1/src/dsp/arm/mask_blend_neon.cc index 853f949..ecc67f8 100644 --- a/libgav1/src/dsp/arm/mask_blend_neon.cc +++ b/libgav1/src/dsp/arm/mask_blend_neon.cc @@ -33,50 +33,40 @@ namespace dsp { namespace low_bitdepth { namespace { -// TODO(b/150461164): Consider combining with GetInterIntraMask4x2(). -// Compound predictors use int16_t values and need to multiply long because the -// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by -// int8_t and accumulate into int32_t instruction. 
-template <int subsampling_x, int subsampling_y> -inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { - if (subsampling_x == 1) { - const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask))); - const int16x4_t mask_val1 = vreinterpret_s16_u16( - vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y)))); - int16x8_t final_val; - if (subsampling_y == 1) { - const int16x4_t next_mask_val0 = - vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride))); - const int16x4_t next_mask_val1 = - vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3))); - final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1), - vcombine_s16(next_mask_val0, next_mask_val1)); - } else { - final_val = vreinterpretq_s16_u16( - vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1)))); - } - return vrshrq_n_s16(final_val, subsampling_y + 1); +template <int subsampling_y> +inline uint8x8_t GetMask4x2(const uint8_t* mask) { + if (subsampling_y == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]); + const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz)); + const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz)); + + const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23); + // Use a halving add to work around the case where all |mask| values are 64. 
+ return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]), + vreinterpret_u8_u32(row_02_13.val[1])), + 1); } - assert(subsampling_y == 0 && subsampling_x == 0); - const uint8x8_t mask_val0 = Load4(mask); - const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0); - return vreinterpretq_s16_u16(vmovl_u8(mask_val)); + // subsampling_x == 1 + const uint8x8x2_t mask_val = vld2_u8(mask); + return vrhadd_u8(mask_val.val[0], mask_val.val[1]); } template <int subsampling_x, int subsampling_y> -inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) { +inline uint8x8_t GetMask8(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]); + // Use a halving add to work around the case where all |mask| values are 64. + return vrshr_n_u8( + vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1); + } if (subsampling_x == 1) { - int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask))); - if (subsampling_y == 1) { - const int16x8_t next_mask_val = - vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride))); - mask_val = vaddq_s16(mask_val, next_mask_val); - } - return vrshrq_n_s16(mask_val, 1 + subsampling_y); + const uint8x8x2_t mask_val = vld2_u8(mask); + return vrhadd_u8(mask_val.val[0], mask_val.val[1]); } assert(subsampling_y == 0 && subsampling_x == 0); - const uint8x8_t mask_val = vld1_u8(mask); - return vreinterpretq_s16_u16(vmovl_u8(mask_val)); + return vld1_u8(mask); } inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, @@ -109,89 +99,162 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, StoreHi4(dst + dst_stride, result); } -template <int subsampling_x, int subsampling_y> +template <int subsampling_y> inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, const int16_t* LIBGAV1_RESTRICT 
pred_1, const uint8_t* LIBGAV1_RESTRICT mask, - const ptrdiff_t mask_stride, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + constexpr int subsampling_x = 1; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const int16x8_t mask_inverter = vdupq_n_s16(64); - int16x8_t pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + // Compound predictors use int16_t values and need to multiply long because + // the Convolve range * 64 is 20 bits. Unfortunately there is no multiply + // int16_t by int8_t and accumulate into int32_t instruction. + int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - // TODO(b/150461164): Arm tends to do better with load(val); val += stride - // It may be possible to turn this into a loop with a templated height. - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); } -template <int subsampling_x, int subsampling_y> +template <int subsampling_y> inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int height, + const int height, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { - MaskBlending4x4_NEON<subsampling_x, subsampling_y>( - pred_0, pred_1, mask, 
mask_stride, dst, dst_stride); + MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride); return; } + constexpr int subsampling_x = 1; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const int16x8_t mask_inverter = vdupq_n_s16(64); int y = 0; do { int16x8_t pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask))); int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, 
mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; y += 8; } while (y < height); } +inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const int16x8_t pred_mask_0, + const int16x8_t pred_mask_1) { + // First 8 values. + const int16x8_t pred_val_0 = vld1q_s16(pred_0); + const int16x8_t pred_val_1 = vld1q_s16(pred_1); + // int res = (mask_value * prediction_0[x] + + // (64 - mask_value) * prediction_1[x]) >> 6; + const int32x4_t weighted_pred_lo = + vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0)); + const int32x4_t weighted_pred_hi = + vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0)); + const int32x4_t weighted_combo_lo = vmlal_s16( + weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1)); + const int32x4_t weighted_combo_hi = vmlal_s16( + weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1)); + + // dst[x] = static_cast<Pixel>( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6), + vshrn_n_s32(weighted_combo_hi, 6)), + 4); +} + +template <int subsampling_x, int subsampling_y> +inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + const int16x8_t mask_inverter = 
vdupq_n_s16(64); + int y = height; + do { + const int16x8_t pred_mask_0 = + ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask)); + // 64 - mask + const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); + const uint8x8_t result = + CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1); + vst1_u8(dst, result); + dst += dst_stride; + mask += 8 << (subsampling_x + subsampling_y); + pred_0 += 8; + pred_1 += 8; + } while (--y != 0); +} + +template <int subsampling_x, int subsampling_y> +inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) { + if (subsampling_x == 1 && subsampling_y == 1) { + const uint8x16x2_t mask_val0 = vld2q_u8(mask); + const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride); + const uint8x16_t combined_horz0 = + vaddq_u8(mask_val0.val[0], mask_val0.val[1]); + const uint8x16_t combined_horz1 = + vaddq_u8(mask_val1.val[0], mask_val1.val[1]); + // Use a halving add to work around the case where all |mask| values are 64. + return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1); + } + if (subsampling_x == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + return vrhaddq_u8(mask_val.val[0], mask_val.val[1]); + } + assert(subsampling_y == 0 && subsampling_x == 0); + return vld1q_u8(mask); +} + template <int subsampling_x, int subsampling_y> inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, @@ -204,8 +267,13 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); if (width == 4) { - MaskBlending4xH_NEON<subsampling_x, subsampling_y>( - pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride); + MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst, + dst_stride); + return; + } + if (width == 8) { + MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, 
mask_ptr, + height, dst, dst_stride); return; } const uint8_t* mask = mask_ptr; @@ -214,35 +282,24 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, do { int x = 0; do { - const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( + const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>( mask + (x << subsampling_x), mask_stride); + const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0)); + const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0)); // 64 - mask - const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); - const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x); - const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x); + const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo); + const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi); + uint8x8_t result; - // int res = (mask_value * prediction_0[x] + - // (64 - mask_value) * prediction_1[x]) >> 6; - const int32x4_t weighted_pred_0_lo = - vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0)); - const int32x4_t weighted_pred_0_hi = - vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0)); - const int32x4_t weighted_combo_lo = - vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1), - vget_low_s16(pred_val_1)); - const int32x4_t weighted_combo_hi = - vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1), - vget_high_s16(pred_val_1)); - - // dst[x] = static_cast<Pixel>( - // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, - // (1 << kBitdepth8) - 1)); - result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6), - vshrn_n_s32(weighted_combo_hi, 6)), - 4); + result = + CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo); vst1_u8(dst + x, result); - x += 8; + result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi, + pred_mask_1_hi); + vst1_u8(dst + x + 8, result); + + x += 16; } while (x < width); dst += 
dst_stride; pred_0 += width; @@ -251,63 +308,19 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, } while (++y < height); } -// TODO(b/150461164): This is much faster for inter_intra (input is Pixel -// values) but regresses compound versions (input is int16_t). Try to -// consolidate these. template <int subsampling_x, int subsampling_y> inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { if (subsampling_x == 1) { - const uint8x8_t mask_val = - vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y))); - if (subsampling_y == 1) { - const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride), - vld1_u8(mask + mask_stride * 3)); - - // Use a saturating add to work around the case where all |mask| values - // are 64. Together with the rounding shift this ensures the correct - // result. - const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val); - return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y); - } - - return vrshr_n_u8(mask_val, /*subsampling_x=*/1); + return GetMask4x2<subsampling_y>(mask); } - + // When using intra or difference weighted masks, the function doesn't use + // subsampling, so |mask_stride| may be 4 or 8. assert(subsampling_y == 0 && subsampling_x == 0); const uint8x8_t mask_val0 = Load4(mask); - // TODO(b/150461164): Investigate the source of |mask| and see if the stride - // can be removed. - // TODO(b/150461164): The unit tests start at 8x8. Does this get run? 
return Load4<1>(mask + mask_stride, mask_val0); } -template <int subsampling_x, int subsampling_y> -inline uint8x8_t GetInterIntraMask8(const uint8_t* mask, - ptrdiff_t mask_stride) { - if (subsampling_x == 1) { - const uint8x16_t mask_val = vld1q_u8(mask); - const uint8x8_t mask_paired = - vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val)); - if (subsampling_y == 1) { - const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride); - const uint8x8_t next_mask_paired = - vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val)); - - // Use a saturating add to work around the case where all |mask| values - // are 64. Together with the rounding shift this ensures the correct - // result. - const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired); - return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y); - } - - return vrshr_n_u8(mask_paired, /*subsampling_x=*/1); - } - - assert(subsampling_y == 0 && subsampling_x == 0); - return vld1_u8(mask); -} - inline void InterIntraWriteMaskBlendLine8bpp4x2( const uint8_t* LIBGAV1_RESTRICT const pred_0, uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1, @@ -374,6 +387,32 @@ inline void InterIntraMaskBlending8bpp4xH_NEON( } template <int subsampling_x, int subsampling_y> +inline void InterIntraMaskBlending8bpp8xH_NEON( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, const int height) { + const uint8x8_t mask_inverter = vdup_n_u8(64); + int y = height; + do { + const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask); + // 64 - mask + const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); + const uint8x8_t pred_val_0 = vld1_u8(pred_0); + const uint8x8_t pred_val_1 = vld1_u8(pred_1); + const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + // weighted_pred0 + weighted_pred1 + const uint16x8_t weighted_combo = + 
vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); + const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); + vst1_u8(pred_1, result); + + pred_0 += 8; + pred_1 += pred_stride_1; + mask += mask_stride << subsampling_y; + } while (--y != 0); +} + +template <int subsampling_x, int subsampling_y> inline void InterIntraMaskBlend8bpp_NEON( const uint8_t* LIBGAV1_RESTRICT prediction_0, uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, @@ -385,30 +424,46 @@ inline void InterIntraMaskBlend8bpp_NEON( height); return; } + if (width == 8) { + InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>( + prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, + height); + return; + } const uint8_t* mask = mask_ptr; - const uint8x8_t mask_inverter = vdup_n_u8(64); + const uint8x16_t mask_inverter = vdupq_n_u8(64); int y = 0; do { int x = 0; do { - // TODO(b/150461164): Consider a 16 wide specialization (at least for the - // unsampled version) to take advantage of vld1q_u8(). - const uint8x8_t pred_mask_1 = - GetInterIntraMask8<subsampling_x, subsampling_y>( - mask + (x << subsampling_x), mask_stride); + const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>( + mask + (x << subsampling_x), mask_stride); // 64 - mask - const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); - const uint8x8_t pred_val_0 = vld1_u8(prediction_0); + const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1); + const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0); + prediction_0 += 8; + const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0); prediction_0 += 8; - const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x); - const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + // Ensure armv7 build combines the load. 
+ const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x); + const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1); + const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1); + const uint16x8_t weighted_pred_0_lo = + vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo); // weighted_pred0 + weighted_pred1 - const uint16x8_t weighted_combo = - vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); - const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); - vst1_u8(prediction_1 + x, result); + const uint16x8_t weighted_combo_lo = + vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo); + const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6); + vst1_u8(prediction_1 + x, result_lo); + const uint16x8_t weighted_pred_0_hi = + vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi); + // weighted_pred0 + weighted_pred1 + const uint16x8_t weighted_combo_hi = vmlal_u8( + weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi); + const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6); + vst1_u8(prediction_1 + x + 8, result_hi); - x += 8; + x += 16; } while (x < width); prediction_1 += prediction_stride_1; mask += mask_stride << subsampling_y; diff --git a/libgav1/src/dsp/arm/obmc_neon.cc b/libgav1/src/dsp/arm/obmc_neon.cc index 659ed8e..271bbaa 100644 --- a/libgav1/src/dsp/arm/obmc_neon.cc +++ b/libgav1/src/dsp/arm/obmc_neon.cc @@ -52,6 +52,17 @@ inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred, StoreLo4(pred, result); } +inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred, + const uint8x8_t obmc_pred_val, + const uint8x8_t pred_mask, + const uint8x8_t obmc_pred_mask) { + const uint8x8_t pred_val = vld1_u8(pred); + const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); + const uint8x8_t result = + vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + vst1_u8(pred, result); +} + inline void OverlapBlendFromLeft2xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, 
const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, @@ -99,24 +110,25 @@ inline void OverlapBlendFromLeft4xH_NEON( inline void OverlapBlendFromLeft8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6); + constexpr int obmc_prediction_stride = 8; // 64 - mask const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); int y = 0; do { - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask); + pred += prediction_stride; - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask, + obmc_pred_mask); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != height); + + obmc_pred += obmc_prediction_stride << 1; + y += 2; + } while (y != height); } void OverlapBlendFromLeft_NEON( @@ -140,8 +152,7 @@ void OverlapBlendFromLeft_NEON( return; } if (width == 8) { - OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint8x16_t mask_inverter = vdupq_n_u8(64); @@ -262,26 +273,31 @@ inline void OverlapBlendFromTop4xH_NEON( inline void OverlapBlendFromTop8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int 
height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 8; const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8_t* mask = kObmcMask + height - 2; const int compute_height = height - (height >> 2); int y = 0; do { - const uint8x8_t pred_mask = vdup_n_u8(mask[y]); + const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]); // 64 - mask - const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0, + obmc_pred_mask0); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != compute_height); + ++y; + + const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]); + // 64 - mask + const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1, + obmc_pred_mask1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (++y < compute_height); } void OverlapBlendFromTop_NEON( @@ -301,8 +317,7 @@ void OverlapBlendFromTop_NEON( } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } @@ -371,26 +386,23 @@ constexpr uint16_t kObmcMask[62] = { 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64}; -inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline 
uint16x4_t BlendObmc2Or4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, const uint16x4_t pred_mask, const uint16x4_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val); const uint16x4_t result = vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); return result; } -inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val); const uint16x8_t result = vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); @@ -398,27 +410,29 @@ inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, } inline void OverlapBlendFromLeft2xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 2; const uint16x4_t mask_inverter = vdup_n_u16(64); // Second two lanes unused. 
const uint16x4_t pred_mask = vld1_u16(kObmcMask); const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { + const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred); const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0); + BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_0); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; + const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred); const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1); + BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_1); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; y += 2; @@ -426,26 +440,26 @@ inline void OverlapBlendFromLeft2xH_NEON( } inline void OverlapBlendFromLeft4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 4; const uint16x4_t mask_inverter = vdup_n_u16(64); const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2); // 64 - mask const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { - const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1); - pred 
+= prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result_0); + pred = AddByteStride(pred, prediction_stride); + + const uint16x4_t result_1 = BlendObmc2Or4( + pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask); + vst1_u16(pred, result_1); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y != height); @@ -456,52 +470,47 @@ void OverlapBlendFromLeft_NEON( const int width, const int height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 2); assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16_t* mask = kObmcMask + width - 2; int x = 0; do { - pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x); - obmc_pred = reinterpret_cast<const uint8_t*>( - static_cast<const uint16_t*>(obmc_prediction) + x); + uint16_t* pred_x = pred + x; + const uint16_t* obmc_pred_x = obmc_pred + x; const uint16x8_t pred_mask = vld1q_u16(mask + x); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); int y = 0; do { const uint16x8_t result = - 
BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask); + vst1q_u16(pred_x, result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred_x = AddByteStride(pred_x, prediction_stride); + obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride); } while (++y < height); x += 8; } while (x < width); } template <int lane> -inline uint16x4_t BlendObmcFromTop4( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); +inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, + const uint16x8_t pred_mask, + const uint16x8_t obmc_pred_mask) { + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask); const uint16x4_t result = vrshr_n_u16( VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -510,12 +519,11 @@ inline uint16x4_t BlendObmcFromTop4( template <int lane> inline uint16x8_t BlendObmcFromTop8( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, + const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask); 
const uint16x8_t result = vrshrq_n_u16( VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -523,41 +531,43 @@ inline uint16x8_t BlendObmcFromTop8( } inline void OverlapBlendFromTop4x2Or4_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { + constexpr int obmc_prediction_stride = 4; const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]); const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); if (height == 2) { // Mask value is 64, meaning |pred| is unchanged. 
return; } - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred); + result = + BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask); + vst1_u16(pred, result); } inline void OverlapBlendFromTop4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { if (height < 8) { - OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height); return; } + constexpr int obmc_prediction_stride = 4; const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); int y = 0; @@ -566,36 +576,44 @@ inline void OverlapBlendFromTop4xH_NEON( do { const uint16x8_t pred_mask = vld1q_u16(&mask[y]); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += 
prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + // Load obmc row 0, 1. + uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 2, 3. + obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 4, 5. 
+ obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; // Increment for the right mask index. y += 6; @@ -603,147 +621,147 @@ inline void OverlapBlendFromTop4xH_NEON( } inline void OverlapBlendFromTop8xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); uint16x8_t pred_mask = vld1q_u16(mask); uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); uint16x8_t result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 2) return; - pred += prediction_stride; + constexpr int obmc_prediction_stride = 8; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; 
result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 4) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 8) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[8]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, 
prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 16) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[16]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + 
pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); } void OverlapBlendFromTop_NEON( @@ -751,20 +769,18 @@ void OverlapBlendFromTop_NEON( const int width, const int 
height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 4); assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height); return; } @@ -773,19 +789,16 @@ void OverlapBlendFromTop_NEON( const uint16x8_t pred_mask = vld1q_u16(mask); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); -#define OBMC_ROW_FROM_TOP(n) \ - do { \ - int x = 0; \ - do { \ - const uint16x8_t result = BlendObmcFromTop8<n>( \ - reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \ - reinterpret_cast<const uint8_t*>( \ - reinterpret_cast<const uint16_t*>(obmc_pred) + x), \ - pred_mask, obmc_pred_mask); \ - vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result); \ - \ - x += 8; \ - } while (x < width); \ +#define OBMC_ROW_FROM_TOP(n) \ + do { \ + int x = 0; \ + do { \ + const uint16x8_t result = BlendObmcFromTop8<n>( \ + pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \ + vst1q_u16(pred + x, result); \ + \ + x += 8; \ + } while (x < width); \ } while (false) // Compute 1 row. @@ -797,11 +810,11 @@ void OverlapBlendFromTop_NEON( // Compute 3 rows. 
if (height == 4) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); return; } @@ -809,20 +822,20 @@ void OverlapBlendFromTop_NEON( // Compute 6 rows. if (height == 8) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); return; } @@ -830,42 +843,42 @@ void OverlapBlendFromTop_NEON( // Compute 12 rows. 
if (height == 16) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); const uint16x8_t pred_mask = vld1q_u16(&mask[8]); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); 
OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); return; } @@ -879,29 +892,29 @@ void OverlapBlendFromTop_NEON( // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); 
OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); y += 8; } while (y < compute_height); diff --git a/libgav1/src/dsp/arm/warp_neon.cc b/libgav1/src/dsp/arm/warp_neon.cc index 71e0a43..da380b1 100644 --- a/libgav1/src/dsp/arm/warp_neon.cc +++ b/libgav1/src/dsp/arm/warp_neon.cc @@ -147,14 +147,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, do { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -207,22 +201,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const uint8_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? 
src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint8_t row_border_pixel = first_row_border[row * source_stride]; @@ -256,15 +252,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * source_stride]; sum <<= (kFilterBits - kInterRoundBitsHorizontal); intermediate_result_column[y + 7] = sum; } // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); #if defined(__aarch64__) @@ -341,10 +337,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. 
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint8_t* const src_row = src + row * source_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -354,11 +351,12 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 bytes that extend the // frame boundary pixels. We also assume there is at least one extra // padding byte after the right border of the last source row. - const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]); + const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]); // Convert src_row_v to int8 (subtract 128). const int8x16_t src_row_centered = vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128))); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { HorizontalFilter(sx4, alpha, src_row_centered, intermediate_result[y + 7]); @@ -367,12 +365,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, } else { // Region 4. // Horizontal filter. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const uint8_t* const src_row = src + row * source_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. 
@@ -382,7 +381,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 bytes that extend the // frame boundary pixels. We also assume there is at least one extra // padding byte after the right border of the last source row. - const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]); + const uint8x16_t src_row_v = + vld1q_u8(&src_row[filter_params.ix4 - 7]); // Convert src_row_v to int8 (subtract 128). const int8x16_t src_row_centered = vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128))); @@ -395,8 +395,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // Regions 3 and 4. // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); int16x8_t filter[8]; @@ -574,14 +574,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, do { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -634,22 +628,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // border index (source_width - 1 or 0, respectively). 
Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const uint16_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint16_t row_border_pixel = first_row_border[row * src_stride]; DestType* dst_row = dst + start_x - block_start_x; @@ -684,15 +680,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * src_stride]; sum <<= (kFilterBits - kInterRoundBitsHorizontal); intermediate_result_column[y + 7] = sum; } // Vertical filter. 
DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); #if defined(__aarch64__) @@ -782,10 +778,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint16_t* const src_row = src + row * src_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -795,8 +792,10 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 pixels that extend the // frame boundary pixels. We also assume there is at least one extra // padding pixel after the right border of the last source row. - const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + const uint16x8x2_t src_row_v = + LoadSrcRow(&src_row[filter_params.ix4 - 7]); + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); sx4 += beta; @@ -804,12 +803,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, } else { // Region 4. // Horizontal filter. 
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const uint16_t* const src_row = src + row * src_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -819,7 +819,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // frame has left and right borders of at least 13 pixels that extend // the frame boundary pixels. We also assume there is at least one // extra padding pixel after the right border of the last source row. - const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]); + const uint16x8x2_t src_row_v = + LoadSrcRow(&src_row[filter_params.ix4 - 7]); HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); sx4 += beta; } @@ -828,8 +829,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // Regions 3 and 4. // Vertical filter. 
DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); int16x8_t filter[8]; diff --git a/libgav1/src/dsp/average_blend.cc b/libgav1/src/dsp/average_blend.cc index 273b355..1a37aa1 100644 --- a/libgav1/src/dsp/average_blend.cc +++ b/libgav1/src/dsp/average_blend.cc @@ -87,6 +87,21 @@ void Init10bpp() { } #endif +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->average_blend = AverageBlend_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_AverageBlend + dsp->average_blend = AverageBlend_C<12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + } // namespace void AverageBlendInit_C() { @@ -94,6 +109,9 @@ void AverageBlendInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/cdef.cc b/libgav1/src/dsp/cdef.cc index ca2adfd..9dd9287 100644 --- a/libgav1/src/dsp/cdef.cc +++ b/libgav1/src/dsp/cdef.cc @@ -32,9 +32,11 @@ namespace { #include "src/dsp/cdef.inc" // Silence unused function warnings when CdefDirection_C is obviated. 
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ - !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \ - (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection)) +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \ + (LIBGAV1_MAX_BITDEPTH >= 10 && \ + !defined(LIBGAV1_Dsp10bpp_CdefDirection)) || \ + (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefDirection)) constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105}; int32_t Square(int32_t x) { return x * x; } @@ -103,12 +105,15 @@ void CdefDirection_C(const void* LIBGAV1_RESTRICT const source, #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || // !defined(LIBGAV1_Dsp8bpp_CdefDirection) || // (LIBGAV1_MAX_BITDEPTH >= 10 && - // !defined(LIBGAV1_Dsp10bpp_CdefDirection)) + // !defined(LIBGAV1_Dsp10bpp_CdefDirection)) + // (LIBGAV1_MAX_BITDEPTH == 12 && + // !defined(LIBGAV1_Dsp12bpp_CdefDirection)) // Silence unused function warnings when CdefFilter_C is obviated. -#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ - !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \ - (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \ + (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) || \ + (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefFilters)) int Constrain(int diff, int threshold, int damping) { assert(threshold != 0); @@ -218,7 +223,9 @@ void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src, #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || // !defined(LIBGAV1_Dsp8bpp_CdefFilters) || // (LIBGAV1_MAX_BITDEPTH >= 10 && - // !defined(LIBGAV1_Dsp10bpp_CdefFilters)) + // !defined(LIBGAV1_Dsp10bpp_CdefFilters)) + // (LIBGAV1_MAX_BITDEPTH == 12 && + // !defined(LIBGAV1_Dsp12bpp_CdefFilters)) void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(8); @@ -294,7 +301,48 @@ void Init10bpp() { #endif #endif // 
LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->cdef_direction = CdefDirection_C<12, uint16_t>; + dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>; + dsp->cdef_filters[0][1] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>; + dsp->cdef_filters[1][1] = + CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_CdefDirection + dsp->cdef_direction = CdefDirection_C<12, uint16_t>; #endif +#ifndef LIBGAV1_Dsp12bpp_CdefFilters + dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>; + dsp->cdef_filters[0][1] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>; + dsp->cdef_filters[1][1] = + CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -303,6 +351,9 @@ void CdefInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/cdef.h b/libgav1/src/dsp/cdef.h index b820b77..ce23ea5 100644 --- a/libgav1/src/dsp/cdef.h +++ b/libgav1/src/dsp/cdef.h @@ -38,6 
+38,11 @@ namespace libgav1 { namespace dsp { +enum { + kCdefSecondaryTap0 = 2, + kCdefSecondaryTap1 = 1, +}; + // Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not // thread-safe. void CdefInit_C(); diff --git a/libgav1/src/dsp/constants.h b/libgav1/src/dsp/constants.h index 7c1b62c..dd0a4e0 100644 --- a/libgav1/src/dsp/constants.h +++ b/libgav1/src/dsp/constants.h @@ -27,25 +27,7 @@ namespace libgav1 { enum { - // Documentation variables. - kBitdepth8 = 8, - kBitdepth10 = 10, - kBitdepth12 = 12, - // Weights are quadratic from '1' to '1 / block_size', scaled by - // 2^kSmoothWeightScale. - kSmoothWeightScale = 8, kCflLumaBufferStride = 32, - // InterRound0, Section 7.11.3.2. - kInterRoundBitsHorizontal = 3, // 8 & 10-bit. - kInterRoundBitsHorizontal12bpp = 5, - kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction. - kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction. - kInterRoundBitsVertical12bpp = 9, - // Offset applied to 10bpp and 12bpp predictors to allow storing them in - // uint16_t. Removed before blending. 
- kCompoundOffset = (1 << 14) + (1 << 13), - kCdefSecondaryTap0 = 2, - kCdefSecondaryTap1 = 1, }; // anonymous enum extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8]; diff --git a/libgav1/src/dsp/convolve.cc b/libgav1/src/dsp/convolve.cc index f11b45e..6989da0 100644 --- a/libgav1/src/dsp/convolve.cc +++ b/libgav1/src/dsp/convolve.cc @@ -864,7 +864,93 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>; + dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>; + dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>; + dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>; + + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>; + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>; + + dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>; + dsp->convolve[1][0][0][1] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>; + dsp->convolve[1][0][1][0] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>; + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>; + + dsp->convolve[1][1][0][0] = nullptr; + dsp->convolve[1][1][0][1] = nullptr; + dsp->convolve[1][1][1][0] = nullptr; + dsp->convolve[1][1][1][1] = nullptr; + + dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>; + dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>; +#else // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp12bpp_ConvolveCopy + dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>; +#endif +#ifndef 
LIBGAV1_Dsp12bpp_ConvolveHorizontal + dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveVertical + dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Convolve2D + dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundCopy + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundHorizontal + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>; #endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundVertical + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompound2D + dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockCopy + dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockHorizontal + dsp->convolve[1][0][0][1] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockVertical + dsp->convolve[1][0][1][0] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlock2D + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>; +#endif + + dsp->convolve[1][1][0][0] = nullptr; + dsp->convolve[1][1][0][1] = nullptr; + dsp->convolve[1][1][1][0] = nullptr; + dsp->convolve[1][1][1][1] = nullptr; + +#ifndef LIBGAV1_Dsp12bpp_ConvolveScale2D + dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundScale2D + dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -873,6 +959,9 @@ void ConvolveInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if 
LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/convolve.h b/libgav1/src/dsp/convolve.h index 5bc0bad..8780bfc 100644 --- a/libgav1/src/dsp/convolve.h +++ b/libgav1/src/dsp/convolve.h @@ -17,6 +17,8 @@ #ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_ #define LIBGAV1_SRC_DSP_CONVOLVE_H_ +#include <cassert> + // Pull in LIBGAV1_DspXXX defines representing the implementation status // of each function. The resulting value of each can be used by each module to // determine whether an implementation is needed at compile time. @@ -43,6 +45,35 @@ namespace dsp { // thread-safe. void ConvolveInit_C(); +inline int GetNumTapsInFilter(const int filter_index) { + if (filter_index < 2) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + return 6; + } + + if (filter_index == 2) { + // kInterpolationFilterEightTapSharp + return 8; + } + + if (filter_index == 3) { + // kInterpolationFilterBilinear + return 2; + } + + assert(filter_index > 3); + // For small sizes (width/height <= 4) the large filters are replaced with 4 + // tap options. + // If the original filters were |kInterpolationFilterEightTap| or + // |kInterpolationFilterEightTapSharp| then it becomes + // |kInterpolationFilterSwitchable|. + // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 + // tap filter. + return 4; +} + } // namespace dsp } // namespace libgav1 diff --git a/libgav1/src/dsp/convolve.inc b/libgav1/src/dsp/convolve.inc index e0f755e..2e0b270 100644 --- a/libgav1/src/dsp/convolve.inc +++ b/libgav1/src/dsp/convolve.inc @@ -12,39 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Constants and utility functions used for convolve implementations. +// Constants used for convolve implementations. // This will be included inside an anonymous namespace on files where these are // necessary. 
-int GetNumTapsInFilter(const int filter_index) { - if (filter_index < 2) { - // Despite the names these only use 6 taps. - // kInterpolationFilterEightTap - // kInterpolationFilterEightTapSmooth - return 6; - } - - if (filter_index == 2) { - // kInterpolationFilterEightTapSharp - return 8; - } - - if (filter_index == 3) { - // kInterpolationFilterBilinear - return 2; - } - - assert(filter_index > 3); - // For small sizes (width/height <= 4) the large filters are replaced with 4 - // tap options. - // If the original filters were |kInterpolationFilterEightTap| or - // |kInterpolationFilterEightTapSharp| then it becomes - // |kInterpolationFilterSwitchable|. - // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 - // tap filter. - return 4; -} - constexpr int kIntermediateAllocWidth = kMaxSuperBlockSizeInPixels; constexpr int kIntermediateStride = 8; constexpr int kHorizontalOffset = 3; diff --git a/libgav1/src/dsp/distance_weighted_blend.cc b/libgav1/src/dsp/distance_weighted_blend.cc index 34d10fc..ef83235 100644 --- a/libgav1/src/dsp/distance_weighted_blend.cc +++ b/libgav1/src/dsp/distance_weighted_blend.cc @@ -88,7 +88,22 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_DistanceWeightedBlend + dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -97,6 +112,9 @@ void DistanceWeightedBlendInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // 
namespace dsp diff --git a/libgav1/src/dsp/dsp.cc b/libgav1/src/dsp/dsp.cc index aac0ca0..97a064f 100644 --- a/libgav1/src/dsp/dsp.cc +++ b/libgav1/src/dsp/dsp.cc @@ -78,6 +78,12 @@ dsp::Dsp* GetWritableDspTable(int bitdepth) { return &dsp_10bpp; } #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: { + static dsp::Dsp dsp_12bpp; + return &dsp_12bpp; + } +#endif } return nullptr; } @@ -157,6 +163,7 @@ void DspInit() { #if LIBGAV1_MAX_BITDEPTH >= 10 ConvolveInit10bpp_NEON(); InverseTransformInit10bpp_NEON(); + LoopFilterInit10bpp_NEON(); LoopRestorationInit10bpp_NEON(); #endif // LIBGAV1_MAX_BITDEPTH >= 10 #endif // LIBGAV1_ENABLE_NEON diff --git a/libgav1/src/dsp/film_grain.cc b/libgav1/src/dsp/film_grain.cc index fa12b69..906230d 100644 --- a/libgav1/src/dsp/film_grain.cc +++ b/libgav1/src/dsp/film_grain.cc @@ -19,17 +19,16 @@ #include <cstddef> #include <cstdint> #include <cstring> -#include <new> -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" #include "src/utils/array_2d.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" #include "src/utils/memory.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -45,7 +44,7 @@ void InitializeScalingLookupTable_C(int num_points, const uint8_t point_value[], memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length); return; } - constexpr int index_shift = bitdepth - kBitdepth8; + constexpr int index_shift = (bitdepth == kBitdepth10) ? 
2 : 0; static_assert(sizeof(scaling_lut[0]) == 2, ""); Memset(scaling_lut, point_scaling[0], std::max(static_cast<int>(point_value[0]), 1) << index_shift); @@ -866,6 +865,121 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + + // LumaAutoRegressionFunc + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + + // ChromaAutoRegressionFunc + // Chroma autoregression should never be called when lag is 0 and use_luma is + // false. + dsp->film_grain.chroma_auto_regression[0][0] = nullptr; + dsp->film_grain.chroma_auto_regression[0][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>; + dsp->film_grain.chroma_auto_regression[0][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>; + dsp->film_grain.chroma_auto_regression[0][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>; + dsp->film_grain.chroma_auto_regression[1][0] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>; + dsp->film_grain.chroma_auto_regression[1][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>; + dsp->film_grain.chroma_auto_regression[1][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>; + dsp->film_grain.chroma_auto_regression[1][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>; + + // ConstructNoiseStripesFunc + dsp->film_grain.construct_noise_stripes[0] = + ConstructNoiseStripes_C<kBitdepth12, int16_t>; + dsp->film_grain.construct_noise_stripes[1] = 
+ ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>; + + // ConstructNoiseImageOverlapFunc + dsp->film_grain.construct_noise_image_overlap = + ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>; + + // InitializeScalingLutFunc + dsp->film_grain.initialize_scaling_lut = + InitializeScalingLookupTable_C<kBitdepth12>; + + // BlendNoiseWithImageLumaFunc + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>; + + // BlendNoiseWithImageChromaFunc + dsp->film_grain.blend_noise_chroma[0] = + BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionLuma + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionChroma + // Chroma autoregression should never be called when lag is 0 and use_luma is + // false. 
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr; + dsp->film_grain.chroma_auto_regression[0][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>; + dsp->film_grain.chroma_auto_regression[0][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>; + dsp->film_grain.chroma_auto_regression[0][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>; + dsp->film_grain.chroma_auto_regression[1][0] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>; + dsp->film_grain.chroma_auto_regression[1][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>; + dsp->film_grain.chroma_auto_regression[1][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>; + dsp->film_grain.chroma_auto_regression[1][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseStripes + dsp->film_grain.construct_noise_stripes[0] = + ConstructNoiseStripes_C<kBitdepth12, int16_t>; + dsp->film_grain.construct_noise_stripes[1] = + ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseImageOverlap + dsp->film_grain.construct_noise_image_overlap = + ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainInitializeScalingLutFunc + dsp->film_grain.initialize_scaling_lut = + InitializeScalingLookupTable_C<kBitdepth12>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseLuma + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChroma + dsp->film_grain.blend_noise_chroma[0] = + BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChromaWithCfl + dsp->film_grain.blend_noise_chroma[1] = + 
BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace film_grain @@ -874,6 +988,9 @@ void FilmGrainInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 film_grain::Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + film_grain::Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/film_grain_common.h b/libgav1/src/dsp/film_grain_common.h index 2e6ad45..3c8d761 100644 --- a/libgav1/src/dsp/film_grain_common.h +++ b/libgav1/src/dsp/film_grain_common.h @@ -17,15 +17,7 @@ #ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_ #define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_ -#include <cstddef> #include <cstdint> -#include <memory> -#include <type_traits> - -#include "src/dsp/common.h" -#include "src/utils/array_2d.h" -#include "src/utils/constants.h" -#include "src/utils/cpu.h" namespace libgav1 { diff --git a/libgav1/src/dsp/intra_edge.cc b/libgav1/src/dsp/intra_edge.cc index fe66db2..9875ef1 100644 --- a/libgav1/src/dsp/intra_edge.cc +++ b/libgav1/src/dsp/intra_edge.cc @@ -100,7 +100,26 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_IntraEdgeFilter + dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_IntraEdgeUpsampler + dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -109,6 +128,9 @@ void IntraEdgeInit_C() { #if 
LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/intrapred.cc b/libgav1/src/dsp/intrapred.cc index 75af279..3162acc 100644 --- a/libgav1/src/dsp/intrapred.cc +++ b/libgav1/src/dsp/intrapred.cc @@ -1422,6 +1422,551 @@ void Init10bpp() { } // NOLINT(readability/fn_size) #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using Defs12bpp = IntraPredBppDefs<12, uint16_t>; + +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_INTRAPREDICTORS(DefsHbd, Defs12bpp); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] = + Defs12bpp::_4x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DefsHbd::_4x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DefsHbd::_4x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DefsHbd::_4x4::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] = + DefsHbd::_4x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] = + DefsHbd::_4x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + DefsHbd::_4x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcFill + 
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = + Defs12bpp::_4x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DefsHbd::_4x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DefsHbd::_4x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DefsHbd::_4x8::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + DefsHbd::_4x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + DefsHbd::_4x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + DefsHbd::_4x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = + Defs12bpp::_4x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DefsHbd::_4x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DefsHbd::_4x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DefsHbd::_4x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + DefsHbd::_4x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorHorizontal + 
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + DefsHbd::_4x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + DefsHbd::_4x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = + Defs12bpp::_8x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DefsHbd::_8x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DefsHbd::_8x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DefsHbd::_8x4::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + DefsHbd::_8x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + DefsHbd::_8x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + DefsHbd::_8x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = + Defs12bpp::_8x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DefsHbd::_8x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DefsHbd::_8x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDc + 
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DefsHbd::_8x8::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + DefsHbd::_8x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DefsHbd::_8x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + DefsHbd::_8x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = + Defs12bpp::_8x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DefsHbd::_8x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DefsHbd::_8x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DefsHbd::_8x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + DefsHbd::_8x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + DefsHbd::_8x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + DefsHbd::_8x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = + Defs12bpp::_8x32::DcFill; +#endif +#ifndef 
LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DefsHbd::_8x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DefsHbd::_8x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DefsHbd::_8x32::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + DefsHbd::_8x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + DefsHbd::_8x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + DefsHbd::_8x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = + Defs12bpp::_16x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DefsHbd::_16x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DefsHbd::_16x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DefsHbd::_16x4::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + DefsHbd::_16x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + 
DefsHbd::_16x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + DefsHbd::_16x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = + Defs12bpp::_16x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DefsHbd::_16x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DefsHbd::_16x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DefsHbd::_16x8::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + DefsHbd::_16x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + DefsHbd::_16x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + DefsHbd::_16x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = + Defs12bpp::_16x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DefsHbd::_16x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DefsHbd::_16x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDc + 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DefsHbd::_16x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + DefsHbd::_16x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + DefsHbd::_16x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + DefsHbd::_16x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = + Defs12bpp::_16x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DefsHbd::_16x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DefsHbd::_16x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DefsHbd::_16x32::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + DefsHbd::_16x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + DefsHbd::_16x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + DefsHbd::_16x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = + Defs12bpp::_16x64::DcFill; +#endif 
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DefsHbd::_16x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DefsHbd::_16x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DefsHbd::_16x64::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + DefsHbd::_16x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] = + DefsHbd::_16x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + DefsHbd::_16x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = + Defs12bpp::_32x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DefsHbd::_32x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DefsHbd::_32x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DefsHbd::_32x8::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + DefsHbd::_32x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] 
= + DefsHbd::_32x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + DefsHbd::_32x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = + Defs12bpp::_32x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DefsHbd::_32x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DefsHbd::_32x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DefsHbd::_32x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + DefsHbd::_32x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] = + DefsHbd::_32x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + DefsHbd::_32x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = + Defs12bpp::_32x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DefsHbd::_32x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DefsHbd::_32x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDc + 
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DefsHbd::_32x32::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + DefsHbd::_32x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] = + DefsHbd::_32x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + DefsHbd::_32x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = + Defs12bpp::_32x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DefsHbd::_32x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DefsHbd::_32x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DefsHbd::_32x64::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + DefsHbd::_32x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + DefsHbd::_32x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + DefsHbd::_32x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = + Defs12bpp::_64x16::DcFill; +#endif 
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DefsHbd::_64x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DefsHbd::_64x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DefsHbd::_64x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + DefsHbd::_64x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] = + DefsHbd::_64x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + DefsHbd::_64x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = + Defs12bpp::_64x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DefsHbd::_64x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DefsHbd::_64x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DefsHbd::_64x32::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + DefsHbd::_64x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorHorizontal + 
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] = + DefsHbd::_64x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + DefsHbd::_64x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = + Defs12bpp::_64x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DefsHbd::_64x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DefsHbd::_64x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DefsHbd::_64x64::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + DefsHbd::_64x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] = + DefsHbd::_64x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + DefsHbd::_64x64::Paeth; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH == 12 + #undef INIT_INTRAPREDICTORS_WxH #undef INIT_INTRAPREDICTORS } // namespace @@ -1431,6 +1976,9 @@ void IntraPredInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/intrapred_cfl.cc b/libgav1/src/dsp/intrapred_cfl.cc index 0f7f4f2..798bb73 100644 --- a/libgav1/src/dsp/intrapred_cfl.cc +++ 
b/libgav1/src/dsp/intrapred_cfl.cc @@ -639,6 +639,263 @@ void Init10bpp() { } // NOLINT(readability/fn_size) #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_CFL_INTRAPREDICTORS(12, uint16_t); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 12, uint16_t>; +#endif +#ifndef 
LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflIntraPredictor + 
dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler420 + 
dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_C<16, 8, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler422 + 
dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 12, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler444 + 
dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 12, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 12, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 12, uint16_t, 1, 1>; +#endif + +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH == 12 + #undef INIT_CFL_INTRAPREDICTOR_WxH #undef INIT_CFL_INTRAPREDICTORS @@ -649,6 +906,9 @@ void IntraPredCflInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/intrapred_directional.cc b/libgav1/src/dsp/intrapred_directional.cc index 21a40b5..9146074 100644 --- a/libgav1/src/dsp/intrapred_directional.cc +++ b/libgav1/src/dsp/intrapred_directional.cc @@ -94,11 +94,19 @@ void DirectionalIntraPredictorZone1_C( } while (++y < height); } +// clang 14.0.0 produces incorrect code with LIBGAV1_RESTRICT. 
+// https://github.com/llvm/llvm-project/issues/54427 +#if defined(__clang__) && __clang_major__ == 14 +#define LOCAL_RESTRICT +#else +#define LOCAL_RESTRICT LIBGAV1_RESTRICT +#endif + template <typename Pixel> void DirectionalIntraPredictorZone2_C( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column, const int width, + void* LOCAL_RESTRICT const dest, ptrdiff_t stride, + const void* LOCAL_RESTRICT const top_row, + const void* LOCAL_RESTRICT const left_column, const int width, const int height, const int xstep, const int ystep, const bool upsampled_top, const bool upsampled_left) { const auto* const top = static_cast<const Pixel*>(top_row); @@ -143,6 +151,8 @@ void DirectionalIntraPredictorZone2_C( } while (++y < height); } +#undef LOCAL_RESTRICT + template <typename Pixel> void DirectionalIntraPredictorZone3_C( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, @@ -236,6 +246,34 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint16_t>; +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone1 + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone2 + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone3 + dsp->directional_intra_predictor_zone3 = + 
DirectionalIntraPredictorZone3_C<uint16_t>; +#endif +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void IntraPredDirectionalInit_C() { @@ -243,6 +281,9 @@ void IntraPredDirectionalInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/intrapred_filter.cc b/libgav1/src/dsp/intrapred_filter.cc index 9a45eff..2d183cf 100644 --- a/libgav1/src/dsp/intrapred_filter.cc +++ b/libgav1/src/dsp/intrapred_filter.cc @@ -131,6 +131,21 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void IntraPredFilterInit_C() { @@ -138,6 +153,9 @@ void IntraPredFilterInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/intrapred_smooth.cc b/libgav1/src/dsp/intrapred_smooth.cc index 0c7f272..16b8274 100644 --- a/libgav1/src/dsp/intrapred_smooth.cc +++ b/libgav1/src/dsp/intrapred_smooth.cc @@ -714,6 +714,266 @@ void Init10bpp() { } // NOLINT(readability/fn_size) #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using DefsHbd = SmoothDefs<uint16_t>; + +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_SMOOTH(DefsHbd); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef 
LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + DefsHbd::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + DefsHbd::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + DefsHbd::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + DefsHbd::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + DefsHbd::_4x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + DefsHbd::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + DefsHbd::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + DefsHbd::_8x4::SmoothVertical; 
+#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + DefsHbd::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + DefsHbd::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + DefsHbd::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + DefsHbd::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + DefsHbd::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + DefsHbd::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmooth + 
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + DefsHbd::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + DefsHbd::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + DefsHbd::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + DefsHbd::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + DefsHbd::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + DefsHbd::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + DefsHbd::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + DefsHbd::_16x32::SmoothVertical; +#endif +#ifndef 
LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + DefsHbd::_16x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + DefsHbd::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + DefsHbd::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + DefsHbd::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + DefsHbd::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + DefsHbd::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmooth + 
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + DefsHbd::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + DefsHbd::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + DefsHbd::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + DefsHbd::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + DefsHbd::_64x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + DefsHbd::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + DefsHbd::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + DefsHbd::_64x32::SmoothVertical; 
+#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + DefsHbd::_64x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + DefsHbd::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x64::SmoothHorizontal; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH == 12 + #undef INIT_SMOOTH_WxH #undef INIT_SMOOTH } // namespace @@ -723,6 +983,9 @@ void IntraPredSmoothInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/intrapred_smooth.h b/libgav1/src/dsp/intrapred_smooth.h index 6802003..06454af 100644 --- a/libgav1/src/dsp/intrapred_smooth.h +++ b/libgav1/src/dsp/intrapred_smooth.h @@ -38,6 +38,12 @@ namespace libgav1 { namespace dsp { +enum { + // Weights are quadratic from '1' to '1 / block_size', scaled by + // 2^kSmoothWeightScale. + kSmoothWeightScale = 8, +}; + // Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. // This function is not thread-safe. 
void IntraPredSmoothInit_C(); diff --git a/libgav1/src/dsp/inverse_transform.cc b/libgav1/src/dsp/inverse_transform.cc index 1b0064f..0bbdffa 100644 --- a/libgav1/src/dsp/inverse_transform.cc +++ b/libgav1/src/dsp/inverse_transform.cc @@ -18,6 +18,7 @@ #include <cassert> #include <cstdint> #include <cstring> +#include <type_traits> #include "src/dsp/dsp.h" #include "src/utils/array_2d.h" @@ -25,6 +26,15 @@ #include "src/utils/compiler_attributes.h" #include "src/utils/logging.h" +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK +#endif + +#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \ + LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK +#include <cinttypes> +#endif + namespace libgav1 { namespace dsp { namespace { @@ -34,24 +44,25 @@ namespace { constexpr uint8_t kTransformColumnShift = 4; -#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) -#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK -#endif - -int32_t RangeCheckValue(int32_t value, int8_t range) { +template <typename T> +int32_t RangeCheckValue(T value, int8_t range) { #if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \ LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK + static_assert( + std::is_same<T, int32_t>::value || std::is_same<T, std::int64_t>::value, + ""); assert(range <= 32); const auto min = static_cast<int32_t>(-(uint32_t{1} << (range - 1))); const auto max = static_cast<int32_t>((uint32_t{1} << (range - 1)) - 1); if (min > value || value > max) { - LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n", - value, range); + LIBGAV1_DLOG(ERROR, + "coeff out of bit range, value: %" PRId64 " bit range %d", + static_cast<int64_t>(value), range); assert(min <= value && value <= max); } #endif // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK static_cast<void>(range); - return value; + return static_cast<int32_t>(value); } template <typename Residual> @@ -433,7 +444,13 @@ void Adst4_C(void* dest, int8_t range) { // Section 7.13.2.6: It is a requirement of 
bitstream conformance that all // values stored in the s and x arrays by this process are representable by // a signed integer using range + 12 bits of precision. - int32_t s[7]; + // Note the intermediate value can only exceed INT32_MAX with invalid 12-bit + // content. For simplicity in unoptimized code, int64_t is used for both 10 & + // 12-bit. SIMD implementations can allow these to rollover on platforms + // where this has defined behavior. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; + Intermediate s[7]; s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12); s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12); s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12); @@ -454,19 +471,23 @@ void Adst4_C(void* dest, int8_t range) { s[0] = RangeCheckValue(s[0] + s[3], range + 12); s[1] = RangeCheckValue(s[1] - s[4], range + 12); s[3] = s[2]; - s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12); + // With range checking enabled b7 would be trapped above. This prevents an + // integer sanitizer warning. In SIMD implementations the multiply can be + // allowed to rollover on platforms where this has defined behavior. + const auto adst2_b7 = static_cast<Intermediate>(kAdst4Multiplier[2]) * b7; + s[2] = RangeCheckValue(adst2_b7, range + 12); // stage 4. s[0] = RangeCheckValue(s[0] + s[5], range + 12); s[1] = RangeCheckValue(s[1] - s[6], range + 12); // stages 5 and 6. 
- const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12); - const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12); - int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12); + const Intermediate x0 = RangeCheckValue(s[0] + s[3], range + 12); + const Intermediate x1 = RangeCheckValue(s[1] + s[3], range + 12); + Intermediate x3 = RangeCheckValue(s[0] + s[1], range + 12); x3 = RangeCheckValue(x3 - s[3], range + 12); - int32_t dst_0 = RightShiftWithRounding(x0, 12); - int32_t dst_1 = RightShiftWithRounding(x1, 12); - int32_t dst_2 = RightShiftWithRounding(s[2], 12); - int32_t dst_3 = RightShiftWithRounding(x3, 12); + auto dst_0 = static_cast<int32_t>(RightShiftWithRounding(x0, 12)); + auto dst_1 = static_cast<int32_t>(RightShiftWithRounding(x1, 12)); + auto dst_2 = static_cast<int32_t>(RightShiftWithRounding(s[2], 12)); + auto dst_3 = static_cast<int32_t>(RightShiftWithRounding(x3, 12)); if (sizeof(Residual) == 2) { // If the first argument to RightShiftWithRounding(..., 12) is only // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it @@ -840,6 +861,10 @@ void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift, template <typename Residual> void Identity4Row_C(void* dest, int8_t shift) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; assert(shift == 0 || shift == 1); auto* const dst = static_cast<Residual*>(dest); // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding| @@ -847,10 +872,10 @@ void Identity4Row_C(void* dest, int8_t shift) { // values of |shift|. const int32_t rounding = (1 + (shift << 1)) << 11; for (int i = 0; i < 4; ++i) { - // The intermediate value here will have to fit into an int32_t for it to be - // bitstream conformant. 
The multiplication is promoted to int32_t by - // defining kIdentity4Multiplier as int32_t. - int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift); + const auto intermediate = + static_cast<Intermediate>(dst[i]) * kIdentity4Multiplier; + int32_t dst_i = + static_cast<int32_t>((intermediate + rounding) >> (12 + shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -874,16 +899,24 @@ void Identity4Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } const int32_t rounding = (1 + (row_shift << 1)) << 11; + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kIdentity4Multiplier; int32_t dst_i = - (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift); + static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -923,11 +956,17 @@ void Identity8Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. 
+ using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift); @@ -954,13 +993,19 @@ void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round, template <typename Residual> void Identity16Row_C(void* dest, int8_t shift) { assert(shift == 1 || shift == 2); + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); const int32_t rounding = (1 + (1 << shift)) << 11; for (int i = 0; i < 16; ++i) { - // The intermediate value here will have to fit into an int32_t for it to be - // bitstream conformant. The multiplication is promoted to int32_t by - // defining kIdentity16Multiplier as int32_t. - int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift); + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for all cases. 
+ const auto intermediate = + static_cast<Intermediate>(dst[i]) * kIdentity16Multiplier; + int32_t dst_i = + static_cast<int32_t>((intermediate + rounding) >> (12 + shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -985,16 +1030,24 @@ void Identity16Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } const int32_t rounding = (1 + (1 << row_shift)) << 11; + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kIdentity16Multiplier; int32_t dst_i = - (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift); + static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -1034,11 +1087,17 @@ void Identity32Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. 
+ using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift); @@ -1612,6 +1671,148 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + InitAll<12, int32_t, uint16_t>(dsp); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 4>, 
Dct_C<int32_t, 4>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize64_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = + 
TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity4DcOnly_C<12, int32_t>, Identity4Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity4DcOnly_C<12, int32_t>, + Identity4Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity8DcOnly_C<12, int32_t>, Identity8Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity8DcOnly_C<12, int32_t>, + Identity8Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity16DcOnly_C<12, int32_t>, Identity16Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = + 
TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity16DcOnly_C<12, int32_t>, + Identity16Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity32DcOnly_C<12, int32_t>, Identity32Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity32DcOnly_C<12, int32_t>, + Identity32Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dWht + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht, + Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht, + Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>, + /*is_row=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void InverseTransformInit_C() { @@ -1619,10 +1820,12 @@ void InverseTransformInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif // Local functions that may be unused depending on the optimizations // available. 
- static_cast<void>(RangeCheckValue); static_cast<void>(kBitReverseLookup); } diff --git a/libgav1/src/dsp/libgav1_dsp.cmake b/libgav1/src/dsp/libgav1_dsp.cmake index 4bd1443..fedb35b 100644 --- a/libgav1/src/dsp/libgav1_dsp.cmake +++ b/libgav1/src/dsp/libgav1_dsp.cmake @@ -113,6 +113,7 @@ list(APPEND libgav1_dsp_sources_neon "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.h" + "${libgav1_source}/dsp/arm/loop_filter_10bit_neon.cc" "${libgav1_source}/dsp/arm/loop_filter_neon.cc" "${libgav1_source}/dsp/arm/loop_filter_neon.h" "${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc" diff --git a/libgav1/src/dsp/loop_filter.cc b/libgav1/src/dsp/loop_filter.cc index 14d47bf..bb0583f 100644 --- a/libgav1/src/dsp/loop_filter.cc +++ b/libgav1/src/dsp/loop_filter.cc @@ -603,6 +603,73 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using Defs12bpp = LoopFilterFuncs_C<12, uint16_t>; + +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal4; + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = + Defs12bpp::Vertical4; + + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal6; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = + Defs12bpp::Vertical6; + + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal8; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = + Defs12bpp::Vertical8; + + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal14; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Defs12bpp::Vertical14; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + 
static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal4; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = + Defs12bpp::Vertical4; +#endif + +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal6; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = + Defs12bpp::Vertical6; +#endif + +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal8; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = + Defs12bpp::Vertical8; +#endif + +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal14; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Defs12bpp::Vertical14; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void LoopFilterInit_C() { @@ -610,6 +677,9 @@ void LoopFilterInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif // Local functions that may be unused depending on the optimizations // available. 
static_cast<void>(AdjustThresholds); diff --git a/libgav1/src/dsp/loop_restoration.cc b/libgav1/src/dsp/loop_restoration.cc index 2301a3e..eb8052c 100644 --- a/libgav1/src/dsp/loop_restoration.cc +++ b/libgav1/src/dsp/loop_restoration.cc @@ -922,7 +922,6 @@ void Init8bpp() { } #if LIBGAV1_MAX_BITDEPTH >= 10 - void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(10); assert(dsp != nullptr); @@ -939,8 +938,27 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } - #endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>; + dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_WienerFilter + dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_SelfGuidedFilter + dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void LoopRestorationInit_C() { @@ -948,6 +966,9 @@ void LoopRestorationInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/loop_restoration.h b/libgav1/src/dsp/loop_restoration.h index de80926..8fefc40 100644 --- a/libgav1/src/dsp/loop_restoration.h +++ b/libgav1/src/dsp/loop_restoration.h @@ -39,16 +39,6 @@ namespace libgav1 { namespace dsp { -enum { - // Precision of a division table (mtable) - kSgrProjScaleBits = 20, - kSgrProjReciprocalBits = 12, - // Core self-guided restoration precision bits. - kSgrProjSgrBits = 8, - // Precision bits of generated values higher than source before projection. 
- kSgrProjRestoreBits = 4 -}; // anonymous enum - extern const uint8_t kSgrMaLookup[256]; // Initializes Dsp::loop_restorations. This function is not thread-safe. diff --git a/libgav1/src/dsp/mask_blend.cc b/libgav1/src/dsp/mask_blend.cc index 207fde0..34d7fe8 100644 --- a/libgav1/src/dsp/mask_blend.cc +++ b/libgav1/src/dsp/mask_blend.cc @@ -197,7 +197,50 @@ void Init10bpp() { dsp->inter_intra_mask_blend_8bpp[2] = nullptr; #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>; + dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>; + dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>; + dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>; + dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>; + dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>; + // These are only used with 8-bit. 
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr; + dsp->inter_intra_mask_blend_8bpp[1] = nullptr; + dsp->inter_intra_mask_blend_8bpp[2] = nullptr; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_MaskBlend444 + dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlend422 + dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlend420 + dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>; #endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra444 + dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra422 + dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra420 + dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>; +#endif + // These are only used with 8-bit. + dsp->inter_intra_mask_blend_8bpp[0] = nullptr; + dsp->inter_intra_mask_blend_8bpp[1] = nullptr; + dsp->inter_intra_mask_blend_8bpp[2] = nullptr; +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -206,6 +249,9 @@ void MaskBlendInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/obmc.cc b/libgav1/src/dsp/obmc.cc index 6b5c6e3..479cb1d 100644 --- a/libgav1/src/dsp/obmc.cc +++ b/libgav1/src/dsp/obmc.cc @@ -116,7 +116,28 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>; + dsp->obmc_blend[kObmcDirectionHorizontal] = + OverlapBlendHorizontal_C<uint16_t>; 
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_ObmcVertical + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ObmcHorizontal + dsp->obmc_blend[kObmcDirectionHorizontal] = + OverlapBlendHorizontal_C<uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -125,6 +146,9 @@ void ObmcInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/super_res.cc b/libgav1/src/dsp/super_res.cc index 570ba73..7593729 100644 --- a/libgav1/src/dsp/super_res.cc +++ b/libgav1/src/dsp/super_res.cc @@ -95,7 +95,23 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); + dsp->super_res_coefficients = nullptr; +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->super_res = SuperRes_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_SuperRes + dsp->super_res = SuperRes_C<12, uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -104,6 +120,9 @@ void SuperResInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/warp.cc b/libgav1/src/dsp/warp.cc index dd467ea..f62f1ed 100644 --- a/libgav1/src/dsp/warp.cc +++ b/libgav1/src/dsp/warp.cc @@ -111,14 +111,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, start_x += 8) { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + 
src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's @@ -172,22 +166,24 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const Pixel* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 
0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const Pixel row_border_pixel = first_row_border[row * source_stride]; DestType* dst_row = dst + start_x - block_start_x; if (is_compound) { @@ -220,15 +216,15 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved below. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * source_stride]; sum <<= kFilterBits - kRoundBitsHorizontal; intermediate_result_column[y + 7] = sum; } // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); for (int x = 0; x < 8; ++x) { @@ -269,12 +265,14 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 
0 : source_height - 1; const Pixel* const src_row = src + row * source_stride; - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { int sx = sx4 - MultiplyBy4(alpha); for (int x = -4; x < 4; ++x) { @@ -300,7 +298,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // -13 <= column <= (source_width - 1) + 13. // Therefore we may over-read up to 13 pixels before the source // row, or up to 13 pixels after the source row. - const int column = ix4 + x + k - 3; + const int column = filter_params.ix4 + x + k - 3; sum += kWarpedFilters[offset][k] * src_row[column]; } intermediate_result[y + 7][x + 4] = @@ -315,7 +313,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0. // It follows that -6 <= iy4 <= source_height + 5. This inequality is // used below. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We assume the source frame has top and bottom borders of at least // 13 pixels that extend the frame boundary pixels. @@ -326,7 +325,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // -13 <= row <= (source_height - 1) + 13. // Therefore we may over-read up to 13 pixels above the top source // row, or up to 13 pixels below the bottom source row. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const Pixel* const src_row = src + row * source_stride; int sx = sx4 - MultiplyBy4(alpha); for (int x = -4; x < 4; ++x) { @@ -352,7 +351,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // -13 <= column <= (source_width - 1) + 13. 
// Therefore we may over-read up to 13 pixels before the source // row, or up to 13 pixels after the source row. - const int column = ix4 + x + k - 3; + const int column = filter_params.ix4 + x + k - 3; sum += kWarpedFilters[offset][k] * src_row[column]; } intermediate_result[y + 7][x + 4] = @@ -367,8 +366,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // Regions 3 and 4. // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); // The spec says we should use the following loop condition: // y < std::min(4, block_start_y + block_height - start_y - 4); // We can prove that block_start_y + block_height - start_y >= 8, which @@ -460,7 +459,26 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>; + dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_Warp + dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>; #endif +#ifndef LIBGAV1_Dsp12bpp_WarpCompound + dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -469,6 +487,9 @@ void WarpInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/warp.h b/libgav1/src/dsp/warp.h index 7367a9b..9c20f12 100644 --- a/libgav1/src/dsp/warp.h +++ b/libgav1/src/dsp/warp.h @@ -38,9 
+38,39 @@ namespace libgav1 { namespace dsp { +// Section 7.11.3.5. +struct WarpFilterParams { + int64_t x4; + int64_t y4; + int ix4; + int iy4; +}; + // Initializes Dsp::warp. This function is not thread-safe. void WarpInit_C(); +// Section 7.11.3.5. +inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y, + int subsampling_x, + int subsampling_y, + const int* warp_params) { + WarpFilterParams filter_params; + // warp_params[2]/[5] require 17 bits (the others 14). With large resolutions + // the result of the multiplication will require 33. + const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] + + src_y * warp_params[3] + warp_params[0]; + const int64_t dst_y = src_x * warp_params[4] + + static_cast<int64_t>(src_y) * warp_params[5] + + warp_params[1]; + filter_params.x4 = dst_x >> subsampling_x; + filter_params.y4 = dst_y >> subsampling_y; + filter_params.ix4 = + static_cast<int>(filter_params.x4 >> kWarpedModelPrecisionBits); + filter_params.iy4 = + static_cast<int>(filter_params.y4 >> kWarpedModelPrecisionBits); + return filter_params; +} + } // namespace dsp } // namespace libgav1 diff --git a/libgav1/src/dsp/weight_mask.cc b/libgav1/src/dsp/weight_mask.cc index 41f4c70..ee3808b 100644 --- a/libgav1/src/dsp/weight_mask.cc +++ b/libgav1/src/dsp/weight_mask.cc @@ -213,7 +213,86 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_WEIGHT_MASK(8, 8, 12, 0, 0); + INIT_WEIGHT_MASK(8, 16, 12, 0, 1); + INIT_WEIGHT_MASK(8, 32, 12, 0, 2); + INIT_WEIGHT_MASK(16, 8, 12, 1, 0); + INIT_WEIGHT_MASK(16, 16, 12, 1, 1); + INIT_WEIGHT_MASK(16, 32, 12, 1, 2); + INIT_WEIGHT_MASK(16, 64, 12, 1, 3); + INIT_WEIGHT_MASK(32, 8, 12, 2, 0); + INIT_WEIGHT_MASK(32, 16, 12, 2, 1); + INIT_WEIGHT_MASK(32, 32, 12, 2, 2); + 
INIT_WEIGHT_MASK(32, 64, 12, 2, 3); + INIT_WEIGHT_MASK(64, 16, 12, 3, 1); + INIT_WEIGHT_MASK(64, 32, 12, 3, 2); + INIT_WEIGHT_MASK(64, 64, 12, 3, 3); + INIT_WEIGHT_MASK(64, 128, 12, 3, 4); + INIT_WEIGHT_MASK(128, 64, 12, 4, 3); + INIT_WEIGHT_MASK(128, 128, 12, 4, 4); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x8 + INIT_WEIGHT_MASK(8, 8, 12, 0, 0); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x16 + INIT_WEIGHT_MASK(8, 16, 12, 0, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x32 + INIT_WEIGHT_MASK(8, 32, 12, 0, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x8 + INIT_WEIGHT_MASK(16, 8, 12, 1, 0); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x16 + INIT_WEIGHT_MASK(16, 16, 12, 1, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x32 + INIT_WEIGHT_MASK(16, 32, 12, 1, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x64 + INIT_WEIGHT_MASK(16, 64, 12, 1, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x8 + INIT_WEIGHT_MASK(32, 8, 12, 2, 0); #endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x16 + INIT_WEIGHT_MASK(32, 16, 12, 2, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x32 + INIT_WEIGHT_MASK(32, 32, 12, 2, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x64 + INIT_WEIGHT_MASK(32, 64, 12, 2, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x16 + INIT_WEIGHT_MASK(64, 16, 12, 3, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x32 + INIT_WEIGHT_MASK(64, 32, 12, 3, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x64 + INIT_WEIGHT_MASK(64, 64, 12, 3, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x128 + INIT_WEIGHT_MASK(64, 128, 12, 3, 4); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x64 + INIT_WEIGHT_MASK(128, 64, 12, 4, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x128 + INIT_WEIGHT_MASK(128, 128, 12, 4, 4); +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -222,6 +301,9 @@ void WeightMaskInit_C() { #if 
LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/libgav1/src/dsp/x86/average_blend_sse4.cc b/libgav1/src/dsp/x86/average_blend_sse4.cc index 911c5a9..c08b3d6 100644 --- a/libgav1/src/dsp/x86/average_blend_sse4.cc +++ b/libgav1/src/dsp/x86/average_blend_sse4.cc @@ -35,24 +35,46 @@ namespace { constexpr int kInterPostRoundBit = 4; -inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT dest) { - const __m128i pred_0 = LoadLo8(prediction_0); - const __m128i pred_1 = LoadLo8(prediction_1); - __m128i res = _mm_add_epi16(pred_0, pred_1); - res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1); - Store4(dest, _mm_packus_epi16(res, res)); +inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + __m128i res_0 = _mm_add_epi16(pred_00, pred_10); + res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1); + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + __m128i res_1 = _mm_add_epi16(pred_01, pred_11); + res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1); + const __m128i result_pixels = _mm_packus_epi16(res_0, res_1); + Store4(dest, result_pixels); + dest += dest_stride; + const int result_1 = _mm_extract_epi32(result_pixels, 1); + memcpy(dest, &result_1, sizeof(result_1)); + dest += dest_stride; + const int result_2 = _mm_extract_epi32(result_pixels, 2); + memcpy(dest, &result_2, sizeof(result_2)); + dest += dest_stride; + const int result_3 = _mm_extract_epi32(result_pixels, 3); + memcpy(dest, &result_3, sizeof(result_3)); } inline void 
AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0, const int16_t* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT dest) { - const __m128i pred_0 = LoadAligned16(prediction_0); - const __m128i pred_1 = LoadAligned16(prediction_1); - __m128i res = _mm_add_epi16(pred_0, pred_1); - res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1); - StoreLo8(dest, _mm_packus_epi16(res, res)); + uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + __m128i res_0 = _mm_add_epi16(pred_00, pred_10); + res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1); + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + __m128i res_1 = _mm_add_epi16(pred_01, pred_11); + res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1); + const __m128i result_pixels = _mm_packus_epi16(res_0, res_1); + StoreLo8(dest, result_pixels); + StoreHi8(dest + dest_stride, result_pixels); } inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0, @@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, int y = height; if (width == 4) { + const ptrdiff_t dest_stride4 = dest_stride << 2; + constexpr ptrdiff_t width4 = 4 << 2; do { - // TODO(b/150326556): |prediction_[01]| values are packed. It is possible - // to load 8 values at a time. 
- AverageBlend4Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; - - AverageBlend4Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; + AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride); + dst += dest_stride4; + pred_0 += width4; + pred_1 += width4; - y -= 2; + y -= 4; } while (y != 0); return; } if (width == 8) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + constexpr ptrdiff_t width2 = 8 << 1; do { - AverageBlend8Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; - - AverageBlend8Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; + AverageBlend8Row(pred_0, pred_1, dst, dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; y -= 2; } while (y != 0); diff --git a/libgav1/src/dsp/x86/convolve_avx2.cc b/libgav1/src/dsp/x86/convolve_avx2.cc index 4126ca9..6e94347 100644 --- a/libgav1/src/dsp/x86/convolve_avx2.cc +++ b/libgav1/src/dsp/x86/convolve_avx2.cc @@ -39,17 +39,17 @@ namespace { // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final // sum from outranging int16_t. -template <int filter_index> +template <int num_taps> __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { __m256i sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1 const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3 const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5 sum = _mm256_add_epi16(v_madd_21, v_madd_43); sum = _mm256_add_epi16(sum, v_madd_65); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. 
const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0 const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2 @@ -58,7 +58,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32); const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76); sum = _mm256_add_epi16(v_sum_7654, v_sum_3210); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3 } else { @@ -70,7 +70,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { return sum; } -template <int filter_index> +template <int num_taps> __m256i SumHorizontalTaps(const __m256i* const src, const __m256i* const v_tap) { __m256i v_src[4]; @@ -78,32 +78,32 @@ __m256i SumHorizontalTaps(const __m256i* const src, const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long); const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long); - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21 v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65 - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10 v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76 - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 - } else if (filter_index > 3) { + } else { // 4 taps. 
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 } - return SumOnePassTaps<filter_index>(v_src, v_tap); + return SumOnePassTaps<num_taps>(v_src, v_tap); } -template <int filter_index> +template <int num_taps> __m256i SimpleHorizontalTaps(const __m256i* const src, const __m256i* const v_tap) { - __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap); + __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap); // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - @@ -116,17 +116,16 @@ __m256i SimpleHorizontalTaps(const __m256i* const src, return _mm256_packus_epi16(sum, sum); } -template <int filter_index> +template <int num_taps> __m256i HorizontalTaps8To16(const __m256i* const src, const __m256i* const v_tap) { - const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap); + const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } // Filter 2xh sizes. 
-template <int num_taps, int filter_index, bool is_2d = false, - bool is_compound = false> +template <int num_taps, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -145,14 +144,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, do { if (is_2d) { const __m128i sum = - HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap); Store4(&dest16[0], sum); dest16 += pred_stride; Store4(&dest16[0], _mm_srli_si128(sum, 8)); dest16 += pred_stride; } else { const __m128i sum = - SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); Store2(dest8, sum); dest8 += pred_stride; Store2(dest8, _mm_srli_si128(sum, 4)); @@ -169,7 +168,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, assert(height % 2 == 1); __m128i sum; const __m128i input = LoadLo8(&src[2]); - if (filter_index == 3) { + if (num_taps == 2) { // 03 04 04 05 05 06 06 07 .... const __m128i v_src_43 = _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); @@ -194,8 +193,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, } // Filter widths >= 4. 
-template <int num_taps, int filter_index, bool is_2d = false, - bool is_compound = false> +template <int num_taps, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -214,11 +212,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m256i src_long = SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8])); const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + HorizontalTaps8To16<num_taps>(&src_long, v_tap); const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]), LoadUnaligned16(&src[x + 24])); const __m256i result2 = - HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + HorizontalTaps8To16<num_taps>(&src_long2, v_tap); if (is_2d) { StoreAligned32(&dest16[x], result); StoreAligned32(&dest16[x + 16], result2); @@ -230,11 +228,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // Load src used to calculate dest8[7:0] and dest8[23:16]. const __m256i src_long = LoadUnaligned32(&src[x]); const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + SimpleHorizontalTaps<num_taps>(&src_long, v_tap); // Load src used to calculate dest8[15:8] and dest8[31:24]. const __m256i src_long2 = LoadUnaligned32(&src[x + 8]); const __m256i result2 = - SimpleHorizontalTaps<filter_index>(&src_long2, v_tap); + SimpleHorizontalTaps<num_taps>(&src_long2, v_tap); // Combine results and store. StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2)); } @@ -252,13 +250,12 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // Load into 2 128 bit lanes. 
const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[src_stride]), LoadUnaligned16(&src[8 + src_stride])); const __m256i result2 = - HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + HorizontalTaps8To16<num_taps>(&src_long2, v_tap); if (is_2d) { StoreAligned32(&dest16[0], result); StoreAligned32(&dest16[pred_stride], result2); @@ -270,12 +267,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[src_stride])); - const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap); const __m256i src_long2 = SetrM128i( LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride])); const __m256i result2 = - SimpleHorizontalTaps<filter_index>(&src_long2, v_tap); + SimpleHorizontalTaps<num_taps>(&src_long2, v_tap); const __m256i packed_result = _mm256_unpacklo_epi64(result, result2); StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result)); StoreUnaligned16(&dest8[pred_stride], @@ -292,8 +288,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, if (is_2d) { const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreAligned32(&dest16[0], result); } @@ -306,8 +301,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - const __m256i result = - 
HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); if (is_2d) { StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); StoreAligned16(&dest16[pred_stride], @@ -322,8 +316,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(this_row, next_row); - const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap); StoreLo8(&dest8[0], _mm256_castsi256_si128(result)); StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1)); } @@ -337,8 +330,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // filter the remaining row. if (is_2d) { const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); } @@ -351,8 +343,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1)); } else { @@ -360,8 +351,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); // Load into 2 128 bit lanes. 
const __m256i src_long = SetrM128i(this_row, next_row); - const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap); Store4(&dest8[0], _mm256_castsi256_si128(result)); Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1)); } @@ -375,8 +365,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // filter the remaining row. if (is_2d) { const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); } } @@ -554,18 +543,15 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH( const __m128i v_horizontal_filter = LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); - if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. + if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -582,28 +568,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. 
SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. 
SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -617,7 +600,8 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); // The output of the horizontal filter is guaranteed to fit in 16 bits. alignas(32) uint16_t @@ -730,61 +714,60 @@ __m256i Compound1DShift(const __m256i sum) { return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int filter_index, bool unpack_high = false> +template <int num_taps, bool unpack_high = false> __m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) { __m256i v_src[4]; if (!unpack_high) { - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); } } else { - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. 
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); } } - return SumOnePassTaps<filter_index>(v_src, v_tap); + return SumOnePassTaps<num_taps>(v_src, v_tap); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const __m256i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -821,9 +804,9 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row] = LoadUnaligned32(src_x); src_x += src_stride; - const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m256i sums_hi = - SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap); if (is_compound) { const __m256i results = Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); @@ -861,13 +844,12 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src, } while (x < width); } -template <int filter_index, bool 
is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int /*width*/, const int height, const __m256i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -922,9 +904,9 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row - 1] = _mm256_inserti128_si256( srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); - const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m256i sums_hi = - SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap); if (is_compound) { const __m256i results = Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); @@ -964,13 +946,12 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int /*width*/, const int height, const __m256i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -1025,7 +1006,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row - 1] = _mm256_inserti128_si256( srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); - const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const 
__m256i results = Compound1DShift(sums); const __m128i this_dst = _mm256_castsi256_si128(results); @@ -1062,13 +1043,12 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int /*width*/, const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -1101,7 +1081,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row] = LoadLo8(src_x); src_x += src_stride; - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -1137,7 +1117,8 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -1151,43 +1132,43 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference, // Use 256 bits for width > 4. if (width > 4) { __m256i taps_256[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. 
SetupTaps<6>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap. 
SetupTaps<4>(&v_filter, taps_256); if (width == 8) { FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height, @@ -1199,67 +1180,38 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference, FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else { - SetupTaps<4>(&v_filter, taps_256); - if (width == 8) { - FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height, - taps_256); - } else if (width == 16) { - FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height, - taps_256); - } else { - FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height, - taps_256); - } } } else { // width <= 8 // Use 128 bit code. __m128i taps[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, - taps); - } else { - FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, - taps); - } - } else if (filter_index == 4) { // 4 tap. 
- SetupTaps<4>(&v_filter, taps); - if (width == 2) { - FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); } - } else { + } else { // 4 tap. SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); } } } @@ -1272,7 +1224,8 @@ void ConvolveCompoundVertical_AVX2( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -1286,43 +1239,43 @@ void ConvolveCompoundVertical_AVX2( // Use 256 bits for width > 4. if (width > 4) { __m256i taps_256[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. 
SetupTaps<6>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<0, /*is_compound=*/true>( + FilterVertical8xH<6, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<0, /*is_compound=*/true>( + FilterVertical16xH<6, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<0, /*is_compound=*/true>( + FilterVertical32xH<6, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<2, /*is_compound=*/true>( + FilterVertical8xH<8, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<2, /*is_compound=*/true>( + FilterVertical16xH<8, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<2, /*is_compound=*/true>( + FilterVertical32xH<8, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<3, /*is_compound=*/true>( + FilterVertical8xH<2, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<3, /*is_compound=*/true>( + FilterVertical16xH<2, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<3, /*is_compound=*/true>( + FilterVertical32xH<2, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap. 
SetupTaps<4>(&v_filter, taps_256); if (width == 8) { FilterVertical8xH<4, /*is_compound=*/true>( @@ -1334,43 +1287,27 @@ void ConvolveCompoundVertical_AVX2( FilterVertical32xH<4, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else { - SetupTaps<4>(&v_filter, taps_256); - if (width == 8) { - FilterVertical8xH<5, /*is_compound=*/true>( - src, src_stride, dest, dest_stride, width, height, taps_256); - } else if (width == 16) { - FilterVertical16xH<5, /*is_compound=*/true>( - src, src_stride, dest, dest_stride, width, height, taps_256); - } else { - FilterVertical32xH<5, /*is_compound=*/true>( - src, src_stride, dest, dest_stride, width, height, taps_256); - } } } else { // width <= 4 // Use 128 bit code. __m128i taps[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); - FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else if (filter_index == 2) { // 8 tap. + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); - FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else if (filter_index == 3) { // 2 tap. + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); - FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_filter, taps); - FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else { + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else { // 4 tap. 
SetupTaps<4>(&v_filter, taps); - FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); } } } @@ -1430,7 +1367,8 @@ void ConvolveCompound2D_AVX2( void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); // The output of the horizontal filter is guaranteed to fit in 16 bits. alignas(32) uint16_t diff --git a/libgav1/src/dsp/x86/convolve_sse4.cc b/libgav1/src/dsp/x86/convolve_sse4.cc index f7e5a71..f427c4c 100644 --- a/libgav1/src/dsp/x86/convolve_sse4.cc +++ b/libgav1/src/dsp/x86/convolve_sse4.cc @@ -36,7 +36,7 @@ namespace { #include "src/dsp/x86/convolve_sse4.inc" -template <int filter_index> +template <int num_taps> __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { __m128i v_src[4]; @@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long); const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long); - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65 - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. 
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76 - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap); return sum; } -template <int filter_index> +template <int num_taps> __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap); + __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap); // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - @@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, return _mm_packus_epi16(sum, sum); } -template <int filter_index> +template <int num_taps> __m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { - const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap); + const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int num_taps, int filter_index, bool is_2d = false, - bool is_compound = false> +template <int num_taps, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -108,16 +107,14 @@ void FilterHorizontal(const 
uint8_t* LIBGAV1_RESTRICT src, int x = 0; do { if (is_2d || is_compound) { - const __m128i v_sum = - HorizontalTaps8To16<filter_index>(&src[x], v_tap); + const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap); if (is_2d) { StoreAligned16(&dest16[x], v_sum); } else { StoreUnaligned16(&dest16[x], v_sum); } } else { - const __m128i result = - SimpleHorizontalTaps<filter_index>(&src[x], v_tap); + const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap); StoreLo8(&dest8[x], result); } x += 8; @@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, int y = height; do { if (is_2d || is_compound) { - const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap); + const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap); StoreLo8(dest16, v_sum); } else { - const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap); + const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap); Store4(&dest8[0], result); } src += src_stride; @@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, do { if (is_2d) { const __m128i sum = - HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap); Store4(&dest16[0], sum); dest16 += pred_stride; Store4(&dest16[0], _mm_srli_si128(sum, 8)); dest16 += pred_stride; } else { const __m128i sum = - SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); Store2(dest8, sum); dest8 += pred_stride; Store2(dest8, _mm_srli_si128(sum, 4)); @@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, assert(height % 2 == 1); __m128i sum; const __m128i input = LoadLo8(&src[2]); - if (filter_index == 3) { + if (num_taps == 2) { // 03 04 04 05 05 06 06 07 .... 
const __m128i v_src_43 = _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); @@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. 
SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); // The output of the horizontal filter is guaranteed to fit in 16 bits. alignas(16) uint16_t @@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, } } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row] = LoadLo8(src_x); src_x += src_stride; - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16_x, results); @@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int 
vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1( const __m128i v_filter = LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<8>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. 
SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps); } - } else { - // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases. 
- // See convolve_neon.cc - SetupTaps<4>(&v_filter, taps); - - if (width == 2) { - FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); - } else if (width == 4) { - FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); - } else { - FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, - taps); - } } } -void ConvolveCompoundCopy_SSE4( +void ConvolveCompoundCopy_SSE4_1( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, @@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4( _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical); const __m128i v_dest_hi = _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical); - // TODO(slavarnway): Investigate using aligned stores. StoreUnaligned16(&dest[x], v_dest_lo); StoreUnaligned16(&dest[x + 8], v_dest_hi); x += 16; @@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1( const __m128i v_filter = LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. 
SetupTaps<6>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 4) { // 4 tap. 
+ } else { // 4 tap SetupTaps<4>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else { - SetupTaps<4>(&v_filter, taps); - - if (width == 4) { - FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); - } else { - FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, - width, height, taps); - } } } @@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1( // Similarly for height. const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); const int intermediate_height = height + vertical_taps - 1; const ptrdiff_t src_stride = reference_stride; const auto* const src = static_cast<const uint8_t*>(reference) - @@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src, source); StoreLo8(intermediate, RightShiftWithRounding_S16( - SumOnePassTaps<filter_index>(source, taps), + SumOnePassTaps<num_taps>(source, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; @@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src, PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source); // Shift by one less because the taps are halved. 
- StoreAligned16( - intermediate_x, - RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps), - kInterRoundBitsHorizontal - 1)); + StoreAligned16(intermediate_x, RightShiftWithRounding_S16( + SumOnePassTaps<num_taps>(source, taps), + kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; } while (--y != 0); @@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, alignas(16) int16_t intermediate_result[kIntermediateAllocWidth * (2 * kIntermediateAllocWidth + kSubPixelTaps)]; - const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); + const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index); const int intermediate_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> kScaleSubPixelBits) + @@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, // inputs in each iteration on large blocks. When step_x is large, we need a // second register and alignr in order to gather all filter inputs. // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap. 
- const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index); const int kernel_start_ceiling = 16 - num_horiz_taps; // This truncated quotient |grade_x_threshold| selects |step_x| such that: // (step_x * 7) >> kScaleSubPixelBits < single load limit @@ -1891,7 +1860,7 @@ void Init8bpp() { dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1; dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1; - dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4; + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1; dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1; dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1; dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1; diff --git a/libgav1/src/dsp/x86/convolve_sse4.inc b/libgav1/src/dsp/x86/convolve_sse4.inc index 550d6a4..5548c5b 100644 --- a/libgav1/src/dsp/x86/convolve_sse4.inc +++ b/libgav1/src/dsp/x86/convolve_sse4.inc @@ -18,20 +18,63 @@ #include "src/dsp/convolve.inc" +// This version checks for the special cases when filter_index == 1. +int GetNumTapsInFilter(const int filter_index, const int filter_id) { + if (filter_index == 0) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + return 6; + } + + if (filter_index == 1) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) | + (filter_id == 8) | (filter_id == 9)) != 0) { + return 6; + } + // When |filter_index| == 1, the |filter_id| values not listed above map to + // 4 tap filters. 
+ return 4; + } + + if (filter_index == 2) { + // kInterpolationFilterEightTapSharp + return 8; + } + + if (filter_index == 3) { + // kInterpolationFilterBilinear + return 2; + } + + assert(filter_index > 3); + // For small sizes (width/height <= 4) the large filters are replaced with 4 + // tap options. + // If the original filters were |kInterpolationFilterEightTap| or + // |kInterpolationFilterEightTapSharp| then it becomes + // |kInterpolationFilterSwitchable|. + // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 + // tap filter. + return 4; +} + // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final // sum from outranging int16_t. -template <int filter_index> +template <int num_taps> __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { __m128i sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 sum = _mm_add_epi16(v_madd_21, v_madd_43); sum = _mm_add_epi16(sum, v_madd_65); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 @@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); sum = _mm_add_epi16(v_sum_7654, v_sum_3210); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. 
sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 } else { @@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { return sum; } -template <int filter_index> +template <int num_taps> __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, const __m128i* const v_tap) { // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); - if (filter_index == 3) { + if (num_taps == 2) { // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 const __m128i v_src_43 = _mm_shuffle_epi8( v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); @@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, return v_sum_5432; } -template <int filter_index> +template <int num_taps> __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - @@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, return _mm_packus_epi16(sum, sum); } -template <int filter_index> +template <int num_taps> __m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, const __m128i* const v_tap) { - const __m128i sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } @@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) { return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int filter_index> +template <int num_taps> __m128i SumVerticalTaps(const __m128i* const 
srcs, const __m128i* const v_tap) { __m128i v_src[4]; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap); return sum; } -// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the -// 2D version. 
-template <int num_taps, int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int height, const __m128i* const v_tap) { @@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 10 11 12 13 20 21 22 23 srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 30 31 32 33 40 41 42 43 srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 50 51 52 53 60 61 62 63 srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 70 71 72 73 80 81 82 83 srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, } } -template <int num_taps, int filter_index, bool 
negative_outside_taps = false> +template <int num_taps, bool negative_outside_taps = false> void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int height, const __m128i* const v_tap) { @@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, // 10 11 20 21 30 31 40 41 srcs[1] = _mm_srli_si128(srcs_0_2, 2); // This uses srcs[0]..srcs[1]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[3] = _mm_srli_si128(srcs_0_4, 6); // This uses srcs[0]..srcs[3]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[5] = _mm_srli_si128(srcs_4_8, 2); // This uses srcs[0]..srcs[5]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[7] = _mm_srli_si128(srcs_4_8, 6); // This uses srcs[0]..srcs[7]. 
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); diff --git a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc index c813df4..8c32117 100644 --- a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc +++ b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc @@ -34,54 +34,50 @@ namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; +constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1); inline __m128i ComputeWeightedAverage8(const __m128i& pred0, const __m128i& pred1, - const __m128i& weights) { - // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. - const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1); - const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights); - const __m128i result_lo = - RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4); - - const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1); - const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights); - const __m128i result_hi = - RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4); - - return _mm_packs_epi32(result_lo, result_hi); + const __m128i& weight) { + // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0 + // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >> + // 8(=kInterPostRoundBit + 4) + // The formula is manipulated to avoid lengthening to 32 bits. + // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1 + // = (p0 - p1) * w0 + 16 * p1 + // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808. 
+ const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1); + // (((p0 - p1) * (w0 << 12) >> 16) + ((16 * p1) >> 4) + const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight); + // ((p0 - p1) * w0 >> 4) + p1 + const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1); + // (x << 11) >> 15 == x >> 4 + const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust); + // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4 + return _mm_mulhrs_epi16(upscaled_average, right_shift_prep); } template <int height> inline void DistanceWeightedBlend4xH_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, - const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. + const __m128i weights = _mm_set1_epi16(weight << 11); for (int y = 0; y < height; y += 4) { - // TODO(b/150326556): Use larger loads. 
- const __m128i src_00 = LoadLo8(pred_0); - const __m128i src_10 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - __m128i src_0 = LoadHi8(src_00, pred_0); - __m128i src_1 = LoadHi8(src_10, pred_1); - pred_0 += 4; - pred_1 += 4; - const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights); - - const __m128i src_01 = LoadLo8(pred_0); - const __m128i src_11 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - src_0 = LoadHi8(src_01, pred_0); - src_1 = LoadHi8(src_11, pred_1); - pred_0 += 4; - pred_1 += 4; - const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights); + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights); const __m128i result_pixels = _mm_packus_epi16(res0, res1); Store4(dst, result_pixels); @@ -101,11 +97,11 @@ inline void DistanceWeightedBlend4xH_SSE4_1( template <int height> inline void DistanceWeightedBlend8xH_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, - const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. 
+ const __m128i weights = _mm_set1_epi16(weight << 11); for (int y = 0; y < height; y += 2) { const __m128i src_00 = LoadAligned16(pred_0); @@ -130,11 +126,12 @@ inline void DistanceWeightedBlend8xH_SSE4_1( inline void DistanceWeightedBlendLarge_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, const int height, - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + const int width, const int height, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. + const __m128i weights = _mm_set1_epi16(weight << 11); int y = height; do { @@ -162,23 +159,24 @@ inline void DistanceWeightedBlendLarge_SSE4_1( void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, + const uint8_t /*weight_1*/, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); + const uint8_t weight = weight_0; if (width == 4) { if (height == 4) { - DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest, + dest_stride); } else if (height == 8) { - DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest, + dest_stride); } else { assert(height == 16); - DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + 
DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest, + dest_stride); } return; } @@ -186,28 +184,28 @@ void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, if (width == 8) { switch (height) { case 4: - DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest, + dest_stride); return; case 8: - DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest, + dest_stride); return; case 16: - DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest, + dest_stride); return; default: assert(height == 32); - DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest, + dest_stride); return; } } - DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width, - height, dest, dest_stride); + DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest, + dest_stride); } void Init8bpp() { @@ -273,27 +271,19 @@ inline void DistanceWeightedBlend4xH_SSE4_1( int y = height; do { - const __m128i src_00 = LoadLo8(pred_0); - const __m128i src_10 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - __m128i src_0 = LoadHi8(src_00, pred_0); - __m128i src_1 = LoadHi8(src_10, pred_1); - pred_0 += 4; - pred_1 += 4; + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; const __m128i res0 = - ComputeWeightedAverage8(src_0, src_1, weight0, weight1); - - const __m128i src_01 = LoadLo8(pred_0); - const __m128i src_11 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - src_0 = LoadHi8(src_01, pred_0); - src_1 = LoadHi8(src_11, pred_1); - pred_0 += 4; - pred_1 += 
4; + ComputeWeightedAverage8(src_00, src_10, weight0, weight1); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; const __m128i res1 = - ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + ComputeWeightedAverage8(src_01, src_11, weight0, weight1); StoreLo8(dst, res0); dst += dest_stride; diff --git a/libgav1/src/dsp/x86/film_grain_sse4.cc b/libgav1/src/dsp/x86/film_grain_sse4.cc index 9ece947..59d18a6 100644 --- a/libgav1/src/dsp/x86/film_grain_sse4.cc +++ b/libgav1/src/dsp/x86/film_grain_sse4.cc @@ -23,14 +23,15 @@ #include <cstdint> #include <cstring> -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" #include "src/dsp/x86/common_sse4.h" +#include "src/utils/array_2d.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -165,7 +166,7 @@ void BlendNoiseWithImageLuma_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_width; x += 8) { + for (; x + 8 <= safe_width; x += 8) { const __m128i orig = LoadSource(&in_y_row[x]); const __m128i scaling = GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); @@ -181,6 +182,7 @@ void BlendNoiseWithImageLuma_SSE4_1( // Prevent arbitrary indices from entering GetScalingFactors. 
memset(luma_buffer, 0, sizeof(luma_buffer)); const int valid_range = width - x; + assert(valid_range < 8); memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0])); luma_buffer[valid_range] = in_y_row[width - 1]; const __m128i orig = LoadSource(&in_y_row[x]); @@ -239,7 +241,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_chroma_width; x += 8) { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); @@ -252,8 +254,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); } - // This section only runs if width % (8 << sub_x) != 0. It should never run - // on 720p and above. if (x < chroma_width) { // Prevent huge indices from entering GetScalingFactors due to // uninitialized values. This is not a problem in 8bpp because the table @@ -365,7 +365,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_chroma_width; x += 8) { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); diff --git a/libgav1/src/dsp/x86/intrapred_directional_sse4.cc b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc index e642aee..bc61745 100644 --- a/libgav1/src/dsp/x86/intrapred_directional_sse4.cc +++ b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc @@ -624,14 +624,6 @@ inline void DirectionalZone2FromLeftCol_4x4_SSE4_1( } } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. 
The largest index for this lookup is 1023 >> 6 == 15. -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - template <bool upsampled> inline void DirectionalZone2FromLeftCol_8x8_SSE4_1( uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column, @@ -729,6 +721,103 @@ inline void DirectionalZone1Blend_8xH( } } +template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top> +inline void DirectionalZone2_8xH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base, + const __m128i& left_y) { + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + + // Loop incrementers for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + const __m128i xstep8_vect = _mm_set1_epi16(xstep8); + + // Cover 8x4 case. + const int min_height = (height == 4) ? 4 : 8; + + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x; + + // Round down to the nearest multiple of 8 (or 4, if height is 4). 
+ const int max_top_only_y = + std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1); + DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + DirectionalZone1_4xH(dst_x + 4, stride, + top_row + ((x + 4) << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + if (max_top_only_y == height) return; + + const __m128i max_shift = _mm_set1_epi8(32); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); + const __m128i sampler_top = + upsampled_top + ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); + // All rows from |min_left_only_y| down for this set of columns, only need + // |left_column| to compute. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + + __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); + __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); + int top_x = -xstep_y; + + const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), + top_x -= xstep8) { + // Pick up from the last y-value, using the 10% slower but secure method for + // left prediction. 
+ if (shuffle_left_column) { + DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y); + } else { + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep); + } + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + DirectionalZone1Blend_8xH<upsampled_top, 8>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift), + base_left_y, -ystep); + } +} + // 7.11.2.4 (8) 90 < angle > 180 // The strategy for this function is to know how many blocks can be processed // with just pixels from |top_ptr|, then handle mixed blocks, then handle only @@ -742,29 +831,11 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, const int width, const int height, const int xstep, const int ystep) { auto* dst = static_cast<uint8_t*>(dest); - const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); - const __m128i max_shift = _mm_set1_epi8(32); - const ptrdiff_t stride8 = stride << 3; - const __m128i dest_index_x = - _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); - const __m128i sampler_top = - upsampled_top - ? 
_mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute. This assumes minimum |xstep| is 3. + // All columns from |min_top_only_x| to the right will only need |top_row| + // to compute. This assumes minimum |xstep| is 3. const int min_top_only_x = std::min((height * xstep) >> 6, width); - // For steep angles, the source pixels from left_column may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]); - - const int xstep8 = xstep << 3; - const __m128i xstep8_vect = _mm_set1_epi16(xstep8); // Accumulate xstep across 8 rows. const __m128i xstep_dup = _mm_set1_epi16(-xstep); const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); @@ -787,105 +858,39 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, // offset. Following values need the full ystep as a relative offset. const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); const __m128i ystep_dup = _mm_set1_epi16(-ystep); + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); left_y = _mm_add_epi16(ystep_init, left_y); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 
0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); const __m128i increment_top8 = _mm_set1_epi16(8 << 6); int x = 0; - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. + for (int left_offset = -left_base_increment; x < min_shuffle_x; + x += 8, + xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), + // Watch left_y because it can still get big. + left_y = _mm_add_epi16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false, upsampled_left, upsampled_top>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_for_shift, xstep_bounds_base, left_y); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), // Watch left_y because it can still get big. left_y = _mm_add_epi16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - DirectionalZone1_4xH(dst_x + 4, stride, - top_row + ((x + 4) << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); - // All rows from |min_left_only_y| down for this set of columns, only need - // |left_column| to compute. 
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); - __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); - int top_x = -xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), left_y); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Pick up from the last y-value, using the 10% slower but secure method for - // left prediction. 
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - } + DirectionalZone2_8xH<true, upsampled_left, upsampled_top>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_for_shift, xstep_bounds_base, left_y); } for (; x < width; x += 4) { DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), @@ -952,8 +957,8 @@ inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride, left_offset -= left_base_increment4) { uint8_t* dst_x = dst + x; - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4; + // Round down to the nearest multiple of 4. 
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3; DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), max_top_only_y, -xstep, upsampled_top); int y = max_top_only_y; diff --git a/libgav1/src/dsp/x86/loop_restoration_sse4.cc b/libgav1/src/dsp/x86/loop_restoration_sse4.cc index 3363f0e..b4df072 100644 --- a/libgav1/src/dsp/x86/loop_restoration_sse4.cc +++ b/libgav1/src/dsp/x86/loop_restoration_sse4.cc @@ -2088,6 +2088,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; + ma5[1] = _mm_setzero_si128(); // Quiets -Wmaybe-unintialized with gcc. s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); sq[0][0] = SquareLo8(s[0][0]); diff --git a/libgav1/src/dsp/x86/mask_blend_sse4.cc b/libgav1/src/dsp/x86/mask_blend_sse4.cc index a18444b..833814c 100644 --- a/libgav1/src/dsp/x86/mask_blend_sse4.cc +++ b/libgav1/src/dsp/x86/mask_blend_sse4.cc @@ -30,35 +30,81 @@ namespace libgav1 { namespace dsp { -namespace low_bitdepth { namespace { +template <int subsampling_x, int subsampling_y> +inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_val_0 = LoadUnaligned16(mask); + const __m128i mask_val_1 = LoadUnaligned16(mask + stride); + const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRounding_U16(mask_0, 2); + } + if (subsampling_x == 1) { + const __m128i row_vals = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); + const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); + __m128i subsampled_mask = 
_mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRounding_U16(subsampled_mask, 1); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const __m128i mask_val = LoadLo8(mask); + return _mm_cvtepu8_epi16(mask_val); +} + +// Imitate behavior of ARM vtrn1q_u64. +inline __m128i Transpose1_U64(const __m128i a, const __m128i b) { + return _mm_castps_si128( + _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); +} + +// Imitate behavior of ARM vtrn2q_u64. +inline __m128i Transpose2_U64(const __m128i a, const __m128i b) { + return _mm_castps_si128( + _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); +} + // Width can only be 4 when it is subsampled from a block of width 8, hence // subsampling_x is always 1 when this function is called. template <int subsampling_x, int subsampling_y> -inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +inline __m128i GetMask4x2(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i mask_val_01 = LoadUnaligned16(mask); + // Stride is fixed because this is the smallest block size. + const __m128i mask_val_23 = LoadUnaligned16(mask + 16); + // Transpose rows to add row 0 to row 1, and row 2 to row 3. 
+ const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23); + const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01); + const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13); + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRounding_U16(mask_0, 2); + } + return GetMask8<subsampling_x, 0>(mask, 0); +} + +template <int subsampling_x, int subsampling_y> +inline __m128i GetInterIntraMask4x2(const uint8_t* mask, + ptrdiff_t mask_stride) { if (subsampling_x == 1) { - const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); - const __m128i mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - if (subsampling_y == 1) { - const __m128i next_mask_val_0 = - _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride)); - const __m128i next_mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3)); - subsampled_mask = _mm_add_epi16( - subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1)); - } - return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y); + return GetMask4x2<subsampling_x, subsampling_y>(mask); } + // When using intra or difference weighted masks, the function doesn't use + // subsampling, so |mask_stride| may be 4 or 8. + assert(subsampling_y == 0 && subsampling_x == 0); const __m128i mask_val_0 = Load4(mask); const __m128i mask_val_1 = Load4(mask + mask_stride); return _mm_cvtepu8_epi16( _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); } +} // namespace + +namespace low_bitdepth { +namespace { + // This function returns a 16-bit packed mask to fit in _mm_madd_epi16. // 16-bit is also the lowest packing for hadd, but without subsampling there is // an unfortunate conversion required. 
@@ -87,38 +133,6 @@ inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask, return _mm_cvtepu8_epi16(mask_val); } -// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because, -// when is_inter_intra is true, the prediction values are brought to 8-bit -// packing as well. -template <int subsampling_x, int subsampling_y> -inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t stride) { - if (subsampling_x == 1) { - const __m128i row_vals = LoadUnaligned16(mask); - - const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); - const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - - if (subsampling_y == 1) { - const __m128i next_row_vals = LoadUnaligned16(mask + stride); - const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals); - const __m128i next_mask_val_1 = - _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8)); - subsampled_mask = _mm_add_epi16( - subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1)); - } - const __m128i ret = - RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y); - return _mm_packus_epi16(ret, ret); - } - assert(subsampling_y == 0 && subsampling_x == 0); - // Unfortunately there is no shift operation for 8-bit packing, or else we - // could return everything with 8-bit packing. 
- const __m128i mask_val = LoadLo8(mask); - return mask_val; -} - inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, const int16_t* LIBGAV1_RESTRICT const pred_1, const __m128i pred_mask_0, @@ -149,15 +163,14 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, - const uint8_t* LIBGAV1_RESTRICT mask, - const ptrdiff_t mask_stride, - uint8_t* LIBGAV1_RESTRICT dst, - const ptrdiff_t dst_stride) { +inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT mask, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const __m128i mask_inverter = _mm_set1_epi16(64); - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -166,30 +179,30 @@ inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, - const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int height, - uint8_t* LIBGAV1_RESTRICT dst, 
- const ptrdiff_t dst_stride) { +inline void MaskBlending4xH_SSE4_1( + const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + assert(subsampling_x == 1); const uint8_t* mask = mask_ptr; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; if (height == 4) { - MaskBlending4x4_SSE4<subsampling_x, subsampling_y>( - pred_0, pred_1, mask, mask_stride, dst, dst_stride); + MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask, + dst, dst_stride); return; } const __m128i mask_inverter = _mm_set1_epi16(64); int y = 0; do { - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, @@ -199,7 +212,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -208,7 +221,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -217,7 +230,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + 
subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -230,21 +243,21 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - const ptrdiff_t /*prediction_stride_1*/, - const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int width, - const int height, void* LIBGAV1_RESTRICT dest, - const ptrdiff_t dst_stride) { +inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t /*prediction_stride_1*/, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const ptrdiff_t mask_stride, const int width, + const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dst_stride) { auto* dst = static_cast<uint8_t*>(dest); const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); const ptrdiff_t pred_stride_0 = width; const ptrdiff_t pred_stride_1 = width; if (width == 4) { - MaskBlending4xH_SSE4<subsampling_x, subsampling_y>( - pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride); + MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, mask_ptr, height, dst, dst_stride); return; } const uint8_t* mask = mask_ptr; @@ -293,7 +306,6 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2( const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1); const __m128i pred_val_0 = LoadLo8(pred_0); - // TODO(b/150326556): One load. 
__m128i pred_val_1 = Load4(pred_1); pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4), pred_val_1); @@ -309,16 +321,16 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2( } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4x4_SSE4( +inline void InterIntraMaskBlending8bpp4x4_SSE4_1( const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride) { const __m128i mask_inverter = _mm_set1_epi8(64); const __m128i pred_mask_u16_first = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); mask += mask_stride << (1 + subsampling_y); const __m128i pred_mask_u16_second = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); mask += mask_stride << (1 + subsampling_y); __m128i pred_mask_1 = _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second); @@ -335,26 +347,26 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4( } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4xH_SSE4( +inline void InterIntraMaskBlending8bpp4xH_SSE4_1( const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int height) { const uint8_t* mask = mask_ptr; if (height == 4) { - InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>( pred_0, pred_1, pred_stride_1, mask, mask_stride); return; } int y = 0; do { - InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>( pred_0, pred_1, pred_stride_1, mask, mask_stride); pred_0 += 4 << 2; pred_1 += 
pred_stride_1 << 2; mask += mask_stride << (2 + subsampling_y); - InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>( pred_0, pred_1, pred_stride_1, mask, mask_stride); pred_0 += 4 << 2; pred_1 += pred_stride_1 << 2; @@ -363,14 +375,31 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4( } while (y < height); } +// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because, +// when is_inter_intra is true, the prediction values are brought to 8-bit +// packing as well. +template <int subsampling_x, int subsampling_y> +inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t stride) { + if (subsampling_x == 1) { + const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride); + return _mm_packus_epi16(ret, ret); + } + assert(subsampling_y == 0 && subsampling_x == 0); + // Unfortunately there is no shift operation for 8-bit packing, or else we + // could return everything with 8-bit packing. 
+ const __m128i mask_val = LoadLo8(mask); + return mask_val; +} + template <int subsampling_x, int subsampling_y> -void InterIntraMaskBlend8bpp_SSE4( +void InterIntraMaskBlend8bpp_SSE4_1( const uint8_t* LIBGAV1_RESTRICT prediction_0, uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int width, const int height) { if (width == 4) { - InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>( prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, height); return; @@ -382,7 +411,7 @@ void InterIntraMaskBlend8bpp_SSE4( int x = 0; do { const __m128i pred_mask_1 = - GetInterIntraMask8<subsampling_x, subsampling_y>( + GetInterIntraMask8bpp8<subsampling_x, subsampling_y>( mask + (x << subsampling_x), mask_stride); // 64 - mask const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1); @@ -411,24 +440,24 @@ void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444) - dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>; + dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422) - dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>; + dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420) - dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>; + dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>; #endif // The is_inter_intra index of mask_blend[][] is replaced by // inter_intra_mask_blend_8bpp[] in 8-bit. 
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444) - dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>; + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422) - dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>; + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420) - dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>; + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>; #endif } @@ -443,14 +472,6 @@ constexpr int kMax10bppSample = (1 << 10) - 1; constexpr int kMaskInverse = 64; constexpr int kRoundBitsMaskBlend = 4; -inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits, - const __m128i zero) { - // Shift out all but the last bit. - const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); - // Avg with zero will shift by 1 and round. 
- return _mm_avg_epu16(v_tmp_d, zero); -} - inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, const __m128i shift) { const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift); @@ -458,53 +479,31 @@ inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, } template <int subsampling_x, int subsampling_y> -inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride, - const __m128i zero) { - if (subsampling_x == 1) { - if (subsampling_y == 0) { - const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); - const __m128i mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); - } - const __m128i one = _mm_set1_epi8(1); - const __m128i mask_val_0 = - LoadHi8(LoadLo8(mask), mask + (mask_stride << 1)); - const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride), - mask + (mask_stride << 1) + mask_stride); - const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1); - const __m128i subsampled_mask = _mm_maddubs_epi16(add, one); - return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero); +inline __m128i GetMask4x2(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i mask_row_01 = LoadUnaligned16(mask); + const __m128i mask_row_23 = LoadUnaligned16(mask + 16); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8)); + const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23); + const __m128i mask_val_3 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8)); + const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2); + const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3); + const __m128i subsampled_mask = + _mm_add_epi16(subsampled_mask_02, subsampled_mask_13); + return 
RightShiftWithRounding_U16(subsampled_mask, 2); } - assert(subsampling_y == 0 && subsampling_x == 0); - const __m128i mask_val_0 = Load4(mask); - const __m128i mask_val_1 = Load4(mask + mask_stride); - return _mm_cvtepu8_epi16( - _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); -} - -template <int subsampling_x, int subsampling_y> -inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride, - const __m128i zero) { if (subsampling_x == 1) { - if (subsampling_y == 0) { - const __m128i row_vals = LoadUnaligned16(mask); - const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); - const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); - } - const __m128i one = _mm_set1_epi8(1); - const __m128i mask_val_0 = LoadUnaligned16(mask); - const __m128i mask_val_1 = LoadUnaligned16(mask + stride); - const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); - const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); - return RightShiftWithRoundingZero_U16(mask_0, 2, zero); + const __m128i mask_row_01 = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8)); + const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRounding_U16(subsampled_mask, 1); } - assert(subsampling_y == 0 && subsampling_x == 0); - const __m128i mask_val = LoadLo8(mask); - return _mm_cvtepu8_epi16(mask_val); + return _mm_cvtepu8_epi16(LoadLo8(mask)); } inline void WriteMaskBlendLine10bpp4x2_SSE4_1( @@ -558,12 +557,10 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const __m128i shift4 = 
_mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); const __m128i offset = _mm_set1_epi32(kCompoundOffset); const __m128i max = _mm_set1_epi16(kMax10bppSample); - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, shift4, dst, @@ -573,8 +570,7 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, shift4, dst, @@ -595,7 +591,6 @@ inline void MaskBlend10bpp4xH_SSE4_1( return; } const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const uint8_t pred0_stride2 = 4 << 1; const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); @@ -605,8 +600,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); int y = height; do { - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, @@ -617,8 +611,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, 
subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -628,8 +621,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -639,8 +631,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -675,7 +666,6 @@ inline void MaskBlend10bpp_SSE4_1( } const uint8_t* mask = mask_ptr; const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; const __m128i offset = _mm_set1_epi32(kCompoundOffset); const __m128i max = _mm_set1_epi16(kMax10bppSample); @@ -685,7 +675,7 @@ inline void MaskBlend10bpp_SSE4_1( int x = 0; do { const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( - mask + (x << subsampling_x), mask_stride, zero); + mask + (x << subsampling_x), mask_stride); const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); // 64 - mask @@ -729,7 +719,6 @@ inline void MaskBlend10bpp_SSE4_1( mask += mask_stride_ss; } while (--y != 0); } - inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( const uint16_t* LIBGAV1_RESTRICT prediction_0, const uint16_t* LIBGAV1_RESTRICT prediction_1, @@ -764,9 +753,8 
@@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); - const __m128i zero = _mm_setzero_si128(); __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, shift6, @@ -777,7 +765,7 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( dst += dst_stride << 1; pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, shift6, @@ -798,7 +786,6 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( return; } const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); const uint8_t pred0_stride2 = 4 << 1; const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; @@ -807,7 +794,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( int y = height; do { __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -818,7 +805,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, 
mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -829,7 +816,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -840,7 +827,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -876,14 +863,13 @@ inline void InterIntraMaskBlend10bpp_SSE4_1( const uint8_t* mask = mask_ptr; const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); - const __m128i zero = _mm_setzero_si128(); const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; int y = height; do { int x = 0; do { const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( - mask + (x << subsampling_x), mask_stride, zero); + mask + (x << subsampling_x), mask_stride); const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); // 64 - mask diff --git a/libgav1/src/dsp/x86/obmc_sse4.cc b/libgav1/src/dsp/x86/obmc_sse4.cc index 8ce23b4..f068ff3 100644 --- a/libgav1/src/dsp/x86/obmc_sse4.cc +++ b/libgav1/src/dsp/x86/obmc_sse4.cc @@ -39,8 +39,8 @@ namespace { inline void OverlapBlendFromLeft2xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* 
LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 2; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load2x2(pred, pred + prediction_stride); - const __m128i obmc_pred_val = - Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride); + const __m128i obmc_pred_val = Load4(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = @@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val0 = Load4(pred); - const __m128i obmc_pred_val0 = Load4(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; // Place the second row of each source in the second four bytes. 
const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( const int second_row_result = _mm_extract_epi32(packed_result, 1); memcpy(pred, &second_row_result, sizeof(second_row_result)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y -= 2; } while (y != 0); } @@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( inline void OverlapBlendFromLeft8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi8(64); @@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1( const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); int y = height; do { - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6); + + const __m128i 
terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + y -= 2; + } while (y != 0); } void OverlapBlendFromLeft_SSE4_1( @@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1( _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); const __m128i pred_val0 = Load4(pred); 
- const __m128i obmc_pred_val0 = Load4(obmc_pred); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1( Store4(pred - prediction_stride, packed_result); Store4(pred, _mm_srli_si128(packed_result, 4)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y < compute_height); } @@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1( inline void OverlapBlendFromTop8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const uint8_t* mask = kObmcMask + height - 2; @@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1( const int compute_height = height - (height >> 2); int y = compute_height; do { - const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]); + const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]); // 64 - mask - const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); - const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - 
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0); + const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6); + + --y; + const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]); + // 64 - mask + const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1); + const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1); + + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6); + + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (--y > 0); } void OverlapBlendFromTop_SSE4_1( @@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1( assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } @@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6; inline void OverlapBlendFromLeft2xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int 
height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 2; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load4x2(pred, pred + pred_stride); - const __m128i obmc_pred_val = - Load4x2(obmc_pred, obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U32( _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend); @@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 4; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); const __m128i result_lo = RightShiftWithRounding_U32( @@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, 
- obmc_pred_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 4; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1( const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); const __m128i result_lo = RightShiftWithRounding_U32( @@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1( assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } diff --git a/libgav1/src/dsp/x86/warp_sse4.cc b/libgav1/src/dsp/x86/warp_sse4.cc index 5830894..5498052 100644 --- a/libgav1/src/dsp/x86/warp_sse4.cc +++ b/libgav1/src/dsp/x86/warp_sse4.cc @@ -167,7 +167,7 @@ inline void WriteVerticalFilter(const __m128i filter[8], } template <bool is_compound, typename 
DestType> -inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, +inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma, int delta, DestType* LIBGAV1_RESTRICT dest_row, ptrdiff_t dest_stride) { int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); @@ -188,8 +188,8 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, } template <bool is_compound, typename DestType> -inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4, - int gamma, int delta, +inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, + int64_t y4, int gamma, int delta, DestType* LIBGAV1_RESTRICT dest_row, ptrdiff_t dest_stride) { int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); @@ -249,7 +249,7 @@ inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src, template <bool is_compound, typename DestType> inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src, - ptrdiff_t source_stride, int source_width, int y4, + ptrdiff_t source_stride, int source_width, int64_t y4, int ix4, int iy4, int gamma, int delta, int16_t intermediate_result_column[15], DestType* LIBGAV1_RESTRICT dst_row, @@ -291,7 +291,7 @@ inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src, template <bool is_compound, typename DestType> inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src, ptrdiff_t source_stride, int source_height, int alpha, - int beta, int x4, int ix4, int iy4, + int beta, int64_t x4, int ix4, int iy4, int16_t intermediate_result[15][8]) { // Region 3 // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. 
@@ -323,8 +323,9 @@ inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src, template <bool is_compound, typename DestType> inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src, - ptrdiff_t source_stride, int alpha, int beta, int x4, - int ix4, int iy4, int16_t intermediate_result[15][8]) { + ptrdiff_t source_stride, int alpha, int beta, + int64_t x4, int ix4, int iy4, + int16_t intermediate_result[15][8]) { // Region 4. // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. @@ -379,14 +380,8 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src, int16_t intermediate_result_column[15]; }; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -439,33 +434,38 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { - if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) { + if ((filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0)) { // Outside the frame in both directions. One repeated value. 
- WarpRegion1<is_compound, DestType>(src, source_stride, source_width, - source_height, ix4, iy4, dst_row, - dest_stride); + WarpRegion1<is_compound, DestType>( + src, source_stride, source_width, source_height, filter_params.ix4, + filter_params.iy4, dst_row, dest_stride); return; } // Outside the frame horizontally. Rows repeated. WarpRegion2<is_compound, DestType>( - src, source_stride, source_width, y4, ix4, iy4, gamma, delta, - intermediate_result_column, dst_row, dest_stride); + src, source_stride, source_width, filter_params.y4, filter_params.ix4, + filter_params.iy4, gamma, delta, intermediate_result_column, dst_row, + dest_stride); return; } - if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + if ((filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0)) { // Outside the frame vertically. - WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha, - beta, x4, ix4, iy4, intermediate_result); + WarpRegion3<is_compound, DestType>( + src, source_stride, source_height, alpha, beta, filter_params.x4, + filter_params.ix4, filter_params.iy4, intermediate_result); } else { // Inside the frame. - WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4, - iy4, intermediate_result); + WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, + filter_params.x4, filter_params.ix4, + filter_params.iy4, intermediate_result); } // Region 3 and 4 vertical filter. 
- VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta, - dst_row, dest_stride); + VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4, + gamma, delta, dst_row, dest_stride); } template <bool is_compound> diff --git a/libgav1/src/dsp/x86/weight_mask_sse4.cc b/libgav1/src/dsp/x86/weight_mask_sse4.cc index 69cb784..53a374d 100644 --- a/libgav1/src/dsp/x86/weight_mask_sse4.cc +++ b/libgav1/src/dsp/x86/weight_mask_sse4.cc @@ -37,10 +37,10 @@ namespace { constexpr int kRoundingBits8bpp = 4; template <bool mask_is_inverse, bool is_store_16> -inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const __m128i pred_00 = LoadAligned16(prediction_0); const __m128i pred_10 = LoadAligned16(prediction_1); const __m128i difference_0 = RightShiftWithRounding_U16( @@ -78,7 +78,7 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, } #define WEIGHT8_PAIR_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride) + WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride) #define WEIGHT8_PAIR_AND_STRIDE \ WEIGHT8_PAIR_WITHOUT_STRIDE; \ @@ -87,9 +87,10 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, mask += mask_stride << 1 template <bool mask_is_inverse> -void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { +void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* 
pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); @@ -100,10 +101,10 @@ void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 3; @@ -116,10 +117,10 @@ void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 5; @@ -132,7 +133,7 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } #define WEIGHT16_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride) + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride) #define WEIGHT16_AND_STRIDE \ WEIGHT16_WITHOUT_STRIDE; \ @@ -141,10 +142,10 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t 
mask_stride) { +void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y = 7; @@ -155,10 +156,10 @@ void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 5; @@ -171,10 +172,10 @@ void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 6; @@ -190,10 +191,10 @@ void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* 
LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 21; @@ -205,10 +206,11 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT16_WITHOUT_STRIDE; } -#define WEIGHT32_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride) +#define WEIGHT32_WITHOUT_STRIDE \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) #define WEIGHT32_AND_STRIDE \ WEIGHT32_WITHOUT_STRIDE; \ @@ -217,10 +219,10 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); WEIGHT32_AND_STRIDE; @@ -234,10 +236,10 @@ void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* 
pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 5; @@ -250,10 +252,10 @@ void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 6; @@ -269,10 +271,10 @@ void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 21; @@ -284,14 +286,15 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT32_WITHOUT_STRIDE; } -#define WEIGHT64_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ - mask + 32, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ - mask + 48, mask_stride) +#define WEIGHT64_WITHOUT_STRIDE \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, 
true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) #define WEIGHT64_AND_STRIDE \ WEIGHT64_WITHOUT_STRIDE; \ @@ -300,10 +303,10 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -316,10 +319,10 @@ void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 0; @@ -335,10 +338,10 @@ void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + 
uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -351,10 +354,10 @@ void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -368,10 +371,10 @@ void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -412,10 +415,10 @@ void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const 
auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -466,8 +469,9 @@ void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, #define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \ dsp->weight_mask[w_index][h_index][0] = \ - WeightMask##width##x##height##_SSE4<0>; \ - dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1> + WeightMask##width##x##height##_SSE4_1<0>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_SSE4_1<1> void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); @@ -501,7 +505,7 @@ constexpr int kRoundingBits10bpp = 6; constexpr int kScaledDiffShift = 4; template <bool mask_is_inverse, bool is_store_16> -inline void WeightMask16_10bpp_SSE4( +inline void WeightMask16_10bpp_SSE4_1( const uint16_t* LIBGAV1_RESTRICT prediction_0, const uint16_t* LIBGAV1_RESTRICT prediction_1, uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { @@ -562,9 +566,9 @@ inline void WeightMask16_10bpp_SSE4( } } -#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \ - mask_stride) +#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \ + mask_stride) #define WEIGHT8_PAIR_AND_STRIDE_10BPP \ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \ @@ -573,10 +577,10 @@ inline void WeightMask16_10bpp_SSE4( mask += mask_stride << 1 template <bool mask_is_inverse> -void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const 
uint16_t*>(prediction_1); @@ -587,10 +591,10 @@ void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 3; @@ -603,10 +607,10 @@ void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y5 = 5; @@ -618,9 +622,9 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; } -#define WEIGHT16_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ - mask_stride) +#define WEIGHT16_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride) #define WEIGHT16_AND_STRIDE_10BPP \ WEIGHT16_WITHOUT_STRIDE_10BPP; \ @@ -629,10 +633,10 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* 
LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y = 7; @@ -643,10 +647,10 @@ void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 5; @@ -659,10 +663,10 @@ void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y5 = 6; @@ -678,10 +682,10 @@ void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t 
mask_stride) { +void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -693,11 +697,11 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT16_WITHOUT_STRIDE_10BPP; } -#define WEIGHT32_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ - mask_stride); \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride) +#define WEIGHT32_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) #define WEIGHT32_AND_STRIDE_10BPP \ WEIGHT32_WITHOUT_STRIDE_10BPP; \ @@ -706,10 +710,10 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); WEIGHT32_AND_STRIDE_10BPP; @@ -723,10 +727,10 @@ void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { 
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 5; @@ -739,10 +743,10 @@ void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y5 = 6; @@ -758,10 +762,10 @@ void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -773,15 +777,15 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT32_WITHOUT_STRIDE_10BPP; } -#define WEIGHT64_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ - mask_stride); \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride); \ - 
WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ - mask + 32, mask_stride); \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ - mask + 48, mask_stride) +#define WEIGHT64_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) #define WEIGHT64_AND_STRIDE_10BPP \ WEIGHT64_WITHOUT_STRIDE_10BPP; \ @@ -790,10 +794,10 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 5; @@ -806,10 +810,10 @@ void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const 
uint16_t*>(prediction_1); int y5 = 6; @@ -825,10 +829,10 @@ void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -841,10 +845,10 @@ void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 42; @@ -858,10 +862,10 @@ void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -902,10 +906,10 @@ void 
WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 42; @@ -956,9 +960,9 @@ void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, #define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \ dsp->weight_mask[w_index][h_index][0] = \ - WeightMask##width##x##height##_10bpp_SSE4<0>; \ + WeightMask##width##x##height##_10bpp_SSE4_1<0>; \ dsp->weight_mask[w_index][h_index][1] = \ - WeightMask##width##x##height##_10bpp_SSE4<1> + WeightMask##width##x##height##_10bpp_SSE4_1<1> void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); diff --git a/libgav1/src/film_grain.cc b/libgav1/src/film_grain.cc index 5c64ff2..44a2543 100644 --- a/libgav1/src/film_grain.cc +++ b/libgav1/src/film_grain.cc @@ -824,5 +824,8 @@ template class FilmGrain<kBitdepth8>; #if LIBGAV1_MAX_BITDEPTH >= 10 template class FilmGrain<kBitdepth10>; #endif +#if LIBGAV1_MAX_BITDEPTH == 12 +template class FilmGrain<kBitdepth12>; +#endif } // namespace libgav1 diff --git a/libgav1/src/film_grain.h b/libgav1/src/film_grain.h index f2c1e93..bda8458 100644 --- a/libgav1/src/film_grain.h +++ b/libgav1/src/film_grain.h @@ -104,7 +104,9 @@ class FilmGrain { using Pixel = typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type; static constexpr int kScalingLutLength = - (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8); + (bitdepth == 10) + ? 
(kScalingLookupTableSize + kScalingLookupTablePadding) << 2 + : kScalingLookupTableSize + kScalingLookupTablePadding; bool Init(); diff --git a/libgav1/src/gav1/decoder_buffer.h b/libgav1/src/gav1/decoder_buffer.h index 880c320..0a5586e 100644 --- a/libgav1/src/gav1/decoder_buffer.h +++ b/libgav1/src/gav1/decoder_buffer.h @@ -115,6 +115,27 @@ typedef enum Libgav1ColorRange { kLibgav1ColorRangeFull // YUV/RGB [0..255] } Libgav1ColorRange; +typedef struct Libgav1ObuMetadataHdrCll { // NOLINT + uint16_t max_cll; // Maximum content light level. + uint16_t max_fall; // Maximum frame-average light level. +} Libgav1ObuMetadataHdrCll; + +typedef struct Libgav1ObuMetadataHdrMdcv { // NOLINT + uint16_t primary_chromaticity_x[3]; + uint16_t primary_chromaticity_y[3]; + uint16_t white_point_chromaticity_x; + uint16_t white_point_chromaticity_y; + uint32_t luminance_max; + uint32_t luminance_min; +} Libgav1ObuMetadataHdrMdcv; + +typedef struct Libgav1ObuMetadataItutT35 { // NOLINT + uint8_t country_code; + uint8_t country_code_extension_byte; // Valid if country_code is 0xFF. + uint8_t* payload_bytes; + int payload_size; +} Libgav1ObuMetadataItutT35; + typedef struct Libgav1DecoderBuffer { #if defined(__cplusplus) LIBGAV1_PUBLIC int NumPlanes() const { @@ -146,6 +167,18 @@ typedef struct Libgav1DecoderBuffer { // Temporal id of this frame. int temporal_id; + Libgav1ObuMetadataHdrCll hdr_cll; + int has_hdr_cll; // 1 if the values in hdr_cll are valid for this frame. 0 + // otherwise. + + Libgav1ObuMetadataHdrMdcv hdr_mdcv; + int has_hdr_mdcv; // 1 if the values in hdr_mdcv are valid for this frame. 0 + // otherwise. + + Libgav1ObuMetadataItutT35 itut_t35; + int has_itut_t35; // 1 if the values in itut_t35 are valid for this frame. 0 + // otherwise. + // The |user_private_data| argument passed to Decoder::EnqueueFrame(). int64_t user_private_data; // The |private_data| field of FrameBuffer. 
Set by the get frame buffer @@ -264,6 +297,10 @@ using ColorRange = Libgav1ColorRange; constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio; constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull; +using ObuMetadataHdrCll = Libgav1ObuMetadataHdrCll; +using ObuMetadataHdrMdcv = Libgav1ObuMetadataHdrMdcv; +using ObuMetadataItutT35 = Libgav1ObuMetadataItutT35; + using DecoderBuffer = Libgav1DecoderBuffer; } // namespace libgav1 diff --git a/libgav1/src/gav1/version.h b/libgav1/src/gav1/version.h index 9bdc630..b386acc 100644 --- a/libgav1/src/gav1/version.h +++ b/libgav1/src/gav1/version.h @@ -23,7 +23,7 @@ // (https://semver.org). #define LIBGAV1_MAJOR_VERSION 0 -#define LIBGAV1_MINOR_VERSION 17 +#define LIBGAV1_MINOR_VERSION 18 #define LIBGAV1_PATCH_VERSION 0 #define LIBGAV1_VERSION \ diff --git a/libgav1/src/libgav1_decoder.cmake b/libgav1/src/libgav1_decoder.cmake index b97d09d..1314d0b 100644 --- a/libgav1/src/libgav1_decoder.cmake +++ b/libgav1/src/libgav1_decoder.cmake @@ -107,7 +107,7 @@ macro(libgav1_add_decoder_targets) list(APPEND libgav1_static_lib_sources ${libgav1_api_sources}) endif() - if(NOT ANDROID) + if(use_absl_threading) list(APPEND libgav1_absl_deps absl::base absl::synchronization) endif() diff --git a/libgav1/src/obu_parser.cc b/libgav1/src/obu_parser.cc index 445450b..9e9166a 100644 --- a/libgav1/src/obu_parser.cc +++ b/libgav1/src/obu_parser.cc @@ -1767,11 +1767,7 @@ bool ObuParser::ParseFrameParameters() { int64_t scratch; if (sequence_header_.reduced_still_picture_header) { frame_header_.show_frame = true; - current_frame_ = buffer_pool_->GetFreeBuffer(); - if (current_frame_ == nullptr) { - LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); - return false; - } + if (!EnsureCurrentFrameIsNotNull()) return false; } else { OBU_READ_BIT_OR_FAIL; frame_header_.show_existing_frame = scratch != 0; @@ -1840,11 +1836,7 @@ bool ObuParser::ParseFrameParameters() { } return true; } - current_frame_ = 
buffer_pool_->GetFreeBuffer(); - if (current_frame_ == nullptr) { - LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); - return false; - } + if (!EnsureCurrentFrameIsNotNull()) return false; OBU_READ_LITERAL_OR_FAIL(2); frame_header_.frame_type = static_cast<FrameType>(scratch); current_frame_->set_frame_type(frame_header_.frame_type); @@ -2395,50 +2387,58 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) { size -= metadata_type_size; int64_t scratch; switch (metadata_type) { - case kMetadataTypeHdrContentLightLevel: + case kMetadataTypeHdrContentLightLevel: { + ObuMetadataHdrCll hdr_cll; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.max_cll = scratch; + hdr_cll.max_cll = scratch; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.max_fall = scratch; + hdr_cll.max_fall = scratch; + if (!EnsureCurrentFrameIsNotNull()) return false; + current_frame_->set_hdr_cll(hdr_cll); break; - case kMetadataTypeHdrMasteringDisplayColorVolume: + } + case kMetadataTypeHdrMasteringDisplayColorVolume: { + ObuMetadataHdrMdcv hdr_mdcv; for (int i = 0; i < 3; ++i) { OBU_READ_LITERAL_OR_FAIL(16); - metadata_.primary_chromaticity_x[i] = scratch; + hdr_mdcv.primary_chromaticity_x[i] = scratch; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.primary_chromaticity_y[i] = scratch; + hdr_mdcv.primary_chromaticity_y[i] = scratch; } OBU_READ_LITERAL_OR_FAIL(16); - metadata_.white_point_chromaticity_x = scratch; + hdr_mdcv.white_point_chromaticity_x = scratch; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.white_point_chromaticity_y = scratch; + hdr_mdcv.white_point_chromaticity_y = scratch; OBU_READ_LITERAL_OR_FAIL(32); - metadata_.luminance_max = static_cast<uint32_t>(scratch); + hdr_mdcv.luminance_max = static_cast<uint32_t>(scratch); OBU_READ_LITERAL_OR_FAIL(32); - metadata_.luminance_min = static_cast<uint32_t>(scratch); + hdr_mdcv.luminance_min = static_cast<uint32_t>(scratch); + if (!EnsureCurrentFrameIsNotNull()) return false; + current_frame_->set_hdr_mdcv(hdr_mdcv); 
break; + } case kMetadataTypeScalability: if (!ParseMetadataScalability()) return false; break; case kMetadataTypeItutT35: { + ObuMetadataItutT35 itut_t35; OBU_READ_LITERAL_OR_FAIL(8); - metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch); + itut_t35.country_code = static_cast<uint8_t>(scratch); ++data; --size; - if (metadata_.itu_t_t35_country_code == 0xFF) { + if (itut_t35.country_code == 0xFF) { OBU_READ_LITERAL_OR_FAIL(8); - metadata_.itu_t_t35_country_code_extension_byte = - static_cast<uint8_t>(scratch); + itut_t35.country_code_extension_byte = static_cast<uint8_t>(scratch); ++data; --size; } - // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says: - // itu_t_t35_payload_bytes shall be bytes containing data registered as + // Read itut_t35.payload_bytes. Section 6.7.2 of the spec says: + // itut_t35.payload_bytes shall be bytes containing data registered as // specified in Recommendation ITU-T T.35. - // Therefore itu_t_t35_payload_bytes is byte aligned and the first - // trailing byte should be 0x80. Since the exact syntax of - // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the - // end of itu_t_t35_payload_bytes by searching for the trailing bit. + // Therefore itut_t35.payload_bytes is byte aligned and the first trailing + // byte should be 0x80. Since the exact syntax of itut_t35.payload_bytes + // is not defined in the AV1 spec, identify the end of + // itut_t35.payload_bytes by searching for the trailing bit. const int i = GetLastNonzeroByteIndex(data, size); if (i < 0) { LIBGAV1_DLOG(ERROR, "Trailing bit is missing."); @@ -2447,20 +2447,15 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) { if (data[i] != 0x80) { LIBGAV1_DLOG( ERROR, - "itu_t_t35_payload_bytes is not byte aligned. The last nonzero " - "byte of the payload data is 0x%x, should be 0x80.", + "itut_t35.payload_bytes is not byte aligned. 
The last nonzero byte " + "of the payload data is 0x%x, should be 0x80.", data[i]); return false; } - if (i != 0) { - // data[0]..data[i - 1] are itu_t_t35_payload_bytes. - metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]); - if (metadata_.itu_t_t35_payload_bytes == nullptr) { - LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed."); - return false; - } - memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i); - metadata_.itu_t_t35_payload_size = i; + itut_t35.payload_size = i; + if (!EnsureCurrentFrameIsNotNull() || + !current_frame_->set_itut_t35(itut_t35, data)) { + return false; } // Skip all bits before the trailing bit. bit_reader_->SkipBytes(i); @@ -2637,6 +2632,16 @@ bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) { return bit_reader_ != nullptr; } +bool ObuParser::EnsureCurrentFrameIsNotNull() { + if (current_frame_ != nullptr) return true; + current_frame_ = buffer_pool_->GetFreeBuffer(); + if (current_frame_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); + return false; + } + return true; +} + bool ObuParser::HasData() const { return size_ > 0; } StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) { @@ -2652,7 +2657,6 @@ StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) { // Clear everything except the sequence header. obu_headers_.clear(); frame_header_ = {}; - metadata_ = {}; tile_buffers_.clear(); next_tile_group_start_ = 0; sequence_header_changed_ = false; diff --git a/libgav1/src/obu_parser.h b/libgav1/src/obu_parser.h index 3f452ef..eba3370 100644 --- a/libgav1/src/obu_parser.h +++ b/libgav1/src/obu_parser.h @@ -221,26 +221,6 @@ enum MetadataType : uint8_t { // 32 and greater are reserved for AOM use. }; -struct ObuMetadata { - // Maximum content light level. - uint16_t max_cll; - // Maximum frame-average light level. 
- uint16_t max_fall; - uint16_t primary_chromaticity_x[3]; - uint16_t primary_chromaticity_y[3]; - uint16_t white_point_chromaticity_x; - uint16_t white_point_chromaticity_y; - uint32_t luminance_max; - uint32_t luminance_min; - // ITU-T T.35. - uint8_t itu_t_t35_country_code; - uint8_t itu_t_t35_country_code_extension_byte; // Valid if - // itu_t_t35_country_code is - // 0xFF. - std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes; - size_t itu_t_t35_payload_size; -}; - class ObuParser : public Allocable { public: ObuParser(const uint8_t* const data, size_t size, int operating_point, @@ -276,7 +256,6 @@ class ObuParser : public Allocable { const ObuSequenceHeader& sequence_header() const { return sequence_header_; } const ObuFrameHeader& frame_header() const { return frame_header_; } const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; } - const ObuMetadata& metadata() const { return metadata_; } // Returns true if the last call to ParseOneFrame() encountered a sequence // header change. bool sequence_header_changed() const { return sequence_header_changed_; } @@ -372,6 +351,11 @@ class ObuParser : public Allocable { size_t tg_header_size, size_t bytes_consumed_so_far); bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far); // 5.11.1. + // Populates |current_frame_| from the |buffer_pool_| if |current_frame_| is + // nullptr. Does not do anything otherwise. Returns true on success, false + // otherwise. + bool EnsureCurrentFrameIsNotNull(); + // Parser elements. std::unique_ptr<RawBitReader> bit_reader_; const uint8_t* data_; @@ -383,7 +367,6 @@ class ObuParser : public Allocable { ObuSequenceHeader sequence_header_ = {}; ObuFrameHeader frame_header_ = {}; Vector<TileBuffer> tile_buffers_; - ObuMetadata metadata_ = {}; // The expected starting tile number of the next Tile Group. int next_tile_group_start_ = 0; // If true, the sequence_header_ field is valid. 
diff --git a/libgav1/src/post_filter/deblock.cc b/libgav1/src/post_filter/deblock.cc index 48ad823..daee01c 100644 --- a/libgav1/src/post_filter/deblock.cc +++ b/libgav1/src/post_filter/deblock.cc @@ -329,7 +329,6 @@ void PostFilter::HorizontalDeblockFilter(int row4x4_start, int row4x4_end, src_row, src_stride, outer_thresh_[level], inner_thresh_[level], HevThresh(level)); } - // TODO(chengchen): use shifts instead of multiplication. src_row += row_step * src_stride; row_step = DivideBy4(row_step); } diff --git a/libgav1/src/quantizer.cc b/libgav1/src/quantizer.cc index cd720d6..eb13314 100644 --- a/libgav1/src/quantizer.cc +++ b/libgav1/src/quantizer.cc @@ -20,8 +20,9 @@ #include "src/utils/common.h" #include "src/utils/constants.h" -#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 -#error LIBGAV1_MAX_BITDEPTH must be 8 or 10 +#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 && \ + LIBGAV1_MAX_BITDEPTH != 12 +#error LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12 #endif namespace libgav1 { @@ -87,6 +88,43 @@ constexpr int16_t kDcLookup[][256] = { 4737, 4929, 5130, 5347 }, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 + // Lookup table for 12 bit. 
+ { + 4, 12, 18, 25, 33, 41, 50, 60, + 70, 80, 91, 103, 115, 127, 140, 153, + 166, 180, 194, 208, 222, 237, 251, 266, + 281, 296, 312, 327, 343, 358, 374, 390, + 405, 421, 437, 453, 469, 484, 500, 516, + 532, 548, 564, 580, 596, 611, 627, 643, + 659, 674, 690, 706, 721, 737, 752, 768, + 783, 798, 814, 829, 844, 859, 874, 889, + 904, 919, 934, 949, 964, 978, 993, 1008, + 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, + 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, + 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544, + 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, + 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, + 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, + 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467, + 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, + 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, + 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, + 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, + 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, + 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517, + 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, + 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, + 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, + 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788, + 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, + 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, + 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, + 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387 + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 }; constexpr int16_t kAcLookup[][256] = { @@ -142,6 +180,43 @@ constexpr int16_t kAcLookup[][256] = { 6900, 7036, 7172, 7312 }, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 + // Lookup table for 12 bit. 
+ { + 4, 13, 19, 27, 35, 44, 54, 64, + 75, 87, 99, 112, 126, 139, 154, 168, + 183, 199, 214, 230, 247, 263, 280, 297, + 314, 331, 349, 366, 384, 402, 420, 438, + 456, 475, 493, 511, 530, 548, 567, 586, + 604, 623, 642, 660, 679, 698, 716, 735, + 753, 772, 791, 809, 828, 846, 865, 884, + 902, 920, 939, 957, 976, 994, 1012, 1030, + 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, + 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, + 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, + 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856, + 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, + 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, + 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, + 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137, + 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, + 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, + 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, + 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544, + 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, + 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, + 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, + 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, + 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565, + 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, + 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414, + 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, + 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070, + 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247 + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 }; // clang-format on diff --git a/libgav1/src/tile.h b/libgav1/src/tile.h index 83c3423..fcab963 100644 --- a/libgav1/src/tile.h +++ b/libgav1/src/tile.h @@ -464,13 +464,14 @@ class Tile : public MaxAlignedAllocable { int* start_y, int* step_x, int* step_y); // 7.11.3.3. 
// If the method returns false, the caller only uses the output parameters // *ref_block_start_x and *ref_block_start_y. If the method returns true, the - // caller uses all three output parameters. + // caller uses all four output parameters. static bool GetReferenceBlockPosition( int reference_frame_index, bool is_scaled, int width, int height, int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y, int start_x, int start_y, int step_x, int step_y, int left_border, int right_border, int top_border, int bottom_border, - int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x); + int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x, + int* ref_block_end_y); template <typename Pixel> void BuildConvolveBlock(Plane plane, int reference_frame_index, diff --git a/libgav1/src/tile/prediction.cc b/libgav1/src/tile/prediction.cc index bba5a69..4348548 100644 --- a/libgav1/src/tile/prediction.cc +++ b/libgav1/src/tile/prediction.cc @@ -771,11 +771,10 @@ bool Tile::InterPrediction(const Block& block, const Plane plane, const int x, [static_cast<int>(prediction_parameters.mask_is_inverse)]( block.scratch_buffer->prediction_buffer[0], block.scratch_buffer->prediction_buffer[1], - block.scratch_buffer->weight_mask, - kMaxSuperBlockSizeInPixels); + block.scratch_buffer->weight_mask, block.width); } prediction_mask = block.scratch_buffer->weight_mask; - prediction_mask_stride = kMaxSuperBlockSizeInPixels; + prediction_mask_stride = block.width; } if (is_compound) { @@ -996,7 +995,7 @@ bool Tile::GetReferenceBlockPosition( const int start_y, const int step_x, const int step_y, const int left_border, const int right_border, const int top_border, const int bottom_border, int* ref_block_start_x, int* ref_block_start_y, - int* ref_block_end_x) { + int* ref_block_end_x, int* ref_block_end_y) { *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0); *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0); if 
(reference_frame_index == -1) { @@ -1006,7 +1005,7 @@ bool Tile::GetReferenceBlockPosition( *ref_block_start_y -= kConvolveBorderLeftTop; *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) + kConvolveBorderRight; - int ref_block_end_y = + *ref_block_end_y = GetPixelPositionFromHighScale(start_y, step_y, height - 1) + kConvolveBorderBottom; if (is_scaled) { @@ -1015,13 +1014,13 @@ bool Tile::GetReferenceBlockPosition( kScaleSubPixelBits) + kSubPixelTaps; *ref_block_end_x += kConvolveScaleBorderRight - kConvolveBorderRight; - ref_block_end_y = *ref_block_start_y + block_height - 1; + *ref_block_end_y = *ref_block_start_y + block_height - 1; } // Determines if we need to extend beyond the left/right/top/bottom border. return *ref_block_start_x < (ref_start_x - left_border) || *ref_block_end_x > (ref_last_x + right_border) || *ref_block_start_y < (ref_start_y - top_border) || - ref_block_end_y > (ref_last_y + bottom_border); + *ref_block_end_y > (ref_last_y + bottom_border); } // Builds a block as the input for convolve, by copying the content of @@ -1140,6 +1139,7 @@ bool Tile::BlockInterPrediction( int ref_block_start_x; int ref_block_start_y; int ref_block_end_x; + int ref_block_end_y; const bool extend_block = GetReferenceBlockPosition( reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x, ref_start_y, ref_last_y, start_x, start_y, step_x, step_y, @@ -1147,24 +1147,15 @@ bool Tile::BlockInterPrediction( reference_buffer->right_border(plane), reference_buffer->top_border(plane), reference_buffer->bottom_border(plane), &ref_block_start_x, - &ref_block_start_y, &ref_block_end_x); + &ref_block_start_y, &ref_block_end_x, &ref_block_end_y); // In frame parallel mode, ensure that the reference block has been decoded // and available for referencing. 
if (reference_frame_index != -1 && frame_parallel_) { - int reference_y_max; - if (is_scaled) { - // TODO(vigneshv): For now, we wait for the entire reference frame to be - // decoded if we are using scaled references. This will eventually be - // fixed. - reference_y_max = reference_height; - } else { - reference_y_max = - std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y); - // For U and V planes with subsampling, we need to multiply - // reference_y_max by 2 since we only track the progress of Y planes. - reference_y_max = LeftShift(reference_y_max, subsampling_y); - } + // For U and V planes with subsampling, we need to multiply the value of + // ref_block_end_y by 2 since we only track the progress of the Y planes. + const int reference_y_max = LeftShift( + std::min(ref_block_end_y + kSubPixelTaps, ref_last_y), subsampling_y); if (reference_frame_progress_cache_[reference_frame_index] < reference_y_max && !reference_frames_[reference_frame_index]->WaitUntil( @@ -1297,11 +1288,12 @@ bool Tile::BlockWarpProcess(const Block& block, const Plane plane, start_x += 8) { const int src_x = (start_x + 4) << subsampling_x_[plane]; const int src_y = (start_y + 4) << subsampling_y_[plane]; - const int dst_y = src_x * warp_params->params[4] + - src_y * warp_params->params[5] + - warp_params->params[1]; - const int y4 = dst_y >> subsampling_y_[plane]; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const int64_t dst_y = + src_x * warp_params->params[4] + + static_cast<int64_t>(src_y) * warp_params->params[5] + + warp_params->params[1]; + const int64_t y4 = dst_y >> subsampling_y_[plane]; + const int iy4 = static_cast<int>(y4 >> kWarpedModelPrecisionBits); reference_y_max = std::max(iy4 + 8, reference_y_max); } } diff --git a/libgav1/src/utils/constants.h b/libgav1/src/utils/constants.h index 1126ad6..8281aad 100644 --- a/libgav1/src/utils/constants.h +++ b/libgav1/src/utils/constants.h @@ -37,6 +37,10 @@ enum { }; // anonymous enum enum { + // 
Documentation variables. + kBitdepth8 = 8, + kBitdepth10 = 10, + kBitdepth12 = 12, kInvalidMvValue = -32768, kCdfMaxProbability = 32768, kBlockWidthCount = 5, @@ -59,6 +63,13 @@ enum { kRestorationTypeSymbolCount = 3, kSgrProjParamsBits = 4, kSgrProjPrecisionBits = 7, + // Precision of a division table (mtable) + kSgrProjScaleBits = 20, + kSgrProjReciprocalBits = 12, + // Core self-guided restoration precision bits. + kSgrProjSgrBits = 8, + // Precision bits of generated values higher than source before projection. + kSgrProjRestoreBits = 4, // Padding on left and right side of a restoration block. // 3 is enough, but padding to 4 is more efficient, and makes the temporary // source buffer 8-pixel aligned. @@ -177,6 +188,15 @@ enum { // On Linux, the cache line size can be looked up with the command: // getconf LEVEL1_DCACHE_LINESIZE kCacheLineSize = 64, + // InterRound0, Section 7.11.3.2. + kInterRoundBitsHorizontal = 3, // 8 & 10-bit. + kInterRoundBitsHorizontal12bpp = 5, + kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction. + kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction. + kInterRoundBitsVertical12bpp = 9, + // Offset applied to 10bpp and 12bpp predictors to allow storing them in + // uint16_t. Removed before blending. 
+ kCompoundOffset = (1 << 14) + (1 << 13), }; // anonymous enum enum FrameType : uint8_t { diff --git a/libgav1/src/utils/segmentation_map.cc b/libgav1/src/utils/segmentation_map.cc index 4284ca2..bbf40c3 100644 --- a/libgav1/src/utils/segmentation_map.cc +++ b/libgav1/src/utils/segmentation_map.cc @@ -21,9 +21,12 @@ namespace libgav1 { bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) { + if (rows4x4 * columns4x4 > rows4x4_ * columns4x4_) { + segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4 * columns4x4]); + } + rows4x4_ = rows4x4; columns4x4_ = columns4x4; - segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]); if (segment_id_buffer_ == nullptr) return false; segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get()); return true; diff --git a/libgav1/src/warp_prediction.cc b/libgav1/src/warp_prediction.cc index 69b40e8..0da8a1f 100644 --- a/libgav1/src/warp_prediction.cc +++ b/libgav1/src/warp_prediction.cc @@ -231,9 +231,6 @@ bool WarpEstimation(const int num_samples, const int block_width4x4, Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1); params[1] = Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1); - - params[6] = 0; - params[7] = 0; return true; } |