diff options
Diffstat (limited to 'libgav1/src/tile/tile.cc')
-rw-r--r-- | libgav1/src/tile/tile.cc | 1802 |
1 files changed, 994 insertions, 808 deletions
diff --git a/libgav1/src/tile/tile.cc b/libgav1/src/tile/tile.cc index 96c724f..f79158f 100644 --- a/libgav1/src/tile/tile.cc +++ b/libgav1/src/tile/tile.cc @@ -17,6 +17,7 @@ #include <algorithm> #include <array> #include <cassert> +#include <climits> #include <cstdlib> #include <cstring> #include <memory> @@ -25,9 +26,12 @@ #include <type_traits> #include <utility> +#include "src/frame_scratch_buffer.h" #include "src/motion_vector.h" #include "src/reconstruction.h" #include "src/utils/bit_mask_set.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" #include "src/utils/logging.h" #include "src/utils/segmentation.h" #include "src/utils/stack.h" @@ -45,8 +49,6 @@ constexpr int kReferenceScaleShift = 14; // process is activated. constexpr int kQuantizerCoefficientBaseRange = 12; constexpr int kNumQuantizerBaseLevels = 2; -constexpr int kQuantizerCoefficientBaseRangeContextClamp = - kQuantizerCoefficientBaseRange + kNumQuantizerBaseLevels + 1; constexpr int kCoeffBaseRangeMaxIterations = kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1); constexpr int kEntropyContextLeft = 0; @@ -99,6 +101,14 @@ constexpr PredictionMode kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal, kPredictionModeD157, kPredictionModeDc}; +// Mask used to determine the index for mode_deltas lookup. +constexpr BitMaskSet kPredictionModeDeltasMask( + kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv, + kPredictionModeNearestNearestMv, kPredictionModeNearNearMv, + kPredictionModeNearestNewMv, kPredictionModeNewNearestMv, + kPredictionModeNearNewMv, kPredictionModeNewNearMv, + kPredictionModeNewNewMv); + // This is computed as: // min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4. constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = { @@ -146,7 +156,10 @@ constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = { {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}}; /* clang-format on */ -constexpr uint8_t kCoeffBasePositionContextOffset[3] = {26, 31, 36}; +// Extended the table size from 3 to 16 by repeating the last element to avoid +// the clips to row or column indices. +constexpr uint8_t kCoeffBasePositionContextOffset[16] = { + 26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36}; constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = { kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal, @@ -235,7 +248,7 @@ constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = { constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31}; -constexpr int8_t kWienerDefaultFilter[3] = {3, -7, 15}; +constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15}; // Maps compound prediction modes into single modes. For e.g. // kPredictionModeNearestNewMv will map to kPredictionModeNearestMv for index 0 @@ -264,31 +277,8 @@ PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) { // log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because // dqDenom is always a power of two and hence right shift can be used instead of // division. -constexpr BitMaskSet kQuantizationShift2Mask(kTransformSize32x64, - kTransformSize64x32, - kTransformSize64x64); -constexpr BitMaskSet kQuantizationShift1Mask(kTransformSize16x32, - kTransformSize16x64, - kTransformSize32x16, - kTransformSize32x32, - kTransformSize64x16); -int GetQuantizationShift(TransformSize tx_size) { - if (kQuantizationShift2Mask.Contains(tx_size)) { - return 2; - } - if (kQuantizationShift1Mask.Contains(tx_size)) { - return 1; - } - return 0; -} - -// Input: 1d array index |index|, which indexes into a 2d array of width -// 1 << |tx_width_log2|. -// Output: 1d array index which indexes into a 2d array of width -// (1 << |tx_width_log2|) + kQuantizedCoefficientBufferPadding. -int PaddedIndex(int index, int tx_width_log2) { - return index + MultiplyBy4(index >> tx_width_log2); -} +constexpr uint8_t kQuantizationShift[kNumTransformSizes] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2}; // Returns the minimum of |length| or |max|-|start|. This is used to clamp array // indices when accessing arrays whose bound is equal to |max|. @@ -296,40 +286,151 @@ int GetNumElements(int length, int start, int max) { return std::min(length, max - start); } +template <typename T> +void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) { + // Specialize all columns cases (values in kTransformWidth4x4[]) for better + // performance. + switch (columns) { + case 1: + MemSetBlock<T>(rows, 1, value, dst, stride); + break; + case 2: + MemSetBlock<T>(rows, 2, value, dst, stride); + break; + case 4: + MemSetBlock<T>(rows, 4, value, dst, stride); + break; + case 8: + MemSetBlock<T>(rows, 8, value, dst, stride); + break; + default: + assert(columns == 16); + MemSetBlock<T>(rows, 16, value, dst, stride); + break; + } +} + void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4, TransformType tx_type, TransformType transform_types[32][32]) { const int y_offset = y4 - block.row4x4; const int x_offset = x4 - block.column4x4; - static_assert(sizeof(transform_types[0][0]) == 1, ""); - for (int i = 0; i < h4; ++i) { - memset(&transform_types[y_offset + i][x_offset], tx_type, w4); - } + TransformType* const dst = &transform_types[y_offset][x_offset]; + SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32); +} + +void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store, + const MotionVector& mv_to_store, ptrdiff_t stride, + int rows, int columns, + ReferenceFrameType* reference_frame_row_start, + MotionVector* mv) { + static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), ""); + do { + // Don't switch the following two memory setting functions. + // Some ARM CPUs are quite sensitive to the order. + memset(reference_frame_row_start, reference_frame_to_store, columns); + std::fill(mv, mv + columns, mv_to_store); + reference_frame_row_start += stride; + mv += stride; + } while (--rows != 0); +} + +// Inverse transform process assumes that the quantized coefficients are stored +// as a virtual 2d array of size |tx_width| x tx_height. If transform width is +// 64, then this assumption is broken because the scan order used for populating +// the coefficients for such transforms is the same as the one used for +// corresponding transform with width 32 (e.g. the scan order used for 64x16 is +// the same as the one used for 32x16). So we must restore the coefficients to +// their correct positions and clean the positions they occupied. +template <typename ResidualType> +void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width, + ResidualType* residual) { + if (tx_width != 64) return; + const int rows = clamped_tx_height - 2; + auto* src = residual + 32 * rows; + residual += 64 * rows; + // Process 2 rows in each loop in reverse order to avoid overwrite. + int x = rows >> 1; + do { + // The 2 rows can be processed in order. + memcpy(residual, src, 32 * sizeof(src[0])); + memcpy(residual + 64, src + 32, 32 * sizeof(src[0])); + memset(src + 32, 0, 32 * sizeof(src[0])); + src -= 64; + residual -= 128; + } while (--x); + // Process the second row. The first row is already correct. + memcpy(residual + 64, src + 32, 32 * sizeof(src[0])); + memset(src + 32, 0, 32 * sizeof(src[0])); +} + +void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) { + // 7.10.2.14 (part 1). (also contains implementations of 5.11.53 + // and 5.11.54). + constexpr int kMvBorder4x4 = 4; + const int row_border = kMvBorder4x4 + block.height4x4; + const int column_border = kMvBorder4x4 + block.width4x4; + const int macroblocks_to_top_edge = -block.row4x4; + const int macroblocks_to_bottom_edge = + block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4; + const int macroblocks_to_left_edge = -block.column4x4; + const int macroblocks_to_right_edge = + block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4; + min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border); + min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border); + max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border); + max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border); +} + +// Section 8.3.2 in the spec, under coeff_base_eob. +int GetCoeffBaseContextEob(TransformSize tx_size, int index) { + if (index == 0) return 0; + const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size]; + const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size]; + const int tx_height = kTransformHeight[adjusted_tx_size]; + if (index <= DivideBy8(tx_height << tx_width_log2)) return 1; + if (index <= DivideBy4(tx_height << tx_width_log2)) return 2; + return 3; +} + +// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based +// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in +// the end of block case. +int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos, + TransformClass tx_class) { + if (pos == 0) return 0; + const int tx_width = 1 << adjusted_tx_width_log2; + const int row = pos >> adjusted_tx_width_log2; + const int column = pos & (tx_width - 1); + // This return statement is equivalent to: + // return ((tx_class == kTransformClass2D && (row | column) < 2) || + // (tx_class == kTransformClassHorizontal && column == 0) || + // (tx_class == kTransformClassVertical && row == 0)) + // ? 7 + // : 14; + return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) & + static_cast<int>((row | column) < 2)) | + (tx_class & static_cast<int>(column == 0)) | + ((tx_class >> 1) & static_cast<int>(row == 0))); } } // namespace -Tile::Tile( - int tile_number, const uint8_t* const data, size_t size, - const ObuSequenceHeader& sequence_header, - const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame, - const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias, - const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>& - reference_frames, - Array2D<TemporalMotionVector>* const motion_field_mv, - const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint, - const std::array<uint8_t, kWedgeMaskSize>& wedge_masks, - const SymbolDecoderContext& symbol_decoder_context, - SymbolDecoderContext* const saved_symbol_decoder_context, - const SegmentationMap* prev_segment_ids, PostFilter* const post_filter, - BlockParametersHolder* const block_parameters_holder, - Array2D<int16_t>* const cdef_index, - Array2D<TransformSize>* const inter_transform_sizes, - const dsp::Dsp* const dsp, ThreadPool* const thread_pool, - ResidualBufferPool* const residual_buffer_pool, - DecoderScratchBufferPool* const decoder_scratch_buffer_pool, - BlockingCounterWithStatus* const pending_tiles) +Tile::Tile(int tile_number, const uint8_t* const data, size_t size, + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + RefCountedBuffer* const current_frame, const DecoderState& state, + FrameScratchBuffer* const frame_scratch_buffer, + const WedgeMaskArray& wedge_masks, + SymbolDecoderContext* const saved_symbol_decoder_context, + const SegmentationMap* prev_segment_ids, + PostFilter* const post_filter, const dsp::Dsp* const dsp, + ThreadPool* const thread_pool, + BlockingCounterWithStatus* const pending_tiles, bool frame_parallel, + bool use_intra_prediction_buffer) : number_(tile_number), + row_(number_ / frame_header.tile_info.tile_columns), + column_(number_ % frame_header.tile_info.tile_columns), data_(data), size_(size), read_deltas_(false), @@ -340,19 +441,18 @@ Tile::Tile( current_quantizer_index_(frame_header.quantizer.base_index), sequence_header_(sequence_header), frame_header_(frame_header), - current_frame_(*current_frame), - reference_frame_sign_bias_(reference_frame_sign_bias), - reference_frames_(reference_frames), - motion_field_mv_(motion_field_mv), - reference_order_hint_(reference_order_hint), + reference_frame_sign_bias_(state.reference_frame_sign_bias), + reference_frames_(state.reference_frame), + motion_field_(frame_scratch_buffer->motion_field), + reference_order_hint_(state.reference_order_hint), wedge_masks_(wedge_masks), reader_(data_, size_, frame_header_.enable_cdf_update), - symbol_decoder_context_(symbol_decoder_context), + symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context), saved_symbol_decoder_context_(saved_symbol_decoder_context), prev_segment_ids_(prev_segment_ids), dsp_(*dsp), post_filter_(*post_filter), - block_parameters_holder_(*block_parameters_holder), + block_parameters_holder_(frame_scratch_buffer->block_parameters_holder), quantizer_(sequence_header_.color_config.bitdepth, &frame_header_.quantizer), residual_size_((sequence_header_.color_config.bitdepth == 8) @@ -362,15 +462,20 @@ Tile::Tile( frame_header_.allow_intrabc ? (sequence_header_.use_128x128_superblock ? 3 : 5) : 1), - cdef_index_(*cdef_index), - inter_transform_sizes_(*inter_transform_sizes), + current_frame_(*current_frame), + cdef_index_(frame_scratch_buffer->cdef_index), + inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes), thread_pool_(thread_pool), - residual_buffer_pool_(residual_buffer_pool), - decoder_scratch_buffer_pool_(decoder_scratch_buffer_pool), + residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()), + tile_scratch_buffer_pool_( + &frame_scratch_buffer->tile_scratch_buffer_pool), pending_tiles_(pending_tiles), - build_bit_mask_when_parsing_(false) { - row_ = number_ / frame_header.tile_info.tile_columns; - column_ = number_ % frame_header.tile_info.tile_columns; + frame_parallel_(frame_parallel), + use_intra_prediction_buffer_(use_intra_prediction_buffer), + intra_prediction_buffer_( + use_intra_prediction_buffer_ + ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_] + : nullptr) { row4x4_start_ = frame_header.tile_info.tile_row_start[row_]; row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1]; column4x4_start_ = frame_header.tile_info.tile_column_start[column_]; @@ -382,16 +487,71 @@ Tile::Tile( superblock_columns_ = (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2; - // Enable multi-threading within a tile only if there are at least as many - // superblock columns as |intra_block_copy_lag_|. - split_parse_and_decode_ = - thread_pool_ != nullptr && superblock_columns_ > intra_block_copy_lag_; + // If |split_parse_and_decode_| is true, we do the necessary setup for + // splitting the parsing and the decoding steps. This is done in the following + // two cases: + // 1) If there is multi-threading within a tile (this is done if + // |thread_pool_| is not nullptr and if there are at least as many + // superblock columns as |intra_block_copy_lag_|). + // 2) If |frame_parallel| is true. + split_parse_and_decode_ = (thread_pool_ != nullptr && + superblock_columns_ > intra_block_copy_lag_) || + frame_parallel; + if (frame_parallel_) { + reference_frame_progress_cache_.fill(INT_MIN); + } memset(delta_lf_, 0, sizeof(delta_lf_)); delta_lf_all_zero_ = true; - YuvBuffer* const buffer = current_frame->buffer(); + const YuvBuffer& buffer = post_filter_.frame_buffer(); for (int plane = 0; plane < PlaneCount(); ++plane) { - buffer_[plane].Reset(buffer->height(plane) + buffer->bottom_border(plane), - buffer->stride(plane), buffer->data(plane)); + // Verify that the borders are big enough for Reconstruct(). max_tx_length + // is the maximum value of tx_width and tx_height for the plane. + const int max_tx_length = (plane == kPlaneY) ? 64 : 32; + // Reconstruct() may overwrite on the right. Since the right border of a + // row is followed in memory by the left border of the next row, the + // number of extra pixels to the right of a row is at least the sum of the + // left and right borders. + // + // Note: This assertion actually checks the sum of the left and right + // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally + // and vertically shifted version of |buffer|. Since the sum of the left and + // right borders is not changed by the shift, we can just check the sum of + // the left and right borders of |buffer|. + assert(buffer.left_border(plane) + buffer.right_border(plane) >= + max_tx_length - 1); + // Reconstruct() may overwrite on the bottom. We need an extra border row + // on the bottom because we need the left border of that row. + // + // Note: This assertion checks the bottom border of + // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical + // shift that the PostFilter constructor applied to |buffer| and reduce the + // bottom border by that amount. +#ifndef NDEBUG + const int vertical_shift = static_cast<int>( + (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) / + buffer.stride(plane)); + const int bottom_border = buffer.bottom_border(plane) - vertical_shift; + assert(bottom_border >= max_tx_length); +#endif + // In AV1, a transform block of height H starts at a y coordinate that is + // a multiple of H. If a transform block at the bottom of the frame has + // height H, then Reconstruct() will write up to the row with index + // Align(buffer.height(plane), H) - 1. Therefore the maximum number of + // rows Reconstruct() may write to is + // Align(buffer.height(plane), max_tx_length). + buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length), + buffer.stride(plane), + post_filter_.GetUnfilteredBuffer(plane)); + const int plane_height = + RightShiftWithRounding(frame_header_.height, subsampling_y_[plane]); + deblock_row_limit_[plane] = + std::min(frame_header_.rows4x4, DivideBy4(plane_height + 3) + << subsampling_y_[plane]); + const int plane_width = + RightShiftWithRounding(frame_header_.width, subsampling_x_[plane]); + deblock_column_limit_[plane] = + std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3) + << subsampling_x_[plane]); } } @@ -418,7 +578,10 @@ bool Tile::Init() { return false; } } else { - residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(32, 4096 * residual_size_); + // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary + // checks when parsing quantized coefficients. + residual_buffer_ = MakeAlignedUniquePtr<uint8_t>( + 32, (4096 + 32 * kResidualPaddingVertical) * residual_size_); if (residual_buffer_ == nullptr) { LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed."); return false; @@ -429,62 +592,165 @@ bool Tile::Init() { return false; } } + if (frame_header_.use_ref_frame_mvs) { + assert(sequence_header_.enable_order_hint); + SetupMotionField(frame_header_, current_frame_, reference_frames_, + row4x4_start_, row4x4_end_, column4x4_start_, + column4x4_end_, &motion_field_); + } + ResetLoopRestorationParams(); return true; } -bool Tile::Decode(bool is_main_thread) { - if (!Init()) { - pending_tiles_->Decrement(false); - return false; +template <ProcessingMode processing_mode, bool save_symbol_decoder_context> +bool Tile::ProcessSuperBlockRow(int row4x4, + TileScratchBuffer* const scratch_buffer) { + if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true; + assert(scratch_buffer != nullptr); + const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; + for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_; + column4x4 += block_width4x4) { + if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer, + processing_mode)) { + LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d", + row4x4, column4x4); + return false; + } } - if (frame_header_.use_ref_frame_mvs) { - SetupMotionField(sequence_header_, frame_header_, current_frame_, - reference_frames_, motion_field_mv_, row4x4_start_, - row4x4_end_, column4x4_start_, column4x4_end_); + if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) { + SaveSymbolDecoderContext(); } - ResetLoopRestorationParams(); + if (processing_mode == kProcessingModeDecodeOnly || + processing_mode == kProcessingModeParseAndDecode) { + PopulateIntraPredictionBuffer(row4x4); + } + return true; +} + +// Used in frame parallel mode. The symbol decoder context need not be saved in +// this case since it was done when parsing was complete. +template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>( + int row4x4, TileScratchBuffer* scratch_buffer); +// Used in non frame parallel mode. +template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>( + int row4x4, TileScratchBuffer* scratch_buffer); + +void Tile::SaveSymbolDecoderContext() { + if (frame_header_.enable_frame_end_update_cdf && + number_ == frame_header_.tile_info.context_update_id) { + *saved_symbol_decoder_context_ = symbol_decoder_context_; + } +} + +bool Tile::ParseAndDecode() { // If this is the main thread, we build the loop filter bit masks when parsing // so that it happens in the current thread. This ensures that the main thread // does as much work as possible. - build_bit_mask_when_parsing_ = is_main_thread; if (split_parse_and_decode_) { - if (!ThreadedDecode()) return false; - } else { - const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; - std::unique_ptr<DecoderScratchBuffer> scratch_buffer = - decoder_scratch_buffer_pool_->Get(); - if (scratch_buffer == nullptr) { + if (!ThreadedParseAndDecode()) return false; + SaveSymbolDecoderContext(); + return true; + } + std::unique_ptr<TileScratchBuffer> scratch_buffer = + tile_scratch_buffer_pool_->Get(); + if (scratch_buffer == nullptr) { + pending_tiles_->Decrement(false); + LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer."); + return false; + } + const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; + for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_; + row4x4 += block_width4x4) { + if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>( + row4x4, scratch_buffer.get())) { pending_tiles_->Decrement(false); - LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer."); return false; } - for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_; - row4x4 += block_width4x4) { - for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_; - column4x4 += block_width4x4) { - if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, - scratch_buffer.get(), - kProcessingModeParseAndDecode)) { - pending_tiles_->Decrement(false); - LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d", - row4x4, column4x4); - return false; - } - } + } + tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); + pending_tiles_->Decrement(true); + return true; +} + +bool Tile::Parse() { + const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; + std::unique_ptr<TileScratchBuffer> scratch_buffer = + tile_scratch_buffer_pool_->Get(); + if (scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer."); + return false; + } + for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_; + row4x4 += block_width4x4) { + if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>( + row4x4, scratch_buffer.get())) { + return false; } - decoder_scratch_buffer_pool_->Release(std::move(scratch_buffer)); } - if (frame_header_.enable_frame_end_update_cdf && - number_ == frame_header_.tile_info.context_update_id) { - *saved_symbol_decoder_context_ = symbol_decoder_context_; + tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); + SaveSymbolDecoderContext(); + return true; +} + +bool Tile::Decode( + std::mutex* const mutex, int* const superblock_row_progress, + std::condition_variable* const superblock_row_progress_condvar) { + const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16; + const int block_width4x4_log2 = + sequence_header_.use_128x128_superblock ? 5 : 4; + std::unique_ptr<TileScratchBuffer> scratch_buffer = + tile_scratch_buffer_pool_->Get(); + if (scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer."); + return false; } - if (!split_parse_and_decode_) { - pending_tiles_->Decrement(true); + for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2; + row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) { + if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>( + row4x4, scratch_buffer.get())) { + return false; + } + if (post_filter_.DoDeblock()) { + // Apply vertical deblock filtering for all the columns in this tile + // except for the first 64 columns. + post_filter_.ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, + column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_, + block_width4x4); + // If this is the first superblock row of the tile, then we cannot apply + // horizontal deblocking here since we don't know if the top row is + // available. So it will be done by the calling thread in that case. + if (row4x4 != row4x4_start_) { + // Apply horizontal deblock filtering for all the columns in this tile + // except for the first and the last 64 columns. + // Note about the last tile of each row: For the last tile, + // column4x4_end may not be a multiple of 16. In that case it is still + // okay to simply subtract 16 since ApplyDeblockFilter() will only do + // the filters in increments of 64 columns (or 32 columns for chroma + // with subsampling). + post_filter_.ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, + column4x4_start_ + kNum4x4InLoopFilterUnit, + column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4); + } + } + bool notify; + { + std::unique_lock<std::mutex> lock(*mutex); + notify = ++superblock_row_progress[index] == + frame_header_.tile_info.tile_columns; + } + if (notify) { + // We are done decoding this superblock row. Notify the post filtering + // thread. + superblock_row_progress_condvar[index].notify_one(); + } } + tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); return true; } -bool Tile::ThreadedDecode() { +bool Tile::ThreadedParseAndDecode() { { std::lock_guard<std::mutex> lock(threading_.mutex); if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) { @@ -499,8 +765,8 @@ bool Tile::ThreadedDecode() { const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; // Begin parsing. - std::unique_ptr<DecoderScratchBuffer> scratch_buffer = - decoder_scratch_buffer_pool_->Get(); + std::unique_ptr<TileScratchBuffer> scratch_buffer = + tile_scratch_buffer_pool_->Get(); if (scratch_buffer == nullptr) { pending_tiles_->Decrement(false); LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer."); @@ -535,7 +801,7 @@ bool Tile::ThreadedDecode() { std::lock_guard<std::mutex> lock(threading_.mutex); if (threading_.abort) break; } - decoder_scratch_buffer_pool_->Release(std::move(scratch_buffer)); + tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); // We are done parsing. We can return here since the calling thread will make // sure that it waits for all the superblocks to be decoded. @@ -593,13 +859,13 @@ void Tile::DecodeSuperBlock(int row_index, int column_index, int block_width4x4) { const int row4x4 = row4x4_start_ + (row_index * block_width4x4); const int column4x4 = column4x4_start_ + (column_index * block_width4x4); - std::unique_ptr<DecoderScratchBuffer> scratch_buffer = - decoder_scratch_buffer_pool_->Get(); + std::unique_ptr<TileScratchBuffer> scratch_buffer = + tile_scratch_buffer_pool_->Get(); bool ok = scratch_buffer != nullptr; if (ok) { ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer.get(), kProcessingModeDecodeOnly); - decoder_scratch_buffer_pool_->Release(std::move(scratch_buffer)); + tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); } std::unique_lock<std::mutex> lock(threading_.mutex); if (ok) { @@ -647,9 +913,38 @@ void Tile::DecodeSuperBlock(int row_index, int column_index, } } -bool Tile::IsInside(int row4x4, int column4x4) const { - return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_ && - column4x4 >= column4x4_start_ && column4x4 < column4x4_end_; +void Tile::PopulateIntraPredictionBuffer(int row4x4) { + const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; + if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) { + return; + } + const size_t pixel_size = + (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t) + : sizeof(uint16_t)); + for (int plane = 0; plane < PlaneCount(); ++plane) { + const int row_to_copy = + (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1; + const size_t pixels_to_copy = + (MultiplyBy4(column4x4_end_ - column4x4_start_) >> + subsampling_x_[plane]) * + pixel_size; + const size_t column_start = + MultiplyBy4(column4x4_start_) >> subsampling_x_[plane]; + void* start; +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (sequence_header_.color_config.bitdepth > 8) { + Array2DView<uint16_t> buffer( + buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t), + reinterpret_cast<uint16_t*>(&buffer_[plane][0][0])); + start = &buffer[row_to_copy][column_start]; + } else // NOLINT +#endif + { + start = &buffer_[plane][row_to_copy][column_start]; + } + memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size, + start, pixels_to_copy); + } } int Tile::GetTransformAllZeroContext(const Block& block, Plane plane, @@ -660,7 +955,7 @@ int Tile::GetTransformAllZeroContext(const Block& block, Plane plane, const int tx_width = kTransformWidth[tx_size]; const int tx_height = kTransformHeight[tx_size]; - const BlockSize plane_size = block.residual_size[GetPlaneType(plane)]; + const BlockSize plane_size = block.residual_size[plane]; const int block_width = kBlockWidthPixels[plane_size]; const int block_height = kBlockHeightPixels[plane_size]; @@ -785,150 +1080,167 @@ void Tile::ReadTransformType(const Block& block, int x4, int y4, kTransformHeight4x4[tx_size], tx_type, transform_types_); } -// Section 8.3.2 in the spec, under coeff_base_eob. -int Tile::GetCoeffBaseContextEob(TransformSize tx_size, int index) { - if (index == 0) return 0; - const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size]; - const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size]; - const int tx_height = kTransformHeight[adjusted_tx_size]; - if (index <= DivideBy8(tx_height << tx_width_log2)) return 1; - if (index <= DivideBy4(tx_height << tx_width_log2)) return 2; - return 3; -} - -// Section 8.3.2 in the spec, under coeff_base. -int Tile::GetCoeffBaseContext2D(const int32_t* const quantized_buffer, - TransformSize tx_size, - int adjusted_tx_width_log2, uint16_t pos) { - if (pos == 0) return 0; +// Section 8.3.2 in the spec, under coeff_base and coeff_br. +// Bottom boundary checks are avoided by the padded rows. +// For a coefficient near the right boundary, the two right neighbors and the +// one bottom-right neighbor may be out of boundary. We don't check the right +// boundary for them, because the out of boundary neighbors project to positions +// above the diagonal line which goes through the current coefficient and these +// positions are still all 0s according to the diagonal scan order. +template <typename ResidualType> +void Tile::ReadCoeffBase2D( + const uint16_t* scan, PlaneType plane_type, TransformSize tx_size, + int clamped_tx_size_context, int adjusted_tx_width_log2, int eob, + uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1], + ResidualType* const quantized_buffer) { const int tx_width = 1 << adjusted_tx_width_log2; - const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding; - const int32_t* const quantized = - &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)]; - const int context = std::min( - 4, DivideBy2(1 + (std::min(quantized[1], 3) + // {0, 1} - std::min(quantized[padded_tx_width], 3) + // {1, 0} - std::min(quantized[padded_tx_width + 1], 3) + // {1, 1} - std::min(quantized[2], 3) + // {0, 2} - std::min(quantized[MultiplyBy2(padded_tx_width)], - 3)))); // {2, 0} - const int row = pos >> adjusted_tx_width_log2; - const int column = pos & (tx_width - 1); - return context + kCoeffBaseContextOffset[tx_size][std::min(row, 4)] - [std::min(column, 4)]; + int i = eob - 2; + do { + constexpr auto threshold = static_cast<ResidualType>(3); + const uint16_t pos = scan[i]; + const int row = pos >> adjusted_tx_width_log2; + const int column = pos & (tx_width - 1); + auto* const quantized = &quantized_buffer[pos]; + int context; + if (pos == 0) { + context = 0; + } else { + context = std::min( + 4, DivideBy2( + 1 + (std::min(quantized[1], threshold) + // {0, 1} + std::min(quantized[tx_width], threshold) + // {1, 0} + std::min(quantized[tx_width + 1], threshold) + // {1, 1} + std::min(quantized[2], threshold) + // {0, 2} + std::min(quantized[MultiplyBy2(tx_width)], + threshold)))); // {2, 0} + context += kCoeffBaseContextOffset[tx_size][std::min(row, 4)] + [std::min(column, 4)]; + } + int level = + reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]); + if (level > kNumQuantizerBaseLevels) { + // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS + // + 1, because we clip the overall output to 6 and the unclipped + // quantized values will always result in an output of greater than 6. + context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1} + quantized[tx_width] + // {1, 0} + quantized[tx_width + 1])); // {1, 1} + if (pos != 0) { + context += 14 >> static_cast<int>((row | column) < 2); + } + level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type); + } + quantized[0] = level; + } while (--i >= 0); } -// Section 8.3.2 in the spec, under coeff_base. -int Tile::GetCoeffBaseContextHorizontal(const int32_t* const quantized_buffer, - TransformSize /*tx_size*/, - int adjusted_tx_width_log2, - uint16_t pos) { +// Section 8.3.2 in the spec, under coeff_base and coeff_br. +// Bottom boundary checks are avoided by the padded rows. +// For a coefficient near the right boundary, the four right neighbors may be +// out of boundary. We don't do the boundary check for the first three right +// neighbors, because even for the transform blocks with smallest width 4, the +// first three out of boundary neighbors project to positions left of the +// current coefficient and these positions are still all 0s according to the +// column scan order. However, when transform block width is 4 and the current +// coefficient is on the right boundary, its fourth right neighbor projects to +// the under position on the same column, which could be nonzero. Therefore, we +// must skip the fourth right neighbor. To make it simple, for any coefficient, +// we always do the boundary check for its fourth right neighbor. +template <typename ResidualType> +void Tile::ReadCoeffBaseHorizontal( + const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/, + int clamped_tx_size_context, int adjusted_tx_width_log2, int eob, + uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1], + ResidualType* const quantized_buffer) { const int tx_width = 1 << adjusted_tx_width_log2; - const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding; - const int32_t* const quantized = - &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)]; - const int context = std::min( - 4, DivideBy2(1 + (std::min(quantized[1], 3) + // {0, 1} - std::min(quantized[padded_tx_width], 3) + // {1, 0} - std::min(quantized[2], 3) + // {0, 2} - std::min(quantized[3], 3) + // {0, 3} - std::min(quantized[4], 3)))); // {0, 4} - const int index = pos & (tx_width - 1); - return context + kCoeffBasePositionContextOffset[std::min(index, 2)]; + int i = eob - 2; + do { + constexpr auto threshold = static_cast<ResidualType>(3); + const uint16_t pos = scan[i]; + const int column = pos & (tx_width - 1); + auto* const quantized = &quantized_buffer[pos]; + int context = std::min( + 4, + DivideBy2(1 + + (std::min(quantized[1], threshold) + // {0, 1} + std::min(quantized[tx_width], threshold) + // {1, 0} + std::min(quantized[2], threshold) + // {0, 2} + std::min(quantized[3], threshold) + // {0, 3} + std::min(quantized[4], + static_cast<ResidualType>( + (column + 4 < tx_width) ? 3 : 0))))); // {0, 4} + context += kCoeffBasePositionContextOffset[column]; + int level = + reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]); + if (level > kNumQuantizerBaseLevels) { + // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS + // + 1, because we clip the overall output to 6 and the unclipped + // quantized values will always result in an output of greater than 6. + context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1} + quantized[tx_width] + // {1, 0} + quantized[2])); // {0, 2} + if (pos != 0) { + context += 14 >> static_cast<int>(column == 0); + } + level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type); + } + quantized[0] = level; + } while (--i >= 0); } -// Section 8.3.2 in the spec, under coeff_base. -int Tile::GetCoeffBaseContextVertical(const int32_t* const quantized_buffer, - TransformSize /*tx_size*/, - int adjusted_tx_width_log2, - uint16_t pos) { +// Section 8.3.2 in the spec, under coeff_base and coeff_br. +// Bottom boundary checks are avoided by the padded rows. +// Right boundary check is performed explicitly. +template <typename ResidualType> +void Tile::ReadCoeffBaseVertical( + const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/, + int clamped_tx_size_context, int adjusted_tx_width_log2, int eob, + uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1], + ResidualType* const quantized_buffer) { const int tx_width = 1 << adjusted_tx_width_log2; - const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding; - const int32_t* const quantized = - &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)]; - const int context = std::min( - 4, DivideBy2(1 + (std::min(quantized[1], 3) + // {0, 1} - std::min(quantized[padded_tx_width], 3) + // {1, 0} - std::min(quantized[MultiplyBy2(padded_tx_width)], - 3) + // {2, 0} - std::min(quantized[padded_tx_width * 3], 3) + // {3, 0} - std::min(quantized[MultiplyBy4(padded_tx_width)], - 3)))); // {4, 0} - - const int index = pos >> adjusted_tx_width_log2; - return context + kCoeffBasePositionContextOffset[std::min(index, 2)]; -} - -// Section 8.3.2 in the spec, under coeff_br. -int Tile::GetCoeffBaseRangeContext2D(const int32_t* const quantized_buffer, - int adjusted_tx_width_log2, int pos) { - const uint8_t tx_width = 1 << adjusted_tx_width_log2; - const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding; - const int32_t* const quantized = - &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)]; - const int context = std::min( - 6, DivideBy2( - 1 + - std::min(quantized[1], - kQuantizerCoefficientBaseRangeContextClamp) + // {0, 1} - std::min(quantized[padded_tx_width], - kQuantizerCoefficientBaseRangeContextClamp) + // {1, 0} - std::min(quantized[padded_tx_width + 1], - kQuantizerCoefficientBaseRangeContextClamp))); // {1, 1} - if (pos == 0) return context; - const int row = pos >> adjusted_tx_width_log2; - const int column = pos & (tx_width - 1); - return context + (((row | column) < 2) ? 7 : 14); -} - -// Section 8.3.2 in the spec, under coeff_br. -int Tile::GetCoeffBaseRangeContextHorizontal( - const int32_t* const quantized_buffer, int adjusted_tx_width_log2, - int pos) { - const uint8_t tx_width = 1 << adjusted_tx_width_log2; - const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding; - const int32_t* const quantized = - &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)]; - const int context = std::min( - 6, DivideBy2( - 1 + - std::min(quantized[1], - kQuantizerCoefficientBaseRangeContextClamp) + // {0, 1} - std::min(quantized[padded_tx_width], - kQuantizerCoefficientBaseRangeContextClamp) + // {1, 0} - std::min(quantized[2], - kQuantizerCoefficientBaseRangeContextClamp))); // {0, 2} - if (pos == 0) return context; - const int column = pos & (tx_width - 1); - return context + ((column == 0) ? 7 : 14); -} - -// Section 8.3.2 in the spec, under coeff_br. -int Tile::GetCoeffBaseRangeContextVertical( - const int32_t* const quantized_buffer, int adjusted_tx_width_log2, - int pos) { - const uint8_t tx_width = 1 << adjusted_tx_width_log2; - const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding; - const int32_t* const quantized = - &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)]; - const int context = std::min( - 6, DivideBy2( - 1 + - std::min(quantized[1], - kQuantizerCoefficientBaseRangeContextClamp) + // {0, 1} - std::min(quantized[padded_tx_width], - kQuantizerCoefficientBaseRangeContextClamp) + // {1, 0} - std::min(quantized[MultiplyBy2(padded_tx_width)], - kQuantizerCoefficientBaseRangeContextClamp))); // {2, 0} - if (pos == 0) return context; - const int row = pos >> adjusted_tx_width_log2; - return context + ((row == 0) ? 7 : 14); + int i = eob - 2; + do { + constexpr auto threshold = static_cast<ResidualType>(3); + const uint16_t pos = scan[i]; + const int row = pos >> adjusted_tx_width_log2; + const int column = pos & (tx_width - 1); + auto* const quantized = &quantized_buffer[pos]; + const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0; + int context = + std::min(4, DivideBy2(1 + (std::min(quantized_column1, 3) + // {0, 1} + std::min(quantized[tx_width], + threshold) + // {1, 0} + std::min(quantized[MultiplyBy2(tx_width)], + threshold) + // {2, 0} + std::min(quantized[tx_width * 3], + threshold) + // {3, 0} + std::min(quantized[MultiplyBy4(tx_width)], + threshold)))); // {4, 0} + context += kCoeffBasePositionContextOffset[row]; + int level = + reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]); + if (level > kNumQuantizerBaseLevels) { + // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS + // + 1, because we clip the overall output to 6 and the unclipped + // quantized values will always result in an output of greater than 6. + int context = + std::min(6, DivideBy2(1 + quantized_column1 + // {0, 1} + quantized[tx_width] + // {1, 0} + quantized[MultiplyBy2(tx_width)])); // {2, 0} + if (pos != 0) { + context += 14 >> static_cast<int>(row == 0); + } + level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type); + } + quantized[0] = level; + } while (--i >= 0); } int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) { const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane]; const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4]; - int dc_sign = std::accumulate( + // Set dc_sign to 8-bit long so that std::accumulate() saves sign extension. + int8_t dc_sign = std::accumulate( dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0); const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane]; dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4]; @@ -938,6 +1250,8 @@ int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) { // if (dc_sign < 0) return 1; // if (dc_sign > 0) return 2; // return 0; + // And it is better than: + // return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0); return static_cast<int>(dc_sign < 0) + MultiplyBy2(static_cast<int>(dc_sign > 0)); } @@ -1020,23 +1334,21 @@ void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane, } } -template <bool is_dc_coefficient> +template <typename ResidualType, bool is_dc_coefficient> bool Tile::ReadSignAndApplyDequantization( - const Block& block, int32_t* const quantized_buffer, - const uint16_t* const scan, int i, int adjusted_tx_width_log2, int tx_width, - int q_value, const uint8_t* const quantizer_matrix, int shift, - int min_value, int max_value, uint16_t* const dc_sign_cdf, - int8_t* const dc_category, int* const coefficient_level) { - int pos = is_dc_coefficient ? 0 : scan[i]; - const int pos_index = - is_dc_coefficient ? 0 : PaddedIndex(pos, adjusted_tx_width_log2); - // If quantized_buffer[pos_index] is zero, then the rest of the function has - // no effect. - if (quantized_buffer[pos_index] == 0) return true; - const bool sign = is_dc_coefficient ? reader_.ReadSymbol(dc_sign_cdf) - : static_cast<bool>(reader_.ReadBit()); - if (quantized_buffer[pos_index] > - kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) { + const uint16_t* const scan, int i, int q_value, + const uint8_t* const quantizer_matrix, int shift, int max_value, + uint16_t* const dc_sign_cdf, int8_t* const dc_category, + int* const coefficient_level, ResidualType* residual_buffer) { + const int pos = is_dc_coefficient ? 0 : scan[i]; + // If residual_buffer[pos] is zero, then the rest of the function has no + // effect. + int level = residual_buffer[pos]; + if (level == 0) return true; + const int sign = is_dc_coefficient + ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf)) + : reader_.ReadBit(); + if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) { int length = 0; bool golomb_length_bit = false; do { @@ -1051,13 +1363,13 @@ bool Tile::ReadSignAndApplyDequantization( for (int i = length - 2; i >= 0; --i) { x = (x << 1) | reader_.ReadBit(); } - quantized_buffer[pos_index] += x - 1; + level += x - 1; } - if (is_dc_coefficient && quantized_buffer[0] > 0) { - *dc_category = sign ? -1 : 1; + if (is_dc_coefficient) { + *dc_category = (sign != 0) ? -1 : 1; } - quantized_buffer[pos_index] &= 0xfffff; - *coefficient_level += quantized_buffer[pos_index]; + level &= 0xfffff; + *coefficient_level += level; // Apply dequantization. Step 1 of section 7.12.3 in the spec. int q = q_value; if (quantizer_matrix != nullptr) { @@ -1065,34 +1377,21 @@ bool Tile::ReadSignAndApplyDequantization( } // The intermediate multiplication can exceed 32 bits, so it has to be // performed by promoting one of the values to int64_t. - int32_t dequantized_value = - (static_cast<int64_t>(q) * quantized_buffer[pos_index]) & 0xffffff; + int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff; dequantized_value >>= shift; - if (sign) { - dequantized_value = -dequantized_value; - } - // Inverse transform process assumes that the quantized coefficients are - // stored as a virtual 2d array of size |tx_width| x |tx_height|. If - // transform width is 64, then this assumption is broken because the scan - // order used for populating the coefficients for such transforms is the - // same as the one used for corresponding transform with width 32 (e.g. the - // scan order used for 64x16 is the same as the one used for 32x16). So we - // have to recompute the value of pos so that it reflects the index of the - // 2d array of size 64 x |tx_height|. - if (!is_dc_coefficient && tx_width == 64) { - const int row_index = DivideBy32(pos); - const int column_index = Mod32(pos); - pos = MultiplyBy64(row_index) + column_index; - } - if (sequence_header_.color_config.bitdepth == 8) { - auto* const residual_buffer = reinterpret_cast<int16_t*>(*block.residual); - residual_buffer[pos] = Clip3(dequantized_value, min_value, max_value); -#if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - auto* const residual_buffer = reinterpret_cast<int32_t*>(*block.residual); - residual_buffer[pos] = Clip3(dequantized_value, min_value, max_value); -#endif - } + // At this point: + // * |dequantized_value| is always non-negative. + // * |sign| can be either 0 or 1. + // * min_value = -(max_value + 1). + // We need to apply the following: + // dequantized_value = sign ? -dequantized_value : dequantized_value; + // dequantized_value = Clip3(dequantized_value, min_value, max_value); + // + // Note that -x == ~(x - 1). + // + // Now, The above two lines can be done with a std::min and xor as follows: + dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign; + residual_buffer[pos] = dequantized_value; return true; } @@ -1109,10 +1408,11 @@ int Tile::ReadCoeffBaseRange(int clamped_tx_size_context, int cdf_context, return level; } -int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane, - int start_x, int start_y, - TransformSize tx_size, - TransformType* const tx_type) { +template <typename ResidualType> +int Tile::ReadTransformCoefficients(const Block& block, Plane plane, + int start_x, int start_y, + TransformSize tx_size, + TransformType* const tx_type) { const int x4 = DivideBy4(start_x); const int y4 = DivideBy4(start_y); const int w4 = kTransformWidth4x4[tx_size]; @@ -1134,19 +1434,15 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane, } const int tx_width = kTransformWidth[tx_size]; const int tx_height = kTransformHeight[tx_size]; - memset(*block.residual, 0, tx_width * tx_height * residual_size_); - const int clamped_tx_width = std::min(tx_width, 32); + const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size]; + const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size]; + const int tx_padding = + (1 << adjusted_tx_width_log2) * kResidualPaddingVertical; + auto* residual = reinterpret_cast<ResidualType*>(*block.residual); + // Clear padding to avoid bottom boundary checks when parsing quantized + // coefficients. + memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_); const int clamped_tx_height = std::min(tx_height, 32); - const int padded_tx_width = - clamped_tx_width + kQuantizedCoefficientBufferPadding; - const int padded_tx_height = - clamped_tx_height + kQuantizedCoefficientBufferPadding; - int32_t* const quantized = block.scratch_buffer->quantized_buffer; - // Only the first |padded_tx_width| * |padded_tx_height| values of |quantized| - // will be used by this function and the functions to which it is passed into. - // So we simply need to zero out those values before it is being used. - memset(quantized, 0, - padded_tx_width * padded_tx_height * sizeof(quantized[0])); if (plane == kPlaneY) { ReadTransformType(block, x4, y4, tx_size); } @@ -1181,9 +1477,9 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane, cdf = symbol_decoder_context_.eob_pt_1024_cdf[plane_type]; break; } - const int16_t eob_pt = + const int eob_pt = 1 + reader_.ReadSymbol(cdf, kEobPt16SymbolCount + eob_multi_size); - int16_t eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1); + int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1); if (eob_pt >= 3) { context = eob_pt - 3; const bool eob_extra = reader_.ReadSymbol( @@ -1199,23 +1495,6 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane, } } const uint16_t* scan = kScan[tx_class][tx_size]; - const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size]; - const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size]; - // Lookup used to call the right variant of GetCoeffBaseContext*() based on - // the transform class. - static constexpr int (Tile::*kGetCoeffBaseContextFunc[])( - const int32_t*, TransformSize, int, uint16_t) = { - &Tile::GetCoeffBaseContext2D, &Tile::GetCoeffBaseContextHorizontal, - &Tile::GetCoeffBaseContextVertical}; - auto get_coeff_base_context_func = kGetCoeffBaseContextFunc[tx_class]; - // Lookup used to call the right variant of GetCoeffBaseRangeContext*() based - // on the transform class. - static constexpr int (Tile::*kGetCoeffBaseRangeContextFunc[])( - const int32_t*, int, int) = {&Tile::GetCoeffBaseRangeContext2D, - &Tile::GetCoeffBaseRangeContextHorizontal, - &Tile::GetCoeffBaseRangeContextVertical}; - auto get_coeff_base_range_context_func = - kGetCoeffBaseRangeContextFunc[tx_class]; const int clamped_tx_size_context = std::min(tx_size_context, 3); // Read the last coefficient. { @@ -1227,36 +1506,37 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane, .coeff_base_eob_cdf[tx_size_context][plane_type][context], kCoeffBaseEobSymbolCount); if (level > kNumQuantizerBaseLevels) { - level += ReadCoeffBaseRange(clamped_tx_size_context, - (this->*get_coeff_base_range_context_func)( - quantized, adjusted_tx_width_log2, pos), - plane_type); + level += ReadCoeffBaseRange( + clamped_tx_size_context, + GetCoeffBaseRangeContextEob(adjusted_tx_width_log2, pos, tx_class), + plane_type); } - quantized[PaddedIndex(pos, adjusted_tx_width_log2)] = level; + residual[pos] = level; + } + if (eob > 1) { + // Read all the other coefficients. + // Lookup used to call the right variant of ReadCoeffBase*() based on the + // transform class. + static constexpr void (Tile::*kGetCoeffBaseFunc[])( + const uint16_t* scan, PlaneType plane_type, TransformSize tx_size, + int clamped_tx_size_context, int adjusted_tx_width_log2, int eob, + uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1], + ResidualType* quantized_buffer) = { + &Tile::ReadCoeffBase2D<ResidualType>, + &Tile::ReadCoeffBaseHorizontal<ResidualType>, + &Tile::ReadCoeffBaseVertical<ResidualType>}; + (this->*kGetCoeffBaseFunc[tx_class])( + scan, plane_type, tx_size, clamped_tx_size_context, + adjusted_tx_width_log2, eob, + symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type], + residual); } - // Read all the other coefficients. - for (int i = eob - 2; i >= 0; --i) { - const uint16_t pos = scan[i]; - context = (this->*get_coeff_base_context_func)(quantized, tx_size, - adjusted_tx_width_log2, pos); - int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>( - symbol_decoder_context_ - .coeff_base_cdf[tx_size_context][plane_type][context]); - if (level > kNumQuantizerBaseLevels) { - level += ReadCoeffBaseRange(clamped_tx_size_context, - (this->*get_coeff_base_range_context_func)( - quantized, adjusted_tx_width_log2, pos), - plane_type); - } - quantized[PaddedIndex(pos, adjusted_tx_width_log2)] = level; - } - const int min_value = -(1 << (7 + sequence_header_.color_config.bitdepth)); const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1; const int current_quantizer_index = GetQIndex( frame_header_.segmentation, bp.segment_id, current_quantizer_index_); const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index); const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index); - const int shift = GetQuantizationShift(tx_size); + const int shift = kQuantizationShift[tx_size]; const uint8_t* const quantizer_matrix = (frame_header_.quantizer.use_matrix && *tx_type < kTransformTypeIdentityIdentity && @@ -1268,24 +1548,27 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane, int coefficient_level = 0; int8_t dc_category = 0; uint16_t* const dc_sign_cdf = - (quantized[0] != 0) + (residual[0] != 0) ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext( x4, y4, w4, h4, plane)] : nullptr; assert(scan[0] == 0); - if (!ReadSignAndApplyDequantization</*is_dc_coefficient=*/true>( - block, quantized, scan, 0, adjusted_tx_width_log2, tx_width, - dc_q_value, quantizer_matrix, shift, min_value, max_value, - dc_sign_cdf, &dc_category, &coefficient_level)) { + if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>( + scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf, + &dc_category, &coefficient_level, residual)) { return -1; } - for (int i = 1; i < eob; ++i) { - if (!ReadSignAndApplyDequantization</*is_dc_coefficient=*/false>( - block, quantized, scan, i, adjusted_tx_width_log2, tx_width, - ac_q_value, quantizer_matrix, shift, min_value, max_value, nullptr, - nullptr, &coefficient_level)) { - return -1; - } + if (eob > 1) { + int i = 1; + do { + if (!ReadSignAndApplyDequantization<ResidualType, + /*is_dc_coefficient=*/false>( + scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr, + nullptr, &coefficient_level, residual)) { + return -1; + } + } while (++i < eob); + MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual); } SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level), dc_category); @@ -1295,6 +1578,25 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane, return eob; } +// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template +// |function| depending on the value of |sequence_header_.color_config.bitdepth| +// with the variadic arguments. +#if LIBGAV1_MAX_BITDEPTH >= 10 +#define CALL_BITDEPTH_FUNCTION(function, ...) \ + do { \ + if (sequence_header_.color_config.bitdepth > 8) { \ + function<uint16_t>(__VA_ARGS__); \ + } else { \ + function<uint8_t>(__VA_ARGS__); \ + } \ + } while (false) +#else +#define CALL_BITDEPTH_FUNCTION(function, ...) \ + do { \ + function<uint8_t>(__VA_ARGS__); \ + } while (false) +#endif + bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, int base_y, TransformSize tx_size, int x, int y, ProcessingMode mode) { @@ -1317,15 +1619,8 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, mode == kProcessingModeParseAndDecode; if (do_decode && !bp.is_inter) { if (bp.palette_mode_info.size[GetPlaneType(plane)] > 0) { - if (sequence_header_.color_config.bitdepth == 8) { - PalettePrediction<uint8_t>(block, plane, start_x, start_y, x, y, - tx_size); -#if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - PalettePrediction<uint16_t>(block, plane, start_x, start_y, x, y, - tx_size); -#endif - } + CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y, + x, y, tx_size); } else { const PredictionMode mode = (plane == kPlaneY) @@ -1337,37 +1632,17 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, (sub_block_column4x4 >> subsampling_x) + step_x + 1; const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1; const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x); - const bool has_left = - x > 0 || (plane == kPlaneY ? block.left_available - : block.LeftAvailableChroma()); - const bool has_top = - y > 0 || - (plane == kPlaneY ? block.top_available : block.TopAvailableChroma()); - if (sequence_header_.color_config.bitdepth == 8) { - IntraPrediction<uint8_t>( - block, plane, start_x, start_y, has_left, has_top, - block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4], - block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4], - mode, tx_size); -#if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - IntraPrediction<uint16_t>( - block, plane, start_x, start_y, has_left, has_top, - block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4], - block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4], - mode, tx_size); -#endif - } + const bool has_left = x > 0 || block.left_available[plane]; + const bool has_top = y > 0 || block.top_available[plane]; + + CALL_BITDEPTH_FUNCTION( + IntraPrediction, block, plane, start_x, start_y, has_left, has_top, + block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4], + block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4], + mode, tx_size); if (plane != kPlaneY && bp.uv_mode == kPredictionModeChromaFromLuma) { - if (sequence_header_.color_config.bitdepth == 8) { - ChromaFromLumaPrediction<uint8_t>(block, plane, start_x, start_y, - tx_size); -#if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - ChromaFromLumaPrediction<uint16_t>(block, plane, start_x, start_y, - tx_size); -#endif - } + CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x, + start_y, tx_size); } } if (plane == kPlaneY) { @@ -1381,34 +1656,35 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, if (!bp.skip) { const int sb_row_index = SuperBlockRowIndex(block.row4x4); const int sb_column_index = SuperBlockColumnIndex(block.column4x4); - switch (mode) { - case kProcessingModeParseAndDecode: { - TransformType tx_type; - const int16_t non_zero_coeff_count = ReadTransformCoefficients( + if (mode == kProcessingModeDecodeOnly) { + TransformParameterQueue& tx_params = + *residual_buffer_threaded_[sb_row_index][sb_column_index] + ->transform_parameters(); + ReconstructBlock(block, plane, start_x, start_y, tx_size, + tx_params.Type(), tx_params.NonZeroCoeffCount()); + tx_params.Pop(); + } else { + TransformType tx_type; + int non_zero_coeff_count; +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (sequence_header_.color_config.bitdepth > 8) { + non_zero_coeff_count = ReadTransformCoefficients<int32_t>( block, plane, start_x, start_y, tx_size, &tx_type); - if (non_zero_coeff_count < 0) return false; + } else // NOLINT +#endif + { + non_zero_coeff_count = ReadTransformCoefficients<int16_t>( + block, plane, start_x, start_y, tx_size, &tx_type); + } + if (non_zero_coeff_count < 0) return false; + if (mode == kProcessingModeParseAndDecode) { ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type, non_zero_coeff_count); - break; - } - case kProcessingModeParseOnly: { - TransformType tx_type; - const int16_t non_zero_coeff_count = ReadTransformCoefficients( - block, plane, start_x, start_y, tx_size, &tx_type); - if (non_zero_coeff_count < 0) return false; + } else { + assert(mode == kProcessingModeParseOnly); residual_buffer_threaded_[sb_row_index][sb_column_index] ->transform_parameters() ->Push(non_zero_coeff_count, tx_type); - break; - } - case kProcessingModeDecodeOnly: { - TransformParameterQueue& tx_params = - *residual_buffer_threaded_[sb_row_index][sb_column_index] - ->transform_parameters(); - ReconstructBlock(block, plane, start_x, start_y, tx_size, - tx_params.Type(), tx_params.NonZeroCoeffCount()); - tx_params.Pop(); - break; } } } @@ -1417,11 +1693,8 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, &block.scratch_buffer ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1] [(sub_block_column4x4 >> subsampling_x) + 1]; - for (int i = 0; i < step_y; ++i) { - static_assert(sizeof(bool) == 1, ""); - memset(block_decoded, 1, step_x); - block_decoded += DecoderScratchBuffer::kBlockDecodedStride; - } + SetBlockValues<bool>(step_y, step_x, true, block_decoded, + TileScratchBuffer::kBlockDecodedStride); } return true; } @@ -1437,7 +1710,7 @@ bool Tile::TransformTree(const Block& block, int start_x, int start_y, stack.Push(TransformTreeNode(start_x, start_y, static_cast<TransformSize>(plane_size))); - while (!stack.Empty()) { + do { TransformTreeNode node = stack.Pop(); const int row = DivideBy4(node.y); const int column = DivideBy4(node.x); @@ -1479,24 +1752,18 @@ bool Tile::TransformTree(const Block& block, int start_x, int start_y, stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size)); stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size)); stack.Push(TransformTreeNode(node.x, node.y, split_tx_size)); - } + } while (!stack.Empty()); return true; } void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x, int start_y, TransformSize tx_size, - TransformType tx_type, - int16_t non_zero_coeff_count) { + TransformType tx_type, int non_zero_coeff_count) { + // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec. assert(non_zero_coeff_count >= 0); if (non_zero_coeff_count == 0) return; - // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec. - if (sequence_header_.color_config.bitdepth == 8) { - Reconstruct(dsp_, tx_type, tx_size, - frame_header_.segmentation.lossless[block.bp->segment_id], - reinterpret_cast<int16_t*>(*block.residual), start_x, start_y, - &buffer_[plane], non_zero_coeff_count); #if LIBGAV1_MAX_BITDEPTH >= 10 - } else { + if (sequence_header_.color_config.bitdepth > 8) { Array2DView<uint16_t> buffer( buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t), reinterpret_cast<uint16_t*>(&buffer_[plane][0][0])); @@ -1504,7 +1771,13 @@ void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x, frame_header_.segmentation.lossless[block.bp->segment_id], reinterpret_cast<int32_t*>(*block.residual), start_x, start_y, &buffer, non_zero_coeff_count); + } else // NOLINT #endif + { + Reconstruct(dsp_, tx_type, tx_size, + frame_header_.segmentation.lossless[block.bp->segment_id], + reinterpret_cast<int16_t*>(*block.residual), start_x, start_y, + &buffer_[plane], non_zero_coeff_count); } if (split_parse_and_decode_) { *block.residual += @@ -1513,8 +1786,8 @@ void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x, } bool Tile::Residual(const Block& block, ProcessingMode mode) { - const int width_chunks = std::max(1, kBlockWidthPixels[block.size] >> 6); - const int height_chunks = std::max(1, kBlockHeightPixels[block.size] >> 6); + const int width_chunks = std::max(1, block.width >> 6); + const int height_chunks = std::max(1, block.height >> 6); const BlockSize size_chunk4x4 = (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size; const BlockParameters& bp = *block.bp; @@ -1574,7 +1847,7 @@ bool Tile::Residual(const Block& block, ProcessingMode mode) { bool Tile::IsMvValid(const Block& block, bool is_compound) const { const BlockParameters& bp = *block.bp; for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) { - for (int mv_component : bp.mv[i].mv) { + for (int mv_component : bp.mv.mv[i].mv) { if (std::abs(mv_component) >= (1 << 14)) { return false; } @@ -1583,22 +1856,20 @@ bool Tile::IsMvValid(const Block& block, bool is_compound) const { if (!block.bp->prediction_parameters->use_intra_block_copy) { return true; } - const int block_width = kBlockWidthPixels[block.size]; - const int block_height = kBlockHeightPixels[block.size]; - if ((bp.mv[0].mv[0] & 7) != 0 || (bp.mv[0].mv[1] & 7) != 0) { + if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) { return false; } - const int delta_row = bp.mv[0].mv[0] >> 3; - const int delta_column = bp.mv[0].mv[1] >> 3; + const int delta_row = bp.mv.mv[0].mv[0] >> 3; + const int delta_column = bp.mv.mv[0].mv[1] >> 3; int src_top_edge = MultiplyBy4(block.row4x4) + delta_row; int src_left_edge = MultiplyBy4(block.column4x4) + delta_column; - const int src_bottom_edge = src_top_edge + block_height; - const int src_right_edge = src_left_edge + block_width; + const int src_bottom_edge = src_top_edge + block.height; + const int src_right_edge = src_left_edge + block.width; if (block.HasChroma()) { - if (block_width < 8 && subsampling_x_[kPlaneU] != 0) { + if (block.width < 8 && subsampling_x_[kPlaneU] != 0) { src_left_edge -= 4; } - if (block_height < 8 && subsampling_y_[kPlaneU] != 0) { + if (block.height < 8 && subsampling_y_[kPlaneU] != 0) { src_top_edge -= 4; } } @@ -1636,58 +1907,102 @@ bool Tile::IsMvValid(const Block& block, bool is_compound) const { wavefront_offset; } -bool Tile::AssignMv(const Block& block, bool is_compound) { - MotionVector predicted_mv[2] = {}; +bool Tile::AssignInterMv(const Block& block, bool is_compound) { + int min[2]; + int max[2]; + GetClampParameters(block, min, max); BlockParameters& bp = *block.bp; - for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) { - const PredictionParameters& prediction_parameters = - *block.bp->prediction_parameters; - const PredictionMode mode = prediction_parameters.use_intra_block_copy - ? kPredictionModeNewMv - : GetSinglePredictionMode(i, bp.y_mode); - if (prediction_parameters.use_intra_block_copy) { - predicted_mv[0] = prediction_parameters.ref_mv_stack[0].mv[0]; - if (predicted_mv[0].mv[0] == 0 && predicted_mv[0].mv[1] == 0) { - predicted_mv[0] = prediction_parameters.ref_mv_stack[1].mv[0]; - } - if (predicted_mv[0].mv[0] == 0 && predicted_mv[0].mv[1] == 0) { - const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()]; - if (block.row4x4 - super_block_size4x4 < row4x4_start_) { - predicted_mv[0].mv[1] = -MultiplyBy8( - MultiplyBy4(super_block_size4x4) + kIntraBlockCopyDelayPixels); - } else { - predicted_mv[0].mv[0] = -MultiplyBy32(super_block_size4x4); + const PredictionParameters& prediction_parameters = *bp.prediction_parameters; + if (is_compound) { + for (int i = 0; i < 2; ++i) { + const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode); + MotionVector predicted_mv; + if (mode == kPredictionModeGlobalMv) { + predicted_mv = prediction_parameters.global_mv[i]; + } else { + const int ref_mv_index = (mode == kPredictionModeNearestMv || + (mode == kPredictionModeNewMv && + prediction_parameters.ref_mv_count <= 1)) + ? 0 + : prediction_parameters.ref_mv_index; + predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i); + if (ref_mv_index < prediction_parameters.ref_mv_count) { + predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]); + predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]); } } - } else if (mode == kPredictionModeGlobalMv) { - predicted_mv[i] = prediction_parameters.global_mv[i]; + if (mode == kPredictionModeNewMv) { + ReadMotionVector(block, i); + bp.mv.mv[i].mv[0] += predicted_mv.mv[0]; + bp.mv.mv[i].mv[1] += predicted_mv.mv[1]; + } else { + bp.mv.mv[i] = predicted_mv; + } + } + } else { + const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode); + MotionVector predicted_mv; + if (mode == kPredictionModeGlobalMv) { + predicted_mv = prediction_parameters.global_mv[0]; } else { const int ref_mv_index = (mode == kPredictionModeNearestMv || (mode == kPredictionModeNewMv && prediction_parameters.ref_mv_count <= 1)) ? 0 : prediction_parameters.ref_mv_index; - predicted_mv[i] = prediction_parameters.ref_mv_stack[ref_mv_index].mv[i]; + predicted_mv = prediction_parameters.reference_mv(ref_mv_index); + if (ref_mv_index < prediction_parameters.ref_mv_count) { + predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]); + predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]); + } } if (mode == kPredictionModeNewMv) { - ReadMotionVector(block, i); - bp.mv[i].mv[0] += predicted_mv[i].mv[0]; - bp.mv[i].mv[1] += predicted_mv[i].mv[1]; + ReadMotionVector(block, 0); + bp.mv.mv[0].mv[0] += predicted_mv.mv[0]; + bp.mv.mv[0].mv[1] += predicted_mv.mv[1]; } else { - bp.mv[i] = predicted_mv[i]; + bp.mv.mv[0] = predicted_mv; } } return IsMvValid(block, is_compound); } +bool Tile::AssignIntraMv(const Block& block) { + // TODO(linfengz): Check if the clamping process is necessary. + int min[2]; + int max[2]; + GetClampParameters(block, min, max); + BlockParameters& bp = *block.bp; + const PredictionParameters& prediction_parameters = *bp.prediction_parameters; + const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0); + ReadMotionVector(block, 0); + if (ref_mv_0.mv32 == 0) { + const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1); + if (ref_mv_1.mv32 == 0) { + const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()]; + if (block.row4x4 - super_block_size4x4 < row4x4_start_) { + bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4); + bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels); + } else { + bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4); + } + } else { + bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]); + bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[0], max[0]); + } + } else { + bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]); + bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]); + } + return IsMvValid(block, /*is_compound=*/false); +} + void Tile::ResetEntropyContext(const Block& block) { - const int block_width4x4 = kNum4x4BlocksWide[block.size]; - const int block_height4x4 = kNum4x4BlocksHigh[block.size]; for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1); ++plane) { const int subsampling_x = subsampling_x_[plane]; const int start_x = block.column4x4 >> subsampling_x; const int end_x = - std::min((block.column4x4 + block_width4x4) >> subsampling_x, + std::min((block.column4x4 + block.width4x4) >> subsampling_x, frame_header_.columns4x4); memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0, end_x - start_x); @@ -1696,7 +2011,7 @@ void Tile::ResetEntropyContext(const Block& block) { const int subsampling_y = subsampling_y_[plane]; const int start_y = block.row4x4 >> subsampling_y; const int end_y = - std::min((block.row4x4 + block_height4x4) >> subsampling_y, + std::min((block.row4x4 + block.height4x4) >> subsampling_y, frame_header_.rows4x4); memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0, end_y - start_y); @@ -1705,12 +2020,15 @@ void Tile::ResetEntropyContext(const Block& block) { } } -void Tile::ComputePrediction(const Block& block) { +bool Tile::ComputePrediction(const Block& block) { + const BlockParameters& bp = *block.bp; + if (!bp.is_inter) return true; const int mask = (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) - 1; const int sub_block_row4x4 = block.row4x4 & mask; const int sub_block_column4x4 = block.column4x4 & mask; + const int plane_count = block.HasChroma() ? PlaneCount() : 1; // Returns true if this block applies local warping. The state is determined // in the Y plane and carried for use in the U/V planes. // But the U/V planes will not apply warping when the block size is smaller @@ -1718,20 +2036,19 @@ void Tile::ComputePrediction(const Block& block) { bool is_local_valid = false; // Local warping parameters, similar usage as is_local_valid. GlobalMotion local_warp_params; - for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1); ++plane) { + int plane = 0; + do { const int8_t subsampling_x = subsampling_x_[plane]; const int8_t subsampling_y = subsampling_y_[plane]; - const BlockSize plane_size = - block.residual_size[GetPlaneType(static_cast<Plane>(plane))]; + const BlockSize plane_size = block.residual_size[plane]; const int block_width4x4 = kNum4x4BlocksWide[plane_size]; const int block_height4x4 = kNum4x4BlocksHigh[plane_size]; const int block_width = MultiplyBy4(block_width4x4); const int block_height = MultiplyBy4(block_height4x4); const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x); const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y); - const BlockParameters& bp = *block.bp; - if (bp.is_inter && bp.reference_frame[1] == kReferenceFrameIntra) { - const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y); + if (bp.reference_frame[1] == kReferenceFrameIntra) { + const int tr_row4x4 = sub_block_row4x4 >> subsampling_y; const int tr_column4x4 = (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1; const int bl_row4x4 = @@ -1740,88 +2057,98 @@ void Tile::ComputePrediction(const Block& block) { const TransformSize tx_size = k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]] [k4x4HeightLog2[plane_size]]; - const bool has_left = - plane == kPlaneY ? block.left_available : block.LeftAvailableChroma(); - const bool has_top = - plane == kPlaneY ? block.top_available : block.TopAvailableChroma(); - if (sequence_header_.color_config.bitdepth == 8) { - IntraPrediction<uint8_t>( - block, static_cast<Plane>(plane), base_x, base_y, has_left, has_top, - block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4], - block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4], - kInterIntraToIntraMode[block.bp->prediction_parameters - ->inter_intra_mode], - tx_size); -#if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - IntraPrediction<uint16_t>( - block, static_cast<Plane>(plane), base_x, base_y, has_left, has_top, - block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4], - block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4], - kInterIntraToIntraMode[block.bp->prediction_parameters - ->inter_intra_mode], - tx_size); -#endif - } + const bool has_left = block.left_available[plane]; + const bool has_top = block.top_available[plane]; + CALL_BITDEPTH_FUNCTION( + IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y, + has_left, has_top, + block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4], + block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4], + kInterIntraToIntraMode[block.bp->prediction_parameters + ->inter_intra_mode], + tx_size); } - if (bp.is_inter) { - int candidate_row = (block.row4x4 >> subsampling_y) << subsampling_y; - int candidate_column = (block.column4x4 >> subsampling_x) - << subsampling_x; - bool some_use_intra = false; - for (int r = 0; r < (block_height4x4 << subsampling_y); ++r) { - for (int c = 0; c < (block_width4x4 << subsampling_x); ++c) { - auto* const bp = block_parameters_holder_.Find(candidate_row + r, - candidate_column + c); - if (bp != nullptr && bp->reference_frame[0] == kReferenceFrameIntra) { - some_use_intra = true; - break; - } + int candidate_row = block.row4x4; + int candidate_column = block.column4x4; + bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra; + if (!some_use_intra && plane != 0) { + candidate_row = (candidate_row >> subsampling_y) << subsampling_y; + candidate_column = (candidate_column >> subsampling_x) << subsampling_x; + if (candidate_row != block.row4x4) { + // Top block. + const BlockParameters& bp_top = + *block_parameters_holder_.Find(candidate_row, block.column4x4); + some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra; + if (!some_use_intra && candidate_column != block.column4x4) { + // Top-left block. + const BlockParameters& bp_top_left = + *block_parameters_holder_.Find(candidate_row, candidate_column); + some_use_intra = + bp_top_left.reference_frame[0] == kReferenceFrameIntra; } - if (some_use_intra) break; - } - int prediction_width; - int prediction_height; - if (some_use_intra) { - candidate_row = block.row4x4; - candidate_column = block.column4x4; - prediction_width = block_width; - prediction_height = block_height; - } else { - prediction_width = kBlockWidthPixels[block.size] >> subsampling_x; - prediction_height = kBlockHeightPixels[block.size] >> subsampling_y; } - for (int r = 0, y = 0; y < block_height; y += prediction_height, ++r) { - for (int c = 0, x = 0; x < block_width; x += prediction_width, ++c) { - InterPrediction(block, static_cast<Plane>(plane), base_x + x, - base_y + y, prediction_width, prediction_height, - candidate_row + r, candidate_column + c, - &is_local_valid, &local_warp_params); - } + if (!some_use_intra && candidate_column != block.column4x4) { + // Left block. + const BlockParameters& bp_left = + *block_parameters_holder_.Find(block.row4x4, candidate_column); + some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra; } } - } + int prediction_width; + int prediction_height; + if (some_use_intra) { + candidate_row = block.row4x4; + candidate_column = block.column4x4; + prediction_width = block_width; + prediction_height = block_height; + } else { + prediction_width = block.width >> subsampling_x; + prediction_height = block.height >> subsampling_y; + } + int r = 0; + int y = 0; + do { + int c = 0; + int x = 0; + do { + if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x, + base_y + y, prediction_width, prediction_height, + candidate_row + r, candidate_column + c, + &is_local_valid, &local_warp_params)) { + return false; + } + ++c; + x += prediction_width; + } while (x < block_width); + ++r; + y += prediction_height; + } while (y < block_height); + } while (++plane < plane_count); + return true; } +#undef CALL_BITDEPTH_FUNCTION + void Tile::PopulateDeblockFilterLevel(const Block& block) { if (!post_filter_.DoDeblock()) return; BlockParameters& bp = *block.bp; + const int mode_id = + static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode)); for (int i = 0; i < kFrameLfCount; ++i) { if (delta_lf_all_zero_) { bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel( - bp.segment_id, i, bp.reference_frame[0], - LoopFilterMask::GetModeId(bp.y_mode)); + bp.segment_id, i, bp.reference_frame[0], mode_id); } else { bp.deblock_filter_level[i] = deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]] - [LoopFilterMask::GetModeId(bp.y_mode)]; + [mode_id]; } } } bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, ParameterTree* const tree, - DecoderScratchBuffer* const scratch_buffer, + TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { // Do not process the block if the starting point is beyond the visible frame. // This is equivalent to the has_row/has_column check in the @@ -1831,34 +2158,34 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, column4x4 >= frame_header_.columns4x4) { return true; } - Block block(*this, row4x4, column4x4, block_size, scratch_buffer, residual, - tree->parameters()); - block.bp->size = block_size; - block_parameters_holder_.FillCache(row4x4, column4x4, block_size, - tree->parameters()); - block.bp->prediction_parameters = + BlockParameters& bp = *tree->parameters(); + block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp); + Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual); + bp.size = block_size; + bp.prediction_parameters = split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>( new (std::nothrow) PredictionParameters()) : std::move(prediction_parameters_); - if (block.bp->prediction_parameters == nullptr) return false; + if (bp.prediction_parameters == nullptr) return false; if (!DecodeModeInfo(block)) return false; + bp.is_global_mv_block = (bp.y_mode == kPredictionModeGlobalMv || + bp.y_mode == kPredictionModeGlobalGlobalMv) && + !IsBlockDimension4(bp.size); PopulateDeblockFilterLevel(block); if (!ReadPaletteTokens(block)) return false; DecodeTransformSize(block); - BlockParameters& bp = *block.bp; // Part of Section 5.11.37 in the spec (implemented as a simple lookup). - bp.uv_transform_size = - frame_header_.segmentation.lossless[bp.segment_id] - ? kTransformSize4x4 - : kUVTransformSize[block.residual_size[kPlaneTypeUV]]; + bp.uv_transform_size = frame_header_.segmentation.lossless[bp.segment_id] + ? kTransformSize4x4 + : kUVTransformSize[block.residual_size[kPlaneU]]; if (bp.skip) ResetEntropyContext(block); - const int block_width4x4 = kNum4x4BlocksWide[block_size]; - const int block_height4x4 = kNum4x4BlocksHigh[block_size]; if (split_parse_and_decode_) { if (!Residual(block, kProcessingModeParseOnly)) return false; } else { - ComputePrediction(block); - if (!Residual(block, kProcessingModeParseAndDecode)) return false; + if (!ComputePrediction(block) || + !Residual(block, kProcessingModeParseAndDecode)) { + return false; + } } // If frame_header_.segmentation.enabled is false, bp.segment_id is 0 for all // blocks. We don't need to call save bp.segment_id in the current frame @@ -1870,25 +2197,22 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, // save bp.segment_id in the current frame. if (frame_header_.segmentation.enabled && frame_header_.segmentation.update_map) { - const int x_limit = - std::min(frame_header_.columns4x4 - column4x4, block_width4x4); - const int y_limit = - std::min(frame_header_.rows4x4 - row4x4, block_height4x4); + const int x_limit = std::min(frame_header_.columns4x4 - column4x4, + static_cast<int>(block.width4x4)); + const int y_limit = std::min(frame_header_.rows4x4 - row4x4, + static_cast<int>(block.height4x4)); current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit, y_limit, bp.segment_id); } - if (build_bit_mask_when_parsing_ || !split_parse_and_decode_) { - BuildBitMask(row4x4, column4x4, block_size); - } + StoreMotionFieldMvsIntoCurrentFrame(block); if (!split_parse_and_decode_) { - StoreMotionFieldMvsIntoCurrentFrame(block); - prediction_parameters_ = std::move(block.bp->prediction_parameters); + prediction_parameters_ = std::move(bp.prediction_parameters); } return true; } bool Tile::DecodeBlock(ParameterTree* const tree, - DecoderScratchBuffer* const scratch_buffer, + TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { const int row4x4 = tree->row4x4(); const int column4x4 = tree->column4x4(); @@ -1897,21 +2221,18 @@ bool Tile::DecodeBlock(ParameterTree* const tree, return true; } const BlockSize block_size = tree->block_size(); - Block block(*this, row4x4, column4x4, block_size, scratch_buffer, residual, - tree->parameters()); - ComputePrediction(block); - if (!Residual(block, kProcessingModeDecodeOnly)) return false; - if (!build_bit_mask_when_parsing_) { - BuildBitMask(row4x4, column4x4, block_size); + Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual); + if (!ComputePrediction(block) || + !Residual(block, kProcessingModeDecodeOnly)) { + return false; } - StoreMotionFieldMvsIntoCurrentFrame(block); block.bp->prediction_parameters.reset(nullptr); return true; } bool Tile::ProcessPartition(int row4x4_start, int column4x4_start, ParameterTree* const root, - DecoderScratchBuffer* const scratch_buffer, + TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { Stack<ParameterTree*, kDfsStackSize> stack; @@ -2025,7 +2346,7 @@ void Tile::ResetLoopRestorationParams() { } void Tile::ResetCdef(const int row4x4, const int column4x4) { - if (cdef_index_[0] == nullptr) return; + if (!sequence_header_.enable_cdef) return; const int row = DivideBy16(row4x4); const int column = DivideBy16(column4x4); cdef_index_[row][column] = -1; @@ -2039,7 +2360,7 @@ void Tile::ResetCdef(const int row4x4, const int column4x4) { } } -void Tile::ClearBlockDecoded(DecoderScratchBuffer* const scratch_buffer, +void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer, int row4x4, int column4x4) { // Set everything to false. memset(scratch_buffer->block_decoded, 0, @@ -2075,7 +2396,7 @@ void Tile::ClearBlockDecoded(DecoderScratchBuffer* const scratch_buffer, } bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, - DecoderScratchBuffer* const scratch_buffer, + TileScratchBuffer* const scratch_buffer, ProcessingMode mode) { const bool parsing = mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode; @@ -2139,11 +2460,11 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, } bool Tile::DecodeSuperBlock(ParameterTree* const tree, - DecoderScratchBuffer* const scratch_buffer, + TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { Stack<ParameterTree*, kDfsStackSize> stack; stack.Push(tree); - while (!stack.Empty()) { + do { ParameterTree* const node = stack.Pop(); if (node->partition() != kPartitionNone) { for (int i = 3; i >= 0; --i) { @@ -2157,7 +2478,7 @@ bool Tile::DecodeSuperBlock(ParameterTree* const tree, node->row4x4(), node->column4x4()); return false; } - } + } while (!stack.Empty()); return true; } @@ -2189,222 +2510,87 @@ void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4, } } -void Tile::BuildBitMask(int row4x4, int column4x4, BlockSize block_size) { - if (!post_filter_.DoDeblock()) return; - if (block_size <= kBlock64x64) { - BuildBitMaskHelper(row4x4, column4x4, block_size, true, true); - } else { - const int block_width4x4 = kNum4x4BlocksWide[block_size]; - const int block_height4x4 = kNum4x4BlocksHigh[block_size]; - for (int y = 0; y < block_height4x4; y += 16) { - for (int x = 0; x < block_width4x4; x += 16) { - BuildBitMaskHelper(row4x4 + y, column4x4 + x, kBlock64x64, x == 0, - y == 0); - } - } - } -} - -void Tile::BuildBitMaskHelper(int row4x4, int column4x4, BlockSize block_size, - const bool is_vertical_block_border, - const bool is_horizontal_block_border) { - const int block_width4x4 = kNum4x4BlocksWide[block_size]; - const int block_height4x4 = kNum4x4BlocksHigh[block_size]; - BlockParameters& bp = *block_parameters_holder_.Find(row4x4, column4x4); - const bool skip = bp.skip && bp.is_inter; - LoopFilterMask* const masks = post_filter_.masks(); - const int unit_id = DivideBy16(row4x4) * masks->num_64x64_blocks_per_row() + - DivideBy16(column4x4); - - for (int plane = kPlaneY; plane < PlaneCount(); ++plane) { - // For U and V planes, do not build bit masks if level == 0. - if (plane > kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) { - continue; - } - // Build bit mask for vertical edges. - const int subsampling_x = subsampling_x_[plane]; - const int subsampling_y = subsampling_y_[plane]; - const int plane_width = - RightShiftWithRounding(frame_header_.width, subsampling_x); - const int column_limit = - std::min({column4x4 + block_width4x4, frame_header_.columns4x4, - DivideBy4(plane_width + 3) << subsampling_x}); - const int plane_height = - RightShiftWithRounding(frame_header_.height, subsampling_y); - const int row_limit = - std::min({row4x4 + block_height4x4, frame_header_.rows4x4, - DivideBy4(plane_height + 3) << subsampling_y}); - const int row_start = GetDeblockPosition(row4x4, subsampling_y); - const int column_start = GetDeblockPosition(column4x4, subsampling_x); - if (row_start >= row_limit || column_start >= column_limit) { - continue; - } - const int vertical_step = 1 << subsampling_y; - const int horizontal_step = 1 << subsampling_x; - const BlockParameters& bp = - *block_parameters_holder_.Find(row_start, column_start); - const int horizontal_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal]; - const int vertical_level_index = - kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical]; - const uint8_t vertical_level = - bp.deblock_filter_level[vertical_level_index]; - - for (int row = row_start; row < row_limit; row += vertical_step) { - for (int column = column_start; column < column_limit;) { - const TransformSize tx_size = (plane == kPlaneY) - ? inter_transform_sizes_[row][column] - : bp.uv_transform_size; - // (1). Don't filter frame boundary. - // (2). For tile boundary, we don't know whether the previous tile is - // available or not, thus we handle it after all tiles are decoded. - const bool is_vertical_border = - (column == column_start) && is_vertical_block_border; - if (column == GetDeblockPosition(column4x4_start_, subsampling_x) || - (skip && !is_vertical_border)) { - column += kNum4x4BlocksWide[tx_size] << subsampling_x; - continue; - } - - // bp_left is the parameter of the left prediction block which - // is guaranteed to be inside the tile. - const BlockParameters& bp_left = - *block_parameters_holder_.Find(row, column - horizontal_step); - const uint8_t left_level = - is_vertical_border - ? bp_left.deblock_filter_level[vertical_level_index] - : vertical_level; - // We don't have to check if the left block is skipped or not, - // because if the current transform block is on the edge of the coding - // block, is_vertical_border is true; if it's not on the edge, - // left skip is equal to skip. - if (vertical_level != 0 || left_level != 0) { - const TransformSize left_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes_[row][column - horizontal_step] - : bp_left.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - GetTransformSizeIdWidth(tx_size, left_tx_size); - const int r = row & (kNum4x4InLoopFilterMaskUnit - 1); - const int c = column & (kNum4x4InLoopFilterMaskUnit - 1); - const int shift = LoopFilterMask::GetShift(r, c); - const int index = LoopFilterMask::GetIndex(r); - const auto mask = static_cast<uint64_t>(1) << shift; - masks->SetLeft(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (vertical_level == 0) ? left_level : vertical_level; - masks->SetLevel(current_level, unit_id, plane, - kLoopFilterTypeVertical, - LoopFilterMask::GetLevelOffset(r, c)); - } - column += kNum4x4BlocksWide[tx_size] << subsampling_x; - } - } - - // Build bit mask for horizontal edges. - const uint8_t horizontal_level = - bp.deblock_filter_level[horizontal_level_index]; - for (int column = column_start; column < column_limit; - column += horizontal_step) { - for (int row = row_start; row < row_limit;) { - const TransformSize tx_size = (plane == kPlaneY) - ? inter_transform_sizes_[row][column] - : bp.uv_transform_size; - - // (1). Don't filter frame boundary. - // (2). For tile boundary, we don't know whether the previous tile is - // available or not, thus we handle it after all tiles are decoded. - const bool is_horizontal_border = - (row == row_start) && is_horizontal_block_border; - if (row == GetDeblockPosition(row4x4_start_, subsampling_y) || - (skip && !is_horizontal_border)) { - row += kNum4x4BlocksHigh[tx_size] << subsampling_y; - continue; - } - - // bp_top is the parameter of the top prediction block which is - // guaranteed to be inside the tile. - const BlockParameters& bp_top = - *block_parameters_holder_.Find(row - vertical_step, column); - const uint8_t top_level = - is_horizontal_border - ? bp_top.deblock_filter_level[horizontal_level_index] - : horizontal_level; - // We don't have to check it the top block is skippped or not, - // because if the current transform block is on the edge of the coding - // block, is_horizontal_border is true; if it's not on the edge, - // top skip is equal to skip. - if (horizontal_level != 0 || top_level != 0) { - const TransformSize top_tx_size = - (plane == kPlaneY) - ? inter_transform_sizes_[row - vertical_step][column] - : bp_top.uv_transform_size; - const LoopFilterTransformSizeId transform_size_id = - static_cast<LoopFilterTransformSizeId>( - std::min({kTransformHeightLog2[tx_size] - 2, - kTransformHeightLog2[top_tx_size] - 2, 2})); - const int r = row & (kNum4x4InLoopFilterMaskUnit - 1); - const int c = column & (kNum4x4InLoopFilterMaskUnit - 1); - const int shift = LoopFilterMask::GetShift(r, c); - const int index = LoopFilterMask::GetIndex(r); - const auto mask = static_cast<uint64_t>(1) << shift; - masks->SetTop(mask, unit_id, plane, transform_size_id, index); - const uint8_t current_level = - (horizontal_level == 0) ? top_level : horizontal_level; - masks->SetLevel(current_level, unit_id, plane, - kLoopFilterTypeHorizontal, - LoopFilterMask::GetLevelOffset(r, c)); - } - row += kNum4x4BlocksHigh[tx_size] << subsampling_y; - } - } +void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { + if (frame_header_.refresh_frame_flags == 0 || + IsIntraFrame(frame_header_.frame_type)) { + return; } -} + // Iterate over odd rows/columns beginning at the first odd row/column for the + // block. It is done this way because motion field mvs are only needed at a + // 8x8 granularity. + const int row_start4x4 = block.row4x4 | 1; + const int row_limit4x4 = + std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4); + if (row_start4x4 >= row_limit4x4) return; + const int column_start4x4 = block.column4x4 | 1; + const int column_limit4x4 = + std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4); + if (column_start4x4 >= column_limit4x4) return; -void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) { // The largest reference MV component that can be saved. constexpr int kRefMvsLimit = (1 << 12) - 1; const BlockParameters& bp = *block.bp; - ReferenceFrameType reference_frame_to_store = kReferenceFrameNone; - MotionVector mv_to_store = {}; + ReferenceInfo* reference_info = current_frame_.reference_info(); for (int i = 1; i >= 0; --i) { - if (bp.reference_frame[i] > kReferenceFrameIntra && - std::abs(bp.mv[i].mv[MotionVector::kRow]) <= kRefMvsLimit && - std::abs(bp.mv[i].mv[MotionVector::kColumn]) <= kRefMvsLimit && - GetRelativeDistance( - reference_order_hint_ - [frame_header_.reference_frame_index[bp.reference_frame[i] - - kReferenceFrameLast]], - frame_header_.order_hint, sequence_header_.enable_order_hint, - sequence_header_.order_hint_bits) < 0) { - reference_frame_to_store = bp.reference_frame[i]; - mv_to_store = bp.mv[i]; - break; - } - } - // Iterate over odd rows/columns beginning at the first odd row/column for the - // block. It is done this way because motion field mvs are only needed at a - // 8x8 granularity. - const int row_start = block.row4x4 | 1; - const int row_limit = std::min(block.row4x4 + kNum4x4BlocksHigh[block.size], - frame_header_.rows4x4); - const int column_start = block.column4x4 | 1; - const int column_limit = - std::min(block.column4x4 + kNum4x4BlocksWide[block.size], - frame_header_.columns4x4); - for (int row = row_start; row < row_limit; row += 2) { - const int row_index = DivideBy2(row); - ReferenceFrameType* const reference_frame_row_start = - current_frame_.motion_field_reference_frame(row_index, - DivideBy2(column_start)); - static_assert(sizeof(reference_frame_to_store) == sizeof(int8_t), ""); - memset(reference_frame_row_start, reference_frame_to_store, - DivideBy2(column_limit - column_start + 1)); - if (reference_frame_to_store <= kReferenceFrameIntra) continue; - for (int column = column_start; column < column_limit; column += 2) { + const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i]; + // Must make a local copy so that StoreMotionFieldMvs() knows there is no + // overlap between load and store. + const MotionVector mv_to_store = bp.mv.mv[i]; + const int mv_row = std::abs(mv_to_store.mv[MotionVector::kRow]); + const int mv_column = std::abs(mv_to_store.mv[MotionVector::kColumn]); + if (reference_frame_to_store > kReferenceFrameIntra && + // kRefMvsLimit equals 0x07FF, so we can first bitwise OR the two + // absolute values and then compare with kRefMvsLimit to save a branch. + // The next line is equivalent to: + // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit + (mv_row | mv_column) <= kRefMvsLimit && + reference_info->relative_distance_from[reference_frame_to_store] < 0) { + const int row_start8x8 = DivideBy2(row_start4x4); + const int row_limit8x8 = DivideBy2(row_limit4x4); + const int column_start8x8 = DivideBy2(column_start4x4); + const int column_limit8x8 = DivideBy2(column_limit4x4); + const int rows = row_limit8x8 - row_start8x8; + const int columns = column_limit8x8 - column_start8x8; + const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4()); + ReferenceFrameType* const reference_frame_row_start = + &reference_info + ->motion_field_reference_frame[row_start8x8][column_start8x8]; MotionVector* const mv = - current_frame_.motion_field_mv(row_index, DivideBy2(column)); - *mv = mv_to_store; + &reference_info->motion_field_mv[row_start8x8][column_start8x8]; + + // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined + // and simplifies std::fill() for these cases. + if (columns <= 1) { + // Don't change the above condition to (columns == 1). + // Condition (columns <= 1) may help the compiler simplify the inlining + // of the general case of StoreMotionFieldMvs() by eliminating the + // (columns == 0) case. + assert(columns == 1); + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 1, reference_frame_row_start, mv); + } else if (columns == 2) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 2, reference_frame_row_start, mv); + } else if (columns == 4) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 4, reference_frame_row_start, mv); + } else if (columns == 8) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 8, reference_frame_row_start, mv); + } else if (columns == 16) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 16, reference_frame_row_start, mv); + } else if (columns < 16) { + // This always true condition (columns < 16) may help the compiler + // simplify the inlining of the following function. + // This general case is rare and usually only happens to the blocks + // which contain the right boundary of the frame. + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + columns, reference_frame_row_start, mv); + } else { + assert(false); + } + return; } } } |