1 files changed, 994 insertions, 808 deletions
diff --git a/libgav1/src/tile/tile.cc b/libgav1/src/tile/tile.cc
index 96c724f..f79158f 100644
--- a/libgav1/src/tile/tile.cc
+++ b/libgav1/src/tile/tile.cc
@@ -17,6 +17,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <climits>
 #include <cstdlib>
 #include <cstring>
 #include <memory>
@@ -25,9 +26,12 @@
 #include <type_traits>
 #include <utility>
 
+#include "src/frame_scratch_buffer.h"
 #include "src/motion_vector.h"
 #include "src/reconstruction.h"
 #include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
 #include "src/utils/logging.h"
 #include "src/utils/segmentation.h"
 #include "src/utils/stack.h"
@@ -45,8 +49,6 @@ constexpr int kReferenceScaleShift = 14;
 // process is activated.
 constexpr int kQuantizerCoefficientBaseRange = 12;
 constexpr int kNumQuantizerBaseLevels = 2;
-constexpr int kQuantizerCoefficientBaseRangeContextClamp =
-    kQuantizerCoefficientBaseRange + kNumQuantizerBaseLevels + 1;
 constexpr int kCoeffBaseRangeMaxIterations =
     kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
 constexpr int kEntropyContextLeft = 0;
@@ -99,6 +101,14 @@ constexpr PredictionMode
         kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
         kPredictionModeD157, kPredictionModeDc};
 
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+    kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+    kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+    kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+    kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+    kPredictionModeNewNewMv);
+
 // This is computed as:
 // min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
 constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
@@ -146,7 +156,10 @@ constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
 /* clang-format on */
 
-constexpr uint8_t kCoeffBasePositionContextOffset[3] = {26, 31, 36};
+// The table is extended from 3 to 16 entries by repeating the last element, so
+// that row or column indices do not need to be clamped.
+constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
+    26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
 
 constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
     kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
@@ -235,7 +248,7 @@ constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
 
 constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
 
-constexpr int8_t kWienerDefaultFilter[3] = {3, -7, 15};
+constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
 
 // Maps compound prediction modes into single modes. For e.g.
 // kPredictionModeNearestNewMv will map to kPredictionModeNearestMv for index 0
@@ -264,31 +277,8 @@ PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
 // log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
 // dqDenom is always a power of two and hence right shift can be used instead of
 // division.
-constexpr BitMaskSet kQuantizationShift2Mask(kTransformSize32x64,
-                                             kTransformSize64x32,
-                                             kTransformSize64x64);
-constexpr BitMaskSet kQuantizationShift1Mask(kTransformSize16x32,
-                                             kTransformSize16x64,
-                                             kTransformSize32x16,
-                                             kTransformSize32x32,
-                                             kTransformSize64x16);
-int GetQuantizationShift(TransformSize tx_size) {
-  if (kQuantizationShift2Mask.Contains(tx_size)) {
-    return 2;
-  }
-  if (kQuantizationShift1Mask.Contains(tx_size)) {
-    return 1;
-  }
-  return 0;
-}
-
-// Input: 1d array index |index|, which indexes into a 2d array of width
-//     1 << |tx_width_log2|.
-// Output: 1d array index which indexes into a 2d array of width
-//     (1 << |tx_width_log2|) + kQuantizedCoefficientBufferPadding.
-int PaddedIndex(int index, int tx_width_log2) {
-  return index + MultiplyBy4(index >> tx_width_log2);
-}
+constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
 
 // Returns the minimum of |length| or |max|-|start|. This is used to clamp array
 // indices when accessing arrays whose bound is equal to |max|.
@@ -296,40 +286,151 @@ int GetNumElements(int length, int start, int max) {
   return std::min(length, max - start);
 }
 
+template <typename T>
+void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+  // Specialize every possible |columns| value (the values in
+  // kTransformWidth4x4[]) for better performance.
+  switch (columns) {
+    case 1:
+      MemSetBlock<T>(rows, 1, value, dst, stride);
+      break;
+    case 2:
+      MemSetBlock<T>(rows, 2, value, dst, stride);
+      break;
+    case 4:
+      MemSetBlock<T>(rows, 4, value, dst, stride);
+      break;
+    case 8:
+      MemSetBlock<T>(rows, 8, value, dst, stride);
+      break;
+    default:
+      assert(columns == 16);
+      MemSetBlock<T>(rows, 16, value, dst, stride);
+      break;
+  }
+}
+
 void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
                       TransformType tx_type,
                       TransformType transform_types[32][32]) {
   const int y_offset = y4 - block.row4x4;
   const int x_offset = x4 - block.column4x4;
-  static_assert(sizeof(transform_types[0][0]) == 1, "");
-  for (int i = 0; i < h4; ++i) {
-    memset(&transform_types[y_offset + i][x_offset], tx_type, w4);
-  }
+  TransformType* const dst = &transform_types[y_offset][x_offset];
+  SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
+}
+
+void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
+                         const MotionVector& mv_to_store, ptrdiff_t stride,
+                         int rows, int columns,
+                         ReferenceFrameType* reference_frame_row_start,
+                         MotionVector* mv) {
+  static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
+  do {
+    // Don't swap the order of the following two memory setting calls.
+    // Some ARM CPUs are quite sensitive to the order.
+    memset(reference_frame_row_start, reference_frame_to_store, columns);
+    std::fill(mv, mv + columns, mv_to_store);
+    reference_frame_row_start += stride;
+    mv += stride;
+  } while (--rows != 0);
+}
+
+// Inverse transform process assumes that the quantized coefficients are stored
+// as a virtual 2d array of size |tx_width| x tx_height. If transform width is
+// 64, then this assumption is broken because the scan order used for populating
+// the coefficients for such transforms is the same as the one used for
+// corresponding transform with width 32 (e.g. the scan order used for 64x16 is
+// the same as the one used for 32x16). So we must restore the coefficients to
+// their correct positions and clean the positions they occupied.
+template <typename ResidualType>
+void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
+                                  ResidualType* residual) {
+  if (tx_width != 64) return;
+  const int rows = clamped_tx_height - 2;
+  auto* src = residual + 32 * rows;
+  residual += 64 * rows;
+  // Process 2 rows in each loop in reverse order to avoid overwrite.
+  int x = rows >> 1;
+  do {
+    // The 2 rows can be processed in order.
+    memcpy(residual, src, 32 * sizeof(src[0]));
+    memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+    memset(src + 32, 0, 32 * sizeof(src[0]));
+    src -= 64;
+    residual -= 128;
+  } while (--x);
+  // Process the second row. The first row is already correct.
+  memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+  memset(src + 32, 0, 32 * sizeof(src[0]));
+}
+
+void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
+  // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
+  // and 5.11.54).
+  constexpr int kMvBorder4x4 = 4;
+  const int row_border = kMvBorder4x4 + block.height4x4;
+  const int column_border = kMvBorder4x4 + block.width4x4;
+  const int macroblocks_to_top_edge = -block.row4x4;
+  const int macroblocks_to_bottom_edge =
+      block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
+  const int macroblocks_to_left_edge = -block.column4x4;
+  const int macroblocks_to_right_edge =
+      block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
+  min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
+  min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
+  max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
+  max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
+}
+
+// Section 8.3.2 in the spec, under coeff_base_eob.
+int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
+  if (index == 0) return 0;
+  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+  const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+  const int tx_height = kTransformHeight[adjusted_tx_size];
+  if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
+  if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
+  return 3;
+}
+
+// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
+// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
+// the end of block case.
+int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
+                                TransformClass tx_class) {
+  if (pos == 0) return 0;
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  const int row = pos >> adjusted_tx_width_log2;
+  const int column = pos & (tx_width - 1);
+  // This return statement is equivalent to:
+  // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
+  //         (tx_class == kTransformClassHorizontal && column == 0) ||
+  //         (tx_class == kTransformClassVertical && row == 0))
+  //            ? 7
+  //            : 14;
+  return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
+                 static_cast<int>((row | column) < 2)) |
+                (tx_class & static_cast<int>(column == 0)) |
+                ((tx_class >> 1) & static_cast<int>(row == 0)));
 }
 
 }  // namespace
 
-Tile::Tile(
-    int tile_number, const uint8_t* const data, size_t size,
-    const ObuSequenceHeader& sequence_header,
-    const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
-    const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias,
-    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
-        reference_frames,
-    Array2D<TemporalMotionVector>* const motion_field_mv,
-    const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint,
-    const std::array<uint8_t, kWedgeMaskSize>& wedge_masks,
-    const SymbolDecoderContext& symbol_decoder_context,
-    SymbolDecoderContext* const saved_symbol_decoder_context,
-    const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
-    BlockParametersHolder* const block_parameters_holder,
-    Array2D<int16_t>* const cdef_index,
-    Array2D<TransformSize>* const inter_transform_sizes,
-    const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
-    ResidualBufferPool* const residual_buffer_pool,
-    DecoderScratchBufferPool* const decoder_scratch_buffer_pool,
-    BlockingCounterWithStatus* const pending_tiles)
+Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
+           const ObuSequenceHeader& sequence_header,
+           const ObuFrameHeader& frame_header,
+           RefCountedBuffer* const current_frame, const DecoderState& state,
+           FrameScratchBuffer* const frame_scratch_buffer,
+           const WedgeMaskArray& wedge_masks,
+           SymbolDecoderContext* const saved_symbol_decoder_context,
+           const SegmentationMap* prev_segment_ids,
+           PostFilter* const post_filter, const dsp::Dsp* const dsp,
+           ThreadPool* const thread_pool,
+           BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+           bool use_intra_prediction_buffer)
     : number_(tile_number),
+      row_(number_ / frame_header.tile_info.tile_columns),
+      column_(number_ % frame_header.tile_info.tile_columns),
       data_(data),
       size_(size),
       read_deltas_(false),
@@ -340,19 +441,18 @@ Tile::Tile(
       current_quantizer_index_(frame_header.quantizer.base_index),
       sequence_header_(sequence_header),
       frame_header_(frame_header),
-      current_frame_(*current_frame),
-      reference_frame_sign_bias_(reference_frame_sign_bias),
-      reference_frames_(reference_frames),
-      motion_field_mv_(motion_field_mv),
-      reference_order_hint_(reference_order_hint),
+      reference_frame_sign_bias_(state.reference_frame_sign_bias),
+      reference_frames_(state.reference_frame),
+      motion_field_(frame_scratch_buffer->motion_field),
+      reference_order_hint_(state.reference_order_hint),
       wedge_masks_(wedge_masks),
       reader_(data_, size_, frame_header_.enable_cdf_update),
-      symbol_decoder_context_(symbol_decoder_context),
+      symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
       saved_symbol_decoder_context_(saved_symbol_decoder_context),
       prev_segment_ids_(prev_segment_ids),
       dsp_(*dsp),
       post_filter_(*post_filter),
-      block_parameters_holder_(*block_parameters_holder),
+      block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
       quantizer_(sequence_header_.color_config.bitdepth,
                  &frame_header_.quantizer),
       residual_size_((sequence_header_.color_config.bitdepth == 8)
@@ -362,15 +462,20 @@ Tile::Tile(
           frame_header_.allow_intrabc
               ? (sequence_header_.use_128x128_superblock ? 3 : 5)
               : 1),
-      cdef_index_(*cdef_index),
-      inter_transform_sizes_(*inter_transform_sizes),
+      current_frame_(*current_frame),
+      cdef_index_(frame_scratch_buffer->cdef_index),
+      inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
       thread_pool_(thread_pool),
-      residual_buffer_pool_(residual_buffer_pool),
-      decoder_scratch_buffer_pool_(decoder_scratch_buffer_pool),
+      residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
+      tile_scratch_buffer_pool_(
+          &frame_scratch_buffer->tile_scratch_buffer_pool),
       pending_tiles_(pending_tiles),
-      build_bit_mask_when_parsing_(false) {
-  row_ = number_ / frame_header.tile_info.tile_columns;
-  column_ = number_ % frame_header.tile_info.tile_columns;
+      frame_parallel_(frame_parallel),
+      use_intra_prediction_buffer_(use_intra_prediction_buffer),
+      intra_prediction_buffer_(
+          use_intra_prediction_buffer_
+              ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+              : nullptr) {
   row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
   row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
   column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
@@ -382,16 +487,71 @@ Tile::Tile(
   superblock_columns_ =
       (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
       block_width4x4_log2;
-  // Enable multi-threading within a tile only if there are at least as many
-  // superblock columns as |intra_block_copy_lag_|.
-  split_parse_and_decode_ =
-      thread_pool_ != nullptr && superblock_columns_ > intra_block_copy_lag_;
+  // If |split_parse_and_decode_| is true, we do the necessary setup for
+  // splitting the parsing and the decoding steps. This is done in the following
+  // two cases:
+  //  1) If there is multi-threading within a tile (this is done if
+  //     |thread_pool_| is not nullptr and if there are more superblock
+  //     columns than |intra_block_copy_lag_|).
+  //  2) If |frame_parallel| is true.
+  split_parse_and_decode_ = (thread_pool_ != nullptr &&
+                             superblock_columns_ > intra_block_copy_lag_) ||
+                            frame_parallel;
+  if (frame_parallel_) {
+    reference_frame_progress_cache_.fill(INT_MIN);
+  }
   memset(delta_lf_, 0, sizeof(delta_lf_));
   delta_lf_all_zero_ = true;
-  YuvBuffer* const buffer = current_frame->buffer();
+  const YuvBuffer& buffer = post_filter_.frame_buffer();
   for (int plane = 0; plane < PlaneCount(); ++plane) {
-    buffer_[plane].Reset(buffer->height(plane) + buffer->bottom_border(plane),
-                         buffer->stride(plane), buffer->data(plane));
+    // Verify that the borders are big enough for Reconstruct(). max_tx_length
+    // is the maximum value of tx_width and tx_height for the plane.
+    const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
+    // Reconstruct() may overwrite on the right. Since the right border of a
+    // row is followed in memory by the left border of the next row, the
+    // number of extra pixels to the right of a row is at least the sum of the
+    // left and right borders.
+    //
+    // Note: This assertion actually checks the sum of the left and right
+    // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
+    // and vertically shifted version of |buffer|. Since the sum of the left and
+    // right borders is not changed by the shift, we can just check the sum of
+    // the left and right borders of |buffer|.
+    assert(buffer.left_border(plane) + buffer.right_border(plane) >=
+           max_tx_length - 1);
+    // Reconstruct() may overwrite on the bottom. We need an extra border row
+    // on the bottom because we need the left border of that row.
+    //
+    // Note: This assertion checks the bottom border of
+    // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
+    // shift that the PostFilter constructor applied to |buffer| and reduce the
+    // bottom border by that amount.
+#ifndef NDEBUG
+    const int vertical_shift = static_cast<int>(
+        (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
+        buffer.stride(plane));
+    const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
+    assert(bottom_border >= max_tx_length);
+#endif
+    // In AV1, a transform block of height H starts at a y coordinate that is
+    // a multiple of H. If a transform block at the bottom of the frame has
+    // height H, then Reconstruct() will write up to the row with index
+    // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
+    // rows Reconstruct() may write to is
+    // Align(buffer.height(plane), max_tx_length).
+    buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
+                         buffer.stride(plane),
+                         post_filter_.GetUnfilteredBuffer(plane));
+    const int plane_height =
+        RightShiftWithRounding(frame_header_.height, subsampling_y_[plane]);
+    deblock_row_limit_[plane] =
+        std::min(frame_header_.rows4x4, DivideBy4(plane_height + 3)
+                                            << subsampling_y_[plane]);
+    const int plane_width =
+        RightShiftWithRounding(frame_header_.width, subsampling_x_[plane]);
+    deblock_column_limit_[plane] =
+        std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3)
+                                               << subsampling_x_[plane]);
   }
 }
 
@@ -418,7 +578,10 @@ bool Tile::Init() {
       return false;
     }
   } else {
-    residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(32, 4096 * residual_size_);
+    // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
+    // checks when parsing quantized coefficients.
+    residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
+        32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
     if (residual_buffer_ == nullptr) {
       LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
       return false;
@@ -429,62 +592,165 @@ bool Tile::Init() {
       return false;
     }
   }
+  if (frame_header_.use_ref_frame_mvs) {
+    assert(sequence_header_.enable_order_hint);
+    SetupMotionField(frame_header_, current_frame_, reference_frames_,
+                     row4x4_start_, row4x4_end_, column4x4_start_,
+                     column4x4_end_, &motion_field_);
+  }
+  ResetLoopRestorationParams();
   return true;
 }
 
-bool Tile::Decode(bool is_main_thread) {
-  if (!Init()) {
-    pending_tiles_->Decrement(false);
-    return false;
+template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+bool Tile::ProcessSuperBlockRow(int row4x4,
+                                TileScratchBuffer* const scratch_buffer) {
+  if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
+  assert(scratch_buffer != nullptr);
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
+       column4x4 += block_width4x4) {
+    if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer,
+                           processing_mode)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
+                   row4x4, column4x4);
+      return false;
+    }
   }
-  if (frame_header_.use_ref_frame_mvs) {
-    SetupMotionField(sequence_header_, frame_header_, current_frame_,
-                     reference_frames_, motion_field_mv_, row4x4_start_,
-                     row4x4_end_, column4x4_start_, column4x4_end_);
+  if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
+    SaveSymbolDecoderContext();
   }
-  ResetLoopRestorationParams();
+  if (processing_mode == kProcessingModeDecodeOnly ||
+      processing_mode == kProcessingModeParseAndDecode) {
+    PopulateIntraPredictionBuffer(row4x4);
+  }
+  return true;
+}
+
+// Used in frame parallel mode. The symbol decoder context need not be saved in
+// this case since it was done when parsing was complete.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+// Used in non frame parallel mode.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+
+void Tile::SaveSymbolDecoderContext() {
+  if (frame_header_.enable_frame_end_update_cdf &&
+      number_ == frame_header_.tile_info.context_update_id) {
+    *saved_symbol_decoder_context_ = symbol_decoder_context_;
+  }
+}
+
+bool Tile::ParseAndDecode() {
   // If this is the main thread, we build the loop filter bit masks when parsing
   // so that it happens in the current thread. This ensures that the main thread
   // does as much work as possible.
-  build_bit_mask_when_parsing_ = is_main_thread;
   if (split_parse_and_decode_) {
-    if (!ThreadedDecode()) return false;
-  } else {
-    const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
-    std::unique_ptr<DecoderScratchBuffer> scratch_buffer =
-        decoder_scratch_buffer_pool_->Get();
-    if (scratch_buffer == nullptr) {
+    if (!ThreadedParseAndDecode()) return false;
+    SaveSymbolDecoderContext();
+    return true;
+  }
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    pending_tiles_->Decrement(false);
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4) {
+    if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+            row4x4, scratch_buffer.get())) {
       pending_tiles_->Decrement(false);
-      LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
       return false;
     }
-    for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
-         row4x4 += block_width4x4) {
-      for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
-           column4x4 += block_width4x4) {
-        if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4,
-                               scratch_buffer.get(),
-                               kProcessingModeParseAndDecode)) {
-          pending_tiles_->Decrement(false);
-          LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
-                       row4x4, column4x4);
-          return false;
-        }
-      }
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  pending_tiles_->Decrement(true);
+  return true;
+}
+
+bool Tile::Parse() {
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4) {
+    if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
+            row4x4, scratch_buffer.get())) {
+      return false;
     }
-    decoder_scratch_buffer_pool_->Release(std::move(scratch_buffer));
   }
-  if (frame_header_.enable_frame_end_update_cdf &&
-      number_ == frame_header_.tile_info.context_update_id) {
-    *saved_symbol_decoder_context_ = symbol_decoder_context_;
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  SaveSymbolDecoderContext();
+  return true;
+}
+
+bool Tile::Decode(
+    std::mutex* const mutex, int* const superblock_row_progress,
+    std::condition_variable* const superblock_row_progress_condvar) {
+  const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+  const int block_width4x4_log2 =
+      sequence_header_.use_128x128_superblock ? 5 : 4;
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
   }
-  if (!split_parse_and_decode_) {
-    pending_tiles_->Decrement(true);
+  for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+       row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+    if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+            row4x4, scratch_buffer.get())) {
+      return false;
+    }
+    if (post_filter_.DoDeblock()) {
+      // Apply vertical deblock filtering for all the columns in this tile
+      // except for the first 64 columns.
+      post_filter_.ApplyDeblockFilter(
+          kLoopFilterTypeVertical, row4x4,
+          column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+          block_width4x4);
+      // If this is the first superblock row of the tile, then we cannot apply
+      // horizontal deblocking here since we don't know if the top row is
+      // available. So it will be done by the calling thread in that case.
+      if (row4x4 != row4x4_start_) {
+        // Apply horizontal deblock filtering for all the columns in this tile
+        // except for the first and the last 64 columns.
+        // Note about the last tile of each row: For the last tile,
+        // |column4x4_end_| may not be a multiple of 16. In that case it is
+        // still okay to simply subtract 16 since ApplyDeblockFilter() will
+        // only do the filters in increments of 64 columns (or 32 columns for
+        // chroma with subsampling).
+        post_filter_.ApplyDeblockFilter(
+            kLoopFilterTypeHorizontal, row4x4,
+            column4x4_start_ + kNum4x4InLoopFilterUnit,
+            column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+      }
+    }
+    bool notify;
+    {
+      std::unique_lock<std::mutex> lock(*mutex);
+      notify = ++superblock_row_progress[index] ==
+               frame_header_.tile_info.tile_columns;
+    }
+    if (notify) {
+      // We are done decoding this superblock row. Notify the post filtering
+      // thread.
+      superblock_row_progress_condvar[index].notify_one();
+    }
   }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
   return true;
 }
 
-bool Tile::ThreadedDecode() {
+bool Tile::ThreadedParseAndDecode() {
   {
     std::lock_guard<std::mutex> lock(threading_.mutex);
     if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
@@ -499,8 +765,8 @@ bool Tile::ThreadedDecode() {
   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
 
   // Begin parsing.
-  std::unique_ptr<DecoderScratchBuffer> scratch_buffer =
-      decoder_scratch_buffer_pool_->Get();
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
   if (scratch_buffer == nullptr) {
     pending_tiles_->Decrement(false);
     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
@@ -535,7 +801,7 @@ bool Tile::ThreadedDecode() {
     std::lock_guard<std::mutex> lock(threading_.mutex);
     if (threading_.abort) break;
   }
-  decoder_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
 
   // We are done parsing. We can return here since the calling thread will make
   // sure that it waits for all the superblocks to be decoded.
@@ -593,13 +859,13 @@ void Tile::DecodeSuperBlock(int row_index, int column_index,
                             int block_width4x4) {
   const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
   const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
-  std::unique_ptr<DecoderScratchBuffer> scratch_buffer =
-      decoder_scratch_buffer_pool_->Get();
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
   bool ok = scratch_buffer != nullptr;
   if (ok) {
     ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4,
                            scratch_buffer.get(), kProcessingModeDecodeOnly);
-    decoder_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+    tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
   }
   std::unique_lock<std::mutex> lock(threading_.mutex);
   if (ok) {
@@ -647,9 +913,38 @@ void Tile::DecodeSuperBlock(int row_index, int column_index,
   }
 }
 
-bool Tile::IsInside(int row4x4, int column4x4) const {
-  return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_ &&
-         column4x4 >= column4x4_start_ && column4x4 < column4x4_end_;
+void Tile::PopulateIntraPredictionBuffer(int row4x4) {
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
+    return;
+  }
+  const size_t pixel_size =
+      (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                   : sizeof(uint16_t));
+  for (int plane = 0; plane < PlaneCount(); ++plane) {
+    const int row_to_copy =
+        (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
+    const size_t pixels_to_copy =
+        (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
+         subsampling_x_[plane]) *
+        pixel_size;
+    const size_t column_start =
+        MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+    void* start;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (sequence_header_.color_config.bitdepth > 8) {
+      Array2DView<uint16_t> buffer(
+          buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+          reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+      start = &buffer[row_to_copy][column_start];
+    } else  // NOLINT
+#endif
+    {
+      start = &buffer_[plane][row_to_copy][column_start];
+    }
+    memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+           start, pixels_to_copy);
+  }
 }
 
 int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
@@ -660,7 +955,7 @@ int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
 
   const int tx_width = kTransformWidth[tx_size];
   const int tx_height = kTransformHeight[tx_size];
-  const BlockSize plane_size = block.residual_size[GetPlaneType(plane)];
+  const BlockSize plane_size = block.residual_size[plane];
   const int block_width = kBlockWidthPixels[plane_size];
   const int block_height = kBlockHeightPixels[plane_size];
 
@@ -785,150 +1080,167 @@ void Tile::ReadTransformType(const Block& block, int x4, int y4,
                    kTransformHeight4x4[tx_size], tx_type, transform_types_);
 }
 
-// Section 8.3.2 in the spec, under coeff_base_eob.
-int Tile::GetCoeffBaseContextEob(TransformSize tx_size, int index) {
-  if (index == 0) return 0;
-  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
-  const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
-  const int tx_height = kTransformHeight[adjusted_tx_size];
-  if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
-  if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
-  return 3;
-}
-
-// Section 8.3.2 in the spec, under coeff_base.
-int Tile::GetCoeffBaseContext2D(const int32_t* const quantized_buffer,
-                                TransformSize tx_size,
-                                int adjusted_tx_width_log2, uint16_t pos) {
-  if (pos == 0) return 0;
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the two right neighbors and the
+// one bottom-right neighbor may be out of boundary. We don't check the right
+// boundary for them, because the out of boundary neighbors project to positions
+// above the diagonal line which goes through the current coefficient and these
+// positions are still all 0s according to the diagonal scan order.
+template <typename ResidualType>
+void Tile::ReadCoeffBase2D(
+    const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
+    int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    ResidualType* const quantized_buffer) {
   const int tx_width = 1 << adjusted_tx_width_log2;
-  const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding;
-  const int32_t* const quantized =
-      &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)];
-  const int context = std::min(
-      4, DivideBy2(1 + (std::min(quantized[1], 3) +                    // {0, 1}
-                        std::min(quantized[padded_tx_width], 3) +      // {1, 0}
-                        std::min(quantized[padded_tx_width + 1], 3) +  // {1, 1}
-                        std::min(quantized[2], 3) +                    // {0, 2}
-                        std::min(quantized[MultiplyBy2(padded_tx_width)],
-                                 3))));  // {2, 0}
-  const int row = pos >> adjusted_tx_width_log2;
-  const int column = pos & (tx_width - 1);
-  return context + kCoeffBaseContextOffset[tx_size][std::min(row, 4)]
-                                          [std::min(column, 4)];
+  int i = eob - 2;
+  do {
+    constexpr auto threshold = static_cast<ResidualType>(3);
+    const uint16_t pos = scan[i];
+    const int row = pos >> adjusted_tx_width_log2;
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    int context;
+    if (pos == 0) {
+      context = 0;
+    } else {
+      context = std::min(
+          4, DivideBy2(
+                 1 + (std::min(quantized[1], threshold) +             // {0, 1}
+                      std::min(quantized[tx_width], threshold) +      // {1, 0}
+                      std::min(quantized[tx_width + 1], threshold) +  // {1, 1}
+                      std::min(quantized[2], threshold) +             // {0, 2}
+                      std::min(quantized[MultiplyBy2(tx_width)],
+                               threshold))));  // {2, 0}
+      context += kCoeffBaseContextOffset[tx_size][std::min(row, 4)]
+                                        [std::min(column, 4)];
+    }
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
+                                      quantized[tx_width] +       // {1, 0}
+                                      quantized[tx_width + 1]));  // {1, 1}
+      if (pos != 0) {
+        context += 14 >> static_cast<int>((row | column) < 2);
+      }
+      level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+    }
+    quantized[0] = level;
+  } while (--i >= 0);
 }
 
-// Section 8.3.2 in the spec, under coeff_base.
-int Tile::GetCoeffBaseContextHorizontal(const int32_t* const quantized_buffer,
-                                        TransformSize /*tx_size*/,
-                                        int adjusted_tx_width_log2,
-                                        uint16_t pos) {
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the four right neighbors may be
+// out of boundary. We don't do the boundary check for the first three right
+// neighbors, because even for the transform blocks with smallest width 4, the
+// first three out of boundary neighbors project to positions left of the
+// current coefficient and these positions are still all 0s according to the
+// column scan order. However, when transform block width is 4 and the current
+// coefficient is on the right boundary, its fourth right neighbor projects to
+// the position below it in the same column, which could be nonzero. Therefore,
+// we must skip the fourth right neighbor. To keep it simple, we always do the
+// boundary check for the fourth right neighbor of every coefficient.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseHorizontal(
+    const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/,
+    int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    ResidualType* const quantized_buffer) {
   const int tx_width = 1 << adjusted_tx_width_log2;
-  const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding;
-  const int32_t* const quantized =
-      &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)];
-  const int context = std::min(
-      4, DivideBy2(1 + (std::min(quantized[1], 3) +                // {0, 1}
-                        std::min(quantized[padded_tx_width], 3) +  // {1, 0}
-                        std::min(quantized[2], 3) +                // {0, 2}
-                        std::min(quantized[3], 3) +                // {0, 3}
-                        std::min(quantized[4], 3))));              // {0, 4}
-  const int index = pos & (tx_width - 1);
-  return context + kCoeffBasePositionContextOffset[std::min(index, 2)];
+  int i = eob - 2;
+  do {
+    constexpr auto threshold = static_cast<ResidualType>(3);
+    const uint16_t pos = scan[i];
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    int context = std::min(
+        4,
+        DivideBy2(1 +
+                  (std::min(quantized[1], threshold) +         // {0, 1}
+                   std::min(quantized[tx_width], threshold) +  // {1, 0}
+                   std::min(quantized[2], threshold) +         // {0, 2}
+                   std::min(quantized[3], threshold) +         // {0, 3}
+                   std::min(quantized[4],
+                            static_cast<ResidualType>(
+                                (column + 4 < tx_width) ? 3 : 0)))));  // {0, 4}
+    context += kCoeffBasePositionContextOffset[column];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
+                                      quantized[tx_width] +  // {1, 0}
+                                      quantized[2]));        // {0, 2}
+      if (pos != 0) {
+        context += 14 >> static_cast<int>(column == 0);
+      }
+      level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+    }
+    quantized[0] = level;
+  } while (--i >= 0);
 }
 
-// Section 8.3.2 in the spec, under coeff_base.
-int Tile::GetCoeffBaseContextVertical(const int32_t* const quantized_buffer,
-                                      TransformSize /*tx_size*/,
-                                      int adjusted_tx_width_log2,
-                                      uint16_t pos) {
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// Right boundary check is performed explicitly.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseVertical(
+    const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/,
+    int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    ResidualType* const quantized_buffer) {
   const int tx_width = 1 << adjusted_tx_width_log2;
-  const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding;
-  const int32_t* const quantized =
-      &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)];
-  const int context = std::min(
-      4, DivideBy2(1 + (std::min(quantized[1], 3) +                // {0, 1}
-                        std::min(quantized[padded_tx_width], 3) +  // {1, 0}
-                        std::min(quantized[MultiplyBy2(padded_tx_width)],
-                                 3) +                                  // {2, 0}
-                        std::min(quantized[padded_tx_width * 3], 3) +  // {3, 0}
-                        std::min(quantized[MultiplyBy4(padded_tx_width)],
-                                 3))));  // {4, 0}
-
-  const int index = pos >> adjusted_tx_width_log2;
-  return context + kCoeffBasePositionContextOffset[std::min(index, 2)];
-}
-
-// Section 8.3.2 in the spec, under coeff_br.
-int Tile::GetCoeffBaseRangeContext2D(const int32_t* const quantized_buffer,
-                                     int adjusted_tx_width_log2, int pos) {
-  const uint8_t tx_width = 1 << adjusted_tx_width_log2;
-  const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding;
-  const int32_t* const quantized =
-      &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)];
-  const int context = std::min(
-      6, DivideBy2(
-             1 +
-             std::min(quantized[1],
-                      kQuantizerCoefficientBaseRangeContextClamp) +  // {0, 1}
-             std::min(quantized[padded_tx_width],
-                      kQuantizerCoefficientBaseRangeContextClamp) +  // {1, 0}
-             std::min(quantized[padded_tx_width + 1],
-                      kQuantizerCoefficientBaseRangeContextClamp)));  // {1, 1}
-  if (pos == 0) return context;
-  const int row = pos >> adjusted_tx_width_log2;
-  const int column = pos & (tx_width - 1);
-  return context + (((row | column) < 2) ? 7 : 14);
-}
-
-// Section 8.3.2 in the spec, under coeff_br.
-int Tile::GetCoeffBaseRangeContextHorizontal(
-    const int32_t* const quantized_buffer, int adjusted_tx_width_log2,
-    int pos) {
-  const uint8_t tx_width = 1 << adjusted_tx_width_log2;
-  const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding;
-  const int32_t* const quantized =
-      &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)];
-  const int context = std::min(
-      6, DivideBy2(
-             1 +
-             std::min(quantized[1],
-                      kQuantizerCoefficientBaseRangeContextClamp) +  // {0, 1}
-             std::min(quantized[padded_tx_width],
-                      kQuantizerCoefficientBaseRangeContextClamp) +  // {1, 0}
-             std::min(quantized[2],
-                      kQuantizerCoefficientBaseRangeContextClamp)));  // {0, 2}
-  if (pos == 0) return context;
-  const int column = pos & (tx_width - 1);
-  return context + ((column == 0) ? 7 : 14);
-}
-
-// Section 8.3.2 in the spec, under coeff_br.
-int Tile::GetCoeffBaseRangeContextVertical(
-    const int32_t* const quantized_buffer, int adjusted_tx_width_log2,
-    int pos) {
-  const uint8_t tx_width = 1 << adjusted_tx_width_log2;
-  const int padded_tx_width = tx_width + kQuantizedCoefficientBufferPadding;
-  const int32_t* const quantized =
-      &quantized_buffer[PaddedIndex(pos, adjusted_tx_width_log2)];
-  const int context = std::min(
-      6, DivideBy2(
-             1 +
-             std::min(quantized[1],
-                      kQuantizerCoefficientBaseRangeContextClamp) +  // {0, 1}
-             std::min(quantized[padded_tx_width],
-                      kQuantizerCoefficientBaseRangeContextClamp) +  // {1, 0}
-             std::min(quantized[MultiplyBy2(padded_tx_width)],
-                      kQuantizerCoefficientBaseRangeContextClamp)));  // {2, 0}
-  if (pos == 0) return context;
-  const int row = pos >> adjusted_tx_width_log2;
-  return context + ((row == 0) ? 7 : 14);
+  int i = eob - 2;
+  do {
+    constexpr auto threshold = static_cast<ResidualType>(3);
+    const uint16_t pos = scan[i];
+    const int row = pos >> adjusted_tx_width_log2;
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
+    int context =
+        std::min(4, DivideBy2(1 + (std::min(quantized_column1, 3) +  // {0, 1}
+                                   std::min(quantized[tx_width],
+                                            threshold) +  // {1, 0}
+                                   std::min(quantized[MultiplyBy2(tx_width)],
+                                            threshold) +  // {2, 0}
+                                   std::min(quantized[tx_width * 3],
+                                            threshold) +  // {3, 0}
+                                   std::min(quantized[MultiplyBy4(tx_width)],
+                                            threshold))));  // {4, 0}
+    context += kCoeffBasePositionContextOffset[row];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      context =
+          std::min(6, DivideBy2(1 + quantized_column1 +              // {0, 1}
+                                quantized[tx_width] +                // {1, 0}
+                                quantized[MultiplyBy2(tx_width)]));  // {2, 0}
+      if (pos != 0) {
+        context += 14 >> static_cast<int>(row == 0);
+      }
+      level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+    }
+    quantized[0] = level;
+  } while (--i >= 0);
 }
 
 int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
   const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
-  int dc_sign = std::accumulate(
+  // Make dc_sign an int8_t so that std::accumulate() avoids sign extension.
+  int8_t dc_sign = std::accumulate(
       dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
   dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
@@ -938,6 +1250,8 @@ int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
   //   if (dc_sign < 0) return 1;
   //   if (dc_sign > 0) return 2;
   //   return 0;
+  // And it is better than:
+  //   return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
   return static_cast<int>(dc_sign < 0) +
          MultiplyBy2(static_cast<int>(dc_sign > 0));
 }
@@ -1020,23 +1334,21 @@ void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
   }
 }
 
-template <bool is_dc_coefficient>
+template <typename ResidualType, bool is_dc_coefficient>
 bool Tile::ReadSignAndApplyDequantization(
-    const Block& block, int32_t* const quantized_buffer,
-    const uint16_t* const scan, int i, int adjusted_tx_width_log2, int tx_width,
-    int q_value, const uint8_t* const quantizer_matrix, int shift,
-    int min_value, int max_value, uint16_t* const dc_sign_cdf,
-    int8_t* const dc_category, int* const coefficient_level) {
-  int pos = is_dc_coefficient ? 0 : scan[i];
-  const int pos_index =
-      is_dc_coefficient ? 0 : PaddedIndex(pos, adjusted_tx_width_log2);
-  // If quantized_buffer[pos_index] is zero, then the rest of the function has
-  // no effect.
-  if (quantized_buffer[pos_index] == 0) return true;
-  const bool sign = is_dc_coefficient ? reader_.ReadSymbol(dc_sign_cdf)
-                                      : static_cast<bool>(reader_.ReadBit());
-  if (quantized_buffer[pos_index] >
-      kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
+    const uint16_t* const scan, int i, int q_value,
+    const uint8_t* const quantizer_matrix, int shift, int max_value,
+    uint16_t* const dc_sign_cdf, int8_t* const dc_category,
+    int* const coefficient_level, ResidualType* residual_buffer) {
+  const int pos = is_dc_coefficient ? 0 : scan[i];
+  // If residual_buffer[pos] is zero, then the rest of the function has no
+  // effect.
+  int level = residual_buffer[pos];
+  if (level == 0) return true;
+  const int sign = is_dc_coefficient
+                       ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
+                       : reader_.ReadBit();
+  if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
     int length = 0;
     bool golomb_length_bit = false;
     do {
@@ -1051,13 +1363,13 @@ bool Tile::ReadSignAndApplyDequantization(
     for (int i = length - 2; i >= 0; --i) {
       x = (x << 1) | reader_.ReadBit();
     }
-    quantized_buffer[pos_index] += x - 1;
+    level += x - 1;
   }
-  if (is_dc_coefficient && quantized_buffer[0] > 0) {
-    *dc_category = sign ? -1 : 1;
+  if (is_dc_coefficient) {
+    *dc_category = (sign != 0) ? -1 : 1;
   }
-  quantized_buffer[pos_index] &= 0xfffff;
-  *coefficient_level += quantized_buffer[pos_index];
+  level &= 0xfffff;
+  *coefficient_level += level;
   // Apply dequantization. Step 1 of section 7.12.3 in the spec.
   int q = q_value;
   if (quantizer_matrix != nullptr) {
@@ -1065,34 +1377,21 @@ bool Tile::ReadSignAndApplyDequantization(
   }
   // The intermediate multiplication can exceed 32 bits, so it has to be
   // performed by promoting one of the values to int64_t.
-  int32_t dequantized_value =
-      (static_cast<int64_t>(q) * quantized_buffer[pos_index]) & 0xffffff;
+  int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
   dequantized_value >>= shift;
-  if (sign) {
-    dequantized_value = -dequantized_value;
-  }
-  // Inverse transform process assumes that the quantized coefficients are
-  // stored as a virtual 2d array of size |tx_width| x |tx_height|. If
-  // transform width is 64, then this assumption is broken because the scan
-  // order used for populating the coefficients for such transforms is the
-  // same as the one used for corresponding transform with width 32 (e.g. the
-  // scan order used for 64x16 is the same as the one used for 32x16). So we
-  // have to recompute the value of pos so that it reflects the index of the
-  // 2d array of size 64 x |tx_height|.
-  if (!is_dc_coefficient && tx_width == 64) {
-    const int row_index = DivideBy32(pos);
-    const int column_index = Mod32(pos);
-    pos = MultiplyBy64(row_index) + column_index;
-  }
-  if (sequence_header_.color_config.bitdepth == 8) {
-    auto* const residual_buffer = reinterpret_cast<int16_t*>(*block.residual);
-    residual_buffer[pos] = Clip3(dequantized_value, min_value, max_value);
-#if LIBGAV1_MAX_BITDEPTH >= 10
-  } else {
-    auto* const residual_buffer = reinterpret_cast<int32_t*>(*block.residual);
-    residual_buffer[pos] = Clip3(dequantized_value, min_value, max_value);
-#endif
-  }
+  // At this point:
+  //   * |dequantized_value| is always non-negative.
+  //   * |sign| can be either 0 or 1.
+  //   * min_value = -(max_value + 1).
+  // We need to apply the following:
+  // dequantized_value = sign ? -dequantized_value : dequantized_value;
+  // dequantized_value = Clip3(dequantized_value, min_value, max_value);
+  //
+  // Note that -x == ~(x - 1).
+  //
+  // Now, the above two lines can be done with a std::min and xor as follows:
+  dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
+  residual_buffer[pos] = dequantized_value;
   return true;
 }
 
@@ -1109,10 +1408,11 @@ int Tile::ReadCoeffBaseRange(int clamped_tx_size_context, int cdf_context,
   return level;
 }
 
-int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane,
-                                        int start_x, int start_y,
-                                        TransformSize tx_size,
-                                        TransformType* const tx_type) {
+template <typename ResidualType>
+int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
+                                    int start_x, int start_y,
+                                    TransformSize tx_size,
+                                    TransformType* const tx_type) {
   const int x4 = DivideBy4(start_x);
   const int y4 = DivideBy4(start_y);
   const int w4 = kTransformWidth4x4[tx_size];
@@ -1134,19 +1434,15 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane,
   }
   const int tx_width = kTransformWidth[tx_size];
   const int tx_height = kTransformHeight[tx_size];
-  memset(*block.residual, 0, tx_width * tx_height * residual_size_);
-  const int clamped_tx_width = std::min(tx_width, 32);
+  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+  const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+  const int tx_padding =
+      (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
+  auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
+  // Clear padding to avoid bottom boundary checks when parsing quantized
+  // coefficients.
+  memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
   const int clamped_tx_height = std::min(tx_height, 32);
-  const int padded_tx_width =
-      clamped_tx_width + kQuantizedCoefficientBufferPadding;
-  const int padded_tx_height =
-      clamped_tx_height + kQuantizedCoefficientBufferPadding;
-  int32_t* const quantized = block.scratch_buffer->quantized_buffer;
-  // Only the first |padded_tx_width| * |padded_tx_height| values of |quantized|
-  // will be used by this function and the functions to which it is passed into.
-  // So we simply need to zero out those values before it is being used.
-  memset(quantized, 0,
-         padded_tx_width * padded_tx_height * sizeof(quantized[0]));
   if (plane == kPlaneY) {
     ReadTransformType(block, x4, y4, tx_size);
   }
@@ -1181,9 +1477,9 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane,
       cdf = symbol_decoder_context_.eob_pt_1024_cdf[plane_type];
       break;
   }
-  const int16_t eob_pt =
+  const int eob_pt =
       1 + reader_.ReadSymbol(cdf, kEobPt16SymbolCount + eob_multi_size);
-  int16_t eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
+  int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
   if (eob_pt >= 3) {
     context = eob_pt - 3;
     const bool eob_extra = reader_.ReadSymbol(
@@ -1199,23 +1495,6 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane,
     }
   }
   const uint16_t* scan = kScan[tx_class][tx_size];
-  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
-  const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
-  // Lookup used to call the right variant of GetCoeffBaseContext*() based on
-  // the transform class.
-  static constexpr int (Tile::*kGetCoeffBaseContextFunc[])(
-      const int32_t*, TransformSize, int, uint16_t) = {
-      &Tile::GetCoeffBaseContext2D, &Tile::GetCoeffBaseContextHorizontal,
-      &Tile::GetCoeffBaseContextVertical};
-  auto get_coeff_base_context_func = kGetCoeffBaseContextFunc[tx_class];
-  // Lookup used to call the right variant of GetCoeffBaseRangeContext*() based
-  // on the transform class.
-  static constexpr int (Tile::*kGetCoeffBaseRangeContextFunc[])(
-      const int32_t*, int, int) = {&Tile::GetCoeffBaseRangeContext2D,
-                                   &Tile::GetCoeffBaseRangeContextHorizontal,
-                                   &Tile::GetCoeffBaseRangeContextVertical};
-  auto get_coeff_base_range_context_func =
-      kGetCoeffBaseRangeContextFunc[tx_class];
   const int clamped_tx_size_context = std::min(tx_size_context, 3);
   // Read the last coefficient.
   {
@@ -1227,36 +1506,37 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane,
                     .coeff_base_eob_cdf[tx_size_context][plane_type][context],
                 kCoeffBaseEobSymbolCount);
     if (level > kNumQuantizerBaseLevels) {
-      level += ReadCoeffBaseRange(clamped_tx_size_context,
-                                  (this->*get_coeff_base_range_context_func)(
-                                      quantized, adjusted_tx_width_log2, pos),
-                                  plane_type);
+      level += ReadCoeffBaseRange(
+          clamped_tx_size_context,
+          GetCoeffBaseRangeContextEob(adjusted_tx_width_log2, pos, tx_class),
+          plane_type);
     }
-    quantized[PaddedIndex(pos, adjusted_tx_width_log2)] = level;
+    residual[pos] = level;
+  }
+  if (eob > 1) {
+    // Read all the other coefficients.
+    // Lookup used to call the right variant of ReadCoeffBase*() based on the
+    // transform class.
+    static constexpr void (Tile::*kGetCoeffBaseFunc[])(
+        const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
+        int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+        uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+        ResidualType* quantized_buffer) = {
+        &Tile::ReadCoeffBase2D<ResidualType>,
+        &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+        &Tile::ReadCoeffBaseVertical<ResidualType>};
+    (this->*kGetCoeffBaseFunc[tx_class])(
+        scan, plane_type, tx_size, clamped_tx_size_context,
+        adjusted_tx_width_log2, eob,
+        symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
+        residual);
   }
-  // Read all the other coefficients.
-  for (int i = eob - 2; i >= 0; --i) {
-    const uint16_t pos = scan[i];
-    context = (this->*get_coeff_base_context_func)(quantized, tx_size,
-                                                   adjusted_tx_width_log2, pos);
-    int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(
-        symbol_decoder_context_
-            .coeff_base_cdf[tx_size_context][plane_type][context]);
-    if (level > kNumQuantizerBaseLevels) {
-      level += ReadCoeffBaseRange(clamped_tx_size_context,
-                                  (this->*get_coeff_base_range_context_func)(
-                                      quantized, adjusted_tx_width_log2, pos),
-                                  plane_type);
-    }
-    quantized[PaddedIndex(pos, adjusted_tx_width_log2)] = level;
-  }
-  const int min_value = -(1 << (7 + sequence_header_.color_config.bitdepth));
   const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
   const int current_quantizer_index = GetQIndex(
       frame_header_.segmentation, bp.segment_id, current_quantizer_index_);
   const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
   const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
-  const int shift = GetQuantizationShift(tx_size);
+  const int shift = kQuantizationShift[tx_size];
   const uint8_t* const quantizer_matrix =
       (frame_header_.quantizer.use_matrix &&
        *tx_type < kTransformTypeIdentityIdentity &&
@@ -1268,24 +1548,27 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane,
   int coefficient_level = 0;
   int8_t dc_category = 0;
   uint16_t* const dc_sign_cdf =
-      (quantized[0] != 0)
+      (residual[0] != 0)
           ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
                 x4, y4, w4, h4, plane)]
           : nullptr;
   assert(scan[0] == 0);
-  if (!ReadSignAndApplyDequantization</*is_dc_coefficient=*/true>(
-          block, quantized, scan, 0, adjusted_tx_width_log2, tx_width,
-          dc_q_value, quantizer_matrix, shift, min_value, max_value,
-          dc_sign_cdf, &dc_category, &coefficient_level)) {
+  if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
+          scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
+          &dc_category, &coefficient_level, residual)) {
     return -1;
   }
-  for (int i = 1; i < eob; ++i) {
-    if (!ReadSignAndApplyDequantization</*is_dc_coefficient=*/false>(
-            block, quantized, scan, i, adjusted_tx_width_log2, tx_width,
-            ac_q_value, quantizer_matrix, shift, min_value, max_value, nullptr,
-            nullptr, &coefficient_level)) {
-      return -1;
-    }
+  if (eob > 1) {
+    int i = 1;
+    do {
+      if (!ReadSignAndApplyDequantization<ResidualType,
+                                          /*is_dc_coefficient=*/false>(
+              scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
+              nullptr, &coefficient_level, residual)) {
+        return -1;
+      }
+    } while (++i < eob);
+    MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
   }
   SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
                      dc_category);
@@ -1295,6 +1578,25 @@ int16_t Tile::ReadTransformCoefficients(const Block& block, Plane plane,
   return eob;
 }
 
+// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
+// |function| depending on the value of |sequence_header_.color_config.bitdepth|
+// with the variadic arguments.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+#define CALL_BITDEPTH_FUNCTION(function, ...)         \
+  do {                                                \
+    if (sequence_header_.color_config.bitdepth > 8) { \
+      function<uint16_t>(__VA_ARGS__);                \
+    } else {                                          \
+      function<uint8_t>(__VA_ARGS__);                 \
+    }                                                 \
+  } while (false)
+#else
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+  do {                                        \
+    function<uint8_t>(__VA_ARGS__);           \
+  } while (false)
+#endif
+
 bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
                           int base_y, TransformSize tx_size, int x, int y,
                           ProcessingMode mode) {
@@ -1317,15 +1619,8 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
                          mode == kProcessingModeParseAndDecode;
   if (do_decode && !bp.is_inter) {
     if (bp.palette_mode_info.size[GetPlaneType(plane)] > 0) {
-      if (sequence_header_.color_config.bitdepth == 8) {
-        PalettePrediction<uint8_t>(block, plane, start_x, start_y, x, y,
-                                   tx_size);
-#if LIBGAV1_MAX_BITDEPTH >= 10
-      } else {
-        PalettePrediction<uint16_t>(block, plane, start_x, start_y, x, y,
-                                    tx_size);
-#endif
-      }
+      CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
+                             x, y, tx_size);
     } else {
       const PredictionMode mode =
           (plane == kPlaneY)
@@ -1337,37 +1632,17 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
           (sub_block_column4x4 >> subsampling_x) + step_x + 1;
       const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
       const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
-      const bool has_left =
-          x > 0 || (plane == kPlaneY ? block.left_available
-                                     : block.LeftAvailableChroma());
-      const bool has_top =
-          y > 0 ||
-          (plane == kPlaneY ? block.top_available : block.TopAvailableChroma());
-      if (sequence_header_.color_config.bitdepth == 8) {
-        IntraPrediction<uint8_t>(
-            block, plane, start_x, start_y, has_left, has_top,
-            block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
-            block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
-            mode, tx_size);
-#if LIBGAV1_MAX_BITDEPTH >= 10
-      } else {
-        IntraPrediction<uint16_t>(
-            block, plane, start_x, start_y, has_left, has_top,
-            block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
-            block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
-            mode, tx_size);
-#endif
-      }
+      const bool has_left = x > 0 || block.left_available[plane];
+      const bool has_top = y > 0 || block.top_available[plane];
+
+      CALL_BITDEPTH_FUNCTION(
+          IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
+          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+          mode, tx_size);
       if (plane != kPlaneY && bp.uv_mode == kPredictionModeChromaFromLuma) {
-        if (sequence_header_.color_config.bitdepth == 8) {
-          ChromaFromLumaPrediction<uint8_t>(block, plane, start_x, start_y,
-                                            tx_size);
-#if LIBGAV1_MAX_BITDEPTH >= 10
-        } else {
-          ChromaFromLumaPrediction<uint16_t>(block, plane, start_x, start_y,
-                                             tx_size);
-#endif
-        }
+        CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
+                               start_y, tx_size);
       }
     }
     if (plane == kPlaneY) {
@@ -1381,34 +1656,35 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
   if (!bp.skip) {
     const int sb_row_index = SuperBlockRowIndex(block.row4x4);
     const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
-    switch (mode) {
-      case kProcessingModeParseAndDecode: {
-        TransformType tx_type;
-        const int16_t non_zero_coeff_count = ReadTransformCoefficients(
+    if (mode == kProcessingModeDecodeOnly) {
+      TransformParameterQueue& tx_params =
+          *residual_buffer_threaded_[sb_row_index][sb_column_index]
+               ->transform_parameters();
+      ReconstructBlock(block, plane, start_x, start_y, tx_size,
+                       tx_params.Type(), tx_params.NonZeroCoeffCount());
+      tx_params.Pop();
+    } else {
+      TransformType tx_type;
+      int non_zero_coeff_count;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      if (sequence_header_.color_config.bitdepth > 8) {
+        non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
             block, plane, start_x, start_y, tx_size, &tx_type);
-        if (non_zero_coeff_count < 0) return false;
+      } else  // NOLINT
+#endif
+      {
+        non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
+            block, plane, start_x, start_y, tx_size, &tx_type);
+      }
+      if (non_zero_coeff_count < 0) return false;
+      if (mode == kProcessingModeParseAndDecode) {
         ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
                          non_zero_coeff_count);
-        break;
-      }
-      case kProcessingModeParseOnly: {
-        TransformType tx_type;
-        const int16_t non_zero_coeff_count = ReadTransformCoefficients(
-            block, plane, start_x, start_y, tx_size, &tx_type);
-        if (non_zero_coeff_count < 0) return false;
+      } else {
+        assert(mode == kProcessingModeParseOnly);
         residual_buffer_threaded_[sb_row_index][sb_column_index]
             ->transform_parameters()
             ->Push(non_zero_coeff_count, tx_type);
-        break;
-      }
-      case kProcessingModeDecodeOnly: {
-        TransformParameterQueue& tx_params =
-            *residual_buffer_threaded_[sb_row_index][sb_column_index]
-                 ->transform_parameters();
-        ReconstructBlock(block, plane, start_x, start_y, tx_size,
-                         tx_params.Type(), tx_params.NonZeroCoeffCount());
-        tx_params.Pop();
-        break;
       }
     }
   }
@@ -1417,11 +1693,8 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
         &block.scratch_buffer
              ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
                             [(sub_block_column4x4 >> subsampling_x) + 1];
-    for (int i = 0; i < step_y; ++i) {
-      static_assert(sizeof(bool) == 1, "");
-      memset(block_decoded, 1, step_x);
-      block_decoded += DecoderScratchBuffer::kBlockDecodedStride;
-    }
+    SetBlockValues<bool>(step_y, step_x, true, block_decoded,
+                         TileScratchBuffer::kBlockDecodedStride);
   }
   return true;
 }
@@ -1437,7 +1710,7 @@ bool Tile::TransformTree(const Block& block, int start_x, int start_y,
   stack.Push(TransformTreeNode(start_x, start_y,
                                static_cast<TransformSize>(plane_size)));
 
-  while (!stack.Empty()) {
+  do {
     TransformTreeNode node = stack.Pop();
     const int row = DivideBy4(node.y);
     const int column = DivideBy4(node.x);
@@ -1479,24 +1752,18 @@ bool Tile::TransformTree(const Block& block, int start_x, int start_y,
     stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
     stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
     stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
-  }
+  } while (!stack.Empty());
   return true;
 }
 
 void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
                             int start_y, TransformSize tx_size,
-                            TransformType tx_type,
-                            int16_t non_zero_coeff_count) {
+                            TransformType tx_type, int non_zero_coeff_count) {
+  // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
   assert(non_zero_coeff_count >= 0);
   if (non_zero_coeff_count == 0) return;
-  // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
-  if (sequence_header_.color_config.bitdepth == 8) {
-    Reconstruct(dsp_, tx_type, tx_size,
-                frame_header_.segmentation.lossless[block.bp->segment_id],
-                reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
-                &buffer_[plane], non_zero_coeff_count);
 #if LIBGAV1_MAX_BITDEPTH >= 10
-  } else {
+  if (sequence_header_.color_config.bitdepth > 8) {
     Array2DView<uint16_t> buffer(
         buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
         reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
@@ -1504,7 +1771,13 @@ void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
                 frame_header_.segmentation.lossless[block.bp->segment_id],
                 reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
                 &buffer, non_zero_coeff_count);
+  } else  // NOLINT
 #endif
+  {
+    Reconstruct(dsp_, tx_type, tx_size,
+                frame_header_.segmentation.lossless[block.bp->segment_id],
+                reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
+                &buffer_[plane], non_zero_coeff_count);
   }
   if (split_parse_and_decode_) {
     *block.residual +=
@@ -1513,8 +1786,8 @@ void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
 }
 
 bool Tile::Residual(const Block& block, ProcessingMode mode) {
-  const int width_chunks = std::max(1, kBlockWidthPixels[block.size] >> 6);
-  const int height_chunks = std::max(1, kBlockHeightPixels[block.size] >> 6);
+  const int width_chunks = std::max(1, block.width >> 6);
+  const int height_chunks = std::max(1, block.height >> 6);
   const BlockSize size_chunk4x4 =
       (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
   const BlockParameters& bp = *block.bp;
@@ -1574,7 +1847,7 @@ bool Tile::Residual(const Block& block, ProcessingMode mode) {
 bool Tile::IsMvValid(const Block& block, bool is_compound) const {
   const BlockParameters& bp = *block.bp;
   for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
-    for (int mv_component : bp.mv[i].mv) {
+    for (int mv_component : bp.mv.mv[i].mv) {
       if (std::abs(mv_component) >= (1 << 14)) {
         return false;
       }
@@ -1583,22 +1856,20 @@ bool Tile::IsMvValid(const Block& block, bool is_compound) const {
   if (!block.bp->prediction_parameters->use_intra_block_copy) {
     return true;
   }
-  const int block_width = kBlockWidthPixels[block.size];
-  const int block_height = kBlockHeightPixels[block.size];
-  if ((bp.mv[0].mv[0] & 7) != 0 || (bp.mv[0].mv[1] & 7) != 0) {
+  if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
     return false;
   }
-  const int delta_row = bp.mv[0].mv[0] >> 3;
-  const int delta_column = bp.mv[0].mv[1] >> 3;
+  const int delta_row = bp.mv.mv[0].mv[0] >> 3;
+  const int delta_column = bp.mv.mv[0].mv[1] >> 3;
   int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
   int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
-  const int src_bottom_edge = src_top_edge + block_height;
-  const int src_right_edge = src_left_edge + block_width;
+  const int src_bottom_edge = src_top_edge + block.height;
+  const int src_right_edge = src_left_edge + block.width;
   if (block.HasChroma()) {
-    if (block_width < 8 && subsampling_x_[kPlaneU] != 0) {
+    if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
       src_left_edge -= 4;
     }
-    if (block_height < 8 && subsampling_y_[kPlaneU] != 0) {
+    if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
       src_top_edge -= 4;
     }
   }
@@ -1636,58 +1907,102 @@ bool Tile::IsMvValid(const Block& block, bool is_compound) const {
                                       wavefront_offset;
 }
 
-bool Tile::AssignMv(const Block& block, bool is_compound) {
-  MotionVector predicted_mv[2] = {};
+bool Tile::AssignInterMv(const Block& block, bool is_compound) {
+  int min[2];  // Per-component lower clamp bounds (see the Clip3() calls).
+  int max[2];  // Per-component upper clamp bounds (see the Clip3() calls).
+  GetClampParameters(block, min, max);
   BlockParameters& bp = *block.bp;
-  for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
-    const PredictionParameters& prediction_parameters =
-        *block.bp->prediction_parameters;
-    const PredictionMode mode = prediction_parameters.use_intra_block_copy
-                                    ? kPredictionModeNewMv
-                                    : GetSinglePredictionMode(i, bp.y_mode);
-    if (prediction_parameters.use_intra_block_copy) {
-      predicted_mv[0] = prediction_parameters.ref_mv_stack[0].mv[0];
-      if (predicted_mv[0].mv[0] == 0 && predicted_mv[0].mv[1] == 0) {
-        predicted_mv[0] = prediction_parameters.ref_mv_stack[1].mv[0];
-      }
-      if (predicted_mv[0].mv[0] == 0 && predicted_mv[0].mv[1] == 0) {
-        const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
-        if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
-          predicted_mv[0].mv[1] = -MultiplyBy8(
-              MultiplyBy4(super_block_size4x4) + kIntraBlockCopyDelayPixels);
-        } else {
-          predicted_mv[0].mv[0] = -MultiplyBy32(super_block_size4x4);
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  if (is_compound) {  // Assigns both bp.mv.mv[0] and bp.mv.mv[1].
+    for (int i = 0; i < 2; ++i) {
+      const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
+      MotionVector predicted_mv;
+      if (mode == kPredictionModeGlobalMv) {
+        predicted_mv = prediction_parameters.global_mv[i];
+      } else {
+        const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+                                  (mode == kPredictionModeNewMv &&
+                                   prediction_parameters.ref_mv_count <= 1))
+                                     ? 0
+                                     : prediction_parameters.ref_mv_index;
+        predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
+        if (ref_mv_index < prediction_parameters.ref_mv_count) {
+          predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+          predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
         }
       }
-    } else if (mode == kPredictionModeGlobalMv) {
-      predicted_mv[i] = prediction_parameters.global_mv[i];
+      if (mode == kPredictionModeNewMv) {
+        ReadMotionVector(block, i);
+        bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
+        bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
+      } else {
+        bp.mv.mv[i] = predicted_mv;
+      }
+    }
+  } else {  // Non-compound: only bp.mv.mv[0] is assigned.
+    const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
+    MotionVector predicted_mv;
+    if (mode == kPredictionModeGlobalMv) {
+      predicted_mv = prediction_parameters.global_mv[0];
     } else {
       const int ref_mv_index = (mode == kPredictionModeNearestMv ||
                                 (mode == kPredictionModeNewMv &&
                                  prediction_parameters.ref_mv_count <= 1))
                                    ? 0
                                    : prediction_parameters.ref_mv_index;
-      predicted_mv[i] = prediction_parameters.ref_mv_stack[ref_mv_index].mv[i];
+      predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
+      if (ref_mv_index < prediction_parameters.ref_mv_count) {
+        predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+        predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+      }
     }
     if (mode == kPredictionModeNewMv) {
-      ReadMotionVector(block, i);
-      bp.mv[i].mv[0] += predicted_mv[i].mv[0];
-      bp.mv[i].mv[1] += predicted_mv[i].mv[1];
+      ReadMotionVector(block, 0);
+      bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
+      bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
     } else {
-      bp.mv[i] = predicted_mv[i];
+      bp.mv.mv[0] = predicted_mv;
     }
   }
   return IsMvValid(block, is_compound);
 }
 
+bool Tile::AssignIntraMv(const Block& block) {
+  // Adds an MV predictor to bp.mv.mv[0] after ReadMotionVector(); validates it.
+  // TODO(linfengz): Check if the clamping process is necessary.
+  int min[2];
+  int max[2];
+  GetClampParameters(block, min, max);
+  BlockParameters& bp = *block.bp;
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+  ReadMotionVector(block, 0);
+  // Use reference MV 0 unless it is zero, in which case try reference MV 1.
+  const MotionVector& ref_mv =
+      (ref_mv_0.mv32 != 0) ? ref_mv_0 : prediction_parameters.reference_mv(1);
+  if (ref_mv.mv32 != 0) {
+    // Clamp each component with its own bounds: mv[k] uses min[k]/max[k].
+    bp.mv.mv[0].mv[0] += Clip3(ref_mv.mv[0], min[0], max[0]);
+    bp.mv.mv[0].mv[1] += Clip3(ref_mv.mv[1], min[1], max[1]);
+  } else {
+    // Both candidates are zero: apply a fixed offset instead of a predictor.
+    const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
+    if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
+      bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
+      bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
+    } else {
+      bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
+    }
+  }
+  return IsMvValid(block, /*is_compound=*/false);
+}
+
 void Tile::ResetEntropyContext(const Block& block) {
-  const int block_width4x4 = kNum4x4BlocksWide[block.size];
-  const int block_height4x4 = kNum4x4BlocksHigh[block.size];
   for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1); ++plane) {
     const int subsampling_x = subsampling_x_[plane];
     const int start_x = block.column4x4 >> subsampling_x;
     const int end_x =
-        std::min((block.column4x4 + block_width4x4) >> subsampling_x,
+        std::min((block.column4x4 + block.width4x4) >> subsampling_x,
                  frame_header_.columns4x4);
     memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
            end_x - start_x);
@@ -1696,7 +2011,7 @@ void Tile::ResetEntropyContext(const Block& block) {
     const int subsampling_y = subsampling_y_[plane];
     const int start_y = block.row4x4 >> subsampling_y;
     const int end_y =
-        std::min((block.row4x4 + block_height4x4) >> subsampling_y,
+        std::min((block.row4x4 + block.height4x4) >> subsampling_y,
                  frame_header_.rows4x4);
     memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
            end_y - start_y);
@@ -1705,12 +2020,15 @@ void Tile::ResetEntropyContext(const Block& block) {
   }
 }
 
-void Tile::ComputePrediction(const Block& block) {
+bool Tile::ComputePrediction(const Block& block) {
+  const BlockParameters& bp = *block.bp;
+  if (!bp.is_inter) return true;
   const int mask =
       (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
       1;
   const int sub_block_row4x4 = block.row4x4 & mask;
   const int sub_block_column4x4 = block.column4x4 & mask;
+  const int plane_count = block.HasChroma() ? PlaneCount() : 1;
   // Returns true if this block applies local warping. The state is determined
   // in the Y plane and carried for use in the U/V planes.
   // But the U/V planes will not apply warping when the block size is smaller
@@ -1718,20 +2036,19 @@ void Tile::ComputePrediction(const Block& block) {
   bool is_local_valid = false;
   // Local warping parameters, similar usage as is_local_valid.
   GlobalMotion local_warp_params;
-  for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1); ++plane) {
+  int plane = 0;
+  do {
     const int8_t subsampling_x = subsampling_x_[plane];
     const int8_t subsampling_y = subsampling_y_[plane];
-    const BlockSize plane_size =
-        block.residual_size[GetPlaneType(static_cast<Plane>(plane))];
+    const BlockSize plane_size = block.residual_size[plane];
     const int block_width4x4 = kNum4x4BlocksWide[plane_size];
     const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
     const int block_width = MultiplyBy4(block_width4x4);
     const int block_height = MultiplyBy4(block_height4x4);
     const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
     const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
-    const BlockParameters& bp = *block.bp;
-    if (bp.is_inter && bp.reference_frame[1] == kReferenceFrameIntra) {
-      const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
+    if (bp.reference_frame[1] == kReferenceFrameIntra) {
+      const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
       const int tr_column4x4 =
           (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
       const int bl_row4x4 =
@@ -1740,88 +2057,98 @@ void Tile::ComputePrediction(const Block& block) {
       const TransformSize tx_size =
           k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
                                  [k4x4HeightLog2[plane_size]];
-      const bool has_left =
-          plane == kPlaneY ? block.left_available : block.LeftAvailableChroma();
-      const bool has_top =
-          plane == kPlaneY ? block.top_available : block.TopAvailableChroma();
-      if (sequence_header_.color_config.bitdepth == 8) {
-        IntraPrediction<uint8_t>(
-            block, static_cast<Plane>(plane), base_x, base_y, has_left, has_top,
-            block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
-            block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
-            kInterIntraToIntraMode[block.bp->prediction_parameters
-                                       ->inter_intra_mode],
-            tx_size);
-#if LIBGAV1_MAX_BITDEPTH >= 10
-      } else {
-        IntraPrediction<uint16_t>(
-            block, static_cast<Plane>(plane), base_x, base_y, has_left, has_top,
-            block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
-            block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
-            kInterIntraToIntraMode[block.bp->prediction_parameters
-                                       ->inter_intra_mode],
-            tx_size);
-#endif
-      }
+      const bool has_left = block.left_available[plane];
+      const bool has_top = block.top_available[plane];
+      CALL_BITDEPTH_FUNCTION(
+          IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
+          has_left, has_top,
+          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+          kInterIntraToIntraMode[block.bp->prediction_parameters
+                                     ->inter_intra_mode],
+          tx_size);
     }
-    if (bp.is_inter) {
-      int candidate_row = (block.row4x4 >> subsampling_y) << subsampling_y;
-      int candidate_column = (block.column4x4 >> subsampling_x)
-                             << subsampling_x;
-      bool some_use_intra = false;
-      for (int r = 0; r < (block_height4x4 << subsampling_y); ++r) {
-        for (int c = 0; c < (block_width4x4 << subsampling_x); ++c) {
-          auto* const bp = block_parameters_holder_.Find(candidate_row + r,
-                                                         candidate_column + c);
-          if (bp != nullptr && bp->reference_frame[0] == kReferenceFrameIntra) {
-            some_use_intra = true;
-            break;
-          }
+    int candidate_row = block.row4x4;
+    int candidate_column = block.column4x4;
+    bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
+    if (!some_use_intra && plane != 0) {
+      candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
+      candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
+      if (candidate_row != block.row4x4) {
+        // Top block.
+        const BlockParameters& bp_top =
+            *block_parameters_holder_.Find(candidate_row, block.column4x4);
+        some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
+        if (!some_use_intra && candidate_column != block.column4x4) {
+          // Top-left block.
+          const BlockParameters& bp_top_left =
+              *block_parameters_holder_.Find(candidate_row, candidate_column);
+          some_use_intra =
+              bp_top_left.reference_frame[0] == kReferenceFrameIntra;
         }
-        if (some_use_intra) break;
-      }
-      int prediction_width;
-      int prediction_height;
-      if (some_use_intra) {
-        candidate_row = block.row4x4;
-        candidate_column = block.column4x4;
-        prediction_width = block_width;
-        prediction_height = block_height;
-      } else {
-        prediction_width = kBlockWidthPixels[block.size] >> subsampling_x;
-        prediction_height = kBlockHeightPixels[block.size] >> subsampling_y;
       }
-      for (int r = 0, y = 0; y < block_height; y += prediction_height, ++r) {
-        for (int c = 0, x = 0; x < block_width; x += prediction_width, ++c) {
-          InterPrediction(block, static_cast<Plane>(plane), base_x + x,
-                          base_y + y, prediction_width, prediction_height,
-                          candidate_row + r, candidate_column + c,
-                          &is_local_valid, &local_warp_params);
-        }
+      if (!some_use_intra && candidate_column != block.column4x4) {
+        // Left block.
+        const BlockParameters& bp_left =
+            *block_parameters_holder_.Find(block.row4x4, candidate_column);
+        some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
       }
     }
-  }
+    int prediction_width;
+    int prediction_height;
+    if (some_use_intra) {
+      candidate_row = block.row4x4;
+      candidate_column = block.column4x4;
+      prediction_width = block_width;
+      prediction_height = block_height;
+    } else {
+      prediction_width = block.width >> subsampling_x;
+      prediction_height = block.height >> subsampling_y;
+    }
+    int r = 0;
+    int y = 0;
+    do {
+      int c = 0;
+      int x = 0;
+      do {
+        if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
+                             base_y + y, prediction_width, prediction_height,
+                             candidate_row + r, candidate_column + c,
+                             &is_local_valid, &local_warp_params)) {
+          return false;
+        }
+        ++c;
+        x += prediction_width;
+      } while (x < block_width);
+      ++r;
+      y += prediction_height;
+    } while (y < block_height);
+  } while (++plane < plane_count);
+  return true;
 }
 
+#undef CALL_BITDEPTH_FUNCTION
+
 void Tile::PopulateDeblockFilterLevel(const Block& block) {
   if (!post_filter_.DoDeblock()) return;
   BlockParameters& bp = *block.bp;
+  const int mode_id =  // Same for every |i|, so it is computed once here.
+      static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
   for (int i = 0; i < kFrameLfCount; ++i) {
     if (delta_lf_all_zero_) {
       bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
-          bp.segment_id, i, bp.reference_frame[0],
-          LoopFilterMask::GetModeId(bp.y_mode));
+          bp.segment_id, i, bp.reference_frame[0], mode_id);
     } else {
       bp.deblock_filter_level[i] =
           deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]]
-                                [LoopFilterMask::GetModeId(bp.y_mode)];
+                                [mode_id];
     }
   }
 }
 
 bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
                         ParameterTree* const tree,
-                        DecoderScratchBuffer* const scratch_buffer,
+                        TileScratchBuffer* const scratch_buffer,
                         ResidualPtr* residual) {
   // Do not process the block if the starting point is beyond the visible frame.
   // This is equivalent to the has_row/has_column check in the
@@ -1831,34 +2158,34 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
       column4x4 >= frame_header_.columns4x4) {
     return true;
   }
-  Block block(*this, row4x4, column4x4, block_size, scratch_buffer, residual,
-              tree->parameters());
-  block.bp->size = block_size;
-  block_parameters_holder_.FillCache(row4x4, column4x4, block_size,
-                                     tree->parameters());
-  block.bp->prediction_parameters =
+  BlockParameters& bp = *tree->parameters();
+  block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp);
+  Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  bp.size = block_size;
+  bp.prediction_parameters =
       split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
                                     new (std::nothrow) PredictionParameters())
                               : std::move(prediction_parameters_);
-  if (block.bp->prediction_parameters == nullptr) return false;
+  if (bp.prediction_parameters == nullptr) return false;
   if (!DecodeModeInfo(block)) return false;
+  bp.is_global_mv_block = (bp.y_mode == kPredictionModeGlobalMv ||
+                           bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+                          !IsBlockDimension4(bp.size);
   PopulateDeblockFilterLevel(block);
   if (!ReadPaletteTokens(block)) return false;
   DecodeTransformSize(block);
-  BlockParameters& bp = *block.bp;
   // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
-  bp.uv_transform_size =
-      frame_header_.segmentation.lossless[bp.segment_id]
-          ? kTransformSize4x4
-          : kUVTransformSize[block.residual_size[kPlaneTypeUV]];
+  bp.uv_transform_size = frame_header_.segmentation.lossless[bp.segment_id]
+                             ? kTransformSize4x4
+                             : kUVTransformSize[block.residual_size[kPlaneU]];
   if (bp.skip) ResetEntropyContext(block);
-  const int block_width4x4 = kNum4x4BlocksWide[block_size];
-  const int block_height4x4 = kNum4x4BlocksHigh[block_size];
   if (split_parse_and_decode_) {
     if (!Residual(block, kProcessingModeParseOnly)) return false;
   } else {
-    ComputePrediction(block);
-    if (!Residual(block, kProcessingModeParseAndDecode)) return false;
+    if (!ComputePrediction(block) ||
+        !Residual(block, kProcessingModeParseAndDecode)) {
+      return false;
+    }
   }
   // If frame_header_.segmentation.enabled is false, bp.segment_id is 0 for all
   // blocks. We don't need to call save bp.segment_id in the current frame
@@ -1870,25 +2197,22 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
   // save bp.segment_id in the current frame.
   if (frame_header_.segmentation.enabled &&
       frame_header_.segmentation.update_map) {
-    const int x_limit =
-        std::min(frame_header_.columns4x4 - column4x4, block_width4x4);
-    const int y_limit =
-        std::min(frame_header_.rows4x4 - row4x4, block_height4x4);
+    const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
+                                 static_cast<int>(block.width4x4));
+    const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
+                                 static_cast<int>(block.height4x4));
     current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit,
                                                  y_limit, bp.segment_id);
   }
-  if (build_bit_mask_when_parsing_ || !split_parse_and_decode_) {
-    BuildBitMask(row4x4, column4x4, block_size);
-  }
+  StoreMotionFieldMvsIntoCurrentFrame(block);
   if (!split_parse_and_decode_) {
-    StoreMotionFieldMvsIntoCurrentFrame(block);
-    prediction_parameters_ = std::move(block.bp->prediction_parameters);
+    prediction_parameters_ = std::move(bp.prediction_parameters);
   }
   return true;
 }
 
 bool Tile::DecodeBlock(ParameterTree* const tree,
-                       DecoderScratchBuffer* const scratch_buffer,
+                       TileScratchBuffer* const scratch_buffer,
                        ResidualPtr* residual) {
   const int row4x4 = tree->row4x4();
   const int column4x4 = tree->column4x4();
@@ -1897,21 +2221,18 @@ bool Tile::DecodeBlock(ParameterTree* const tree,
     return true;
   }
   const BlockSize block_size = tree->block_size();
-  Block block(*this, row4x4, column4x4, block_size, scratch_buffer, residual,
-              tree->parameters());
-  ComputePrediction(block);
-  if (!Residual(block, kProcessingModeDecodeOnly)) return false;
-  if (!build_bit_mask_when_parsing_) {
-    BuildBitMask(row4x4, column4x4, block_size);
+  Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  if (!ComputePrediction(block) ||
+      !Residual(block, kProcessingModeDecodeOnly)) {
+    return false;
   }
-  StoreMotionFieldMvsIntoCurrentFrame(block);
   block.bp->prediction_parameters.reset(nullptr);
   return true;
 }
 
 bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
                             ParameterTree* const root,
-                            DecoderScratchBuffer* const scratch_buffer,
+                            TileScratchBuffer* const scratch_buffer,
                             ResidualPtr* residual) {
   Stack<ParameterTree*, kDfsStackSize> stack;
 
@@ -2025,7 +2346,7 @@ void Tile::ResetLoopRestorationParams() {
 }
 
 void Tile::ResetCdef(const int row4x4, const int column4x4) {
-  if (cdef_index_[0] == nullptr) return;
+  if (!sequence_header_.enable_cdef) return;
   const int row = DivideBy16(row4x4);
   const int column = DivideBy16(column4x4);
   cdef_index_[row][column] = -1;
@@ -2039,7 +2360,7 @@ void Tile::ResetCdef(const int row4x4, const int column4x4) {
   }
 }
 
-void Tile::ClearBlockDecoded(DecoderScratchBuffer* const scratch_buffer,
+void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
                              int row4x4, int column4x4) {
   // Set everything to false.
   memset(scratch_buffer->block_decoded, 0,
@@ -2075,7 +2396,7 @@ void Tile::ClearBlockDecoded(DecoderScratchBuffer* const scratch_buffer,
 }
 
 bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
-                             DecoderScratchBuffer* const scratch_buffer,
+                             TileScratchBuffer* const scratch_buffer,
                              ProcessingMode mode) {
   const bool parsing =
       mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
@@ -2139,11 +2460,11 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
 }
 
 bool Tile::DecodeSuperBlock(ParameterTree* const tree,
-                            DecoderScratchBuffer* const scratch_buffer,
+                            TileScratchBuffer* const scratch_buffer,
                             ResidualPtr* residual) {
   Stack<ParameterTree*, kDfsStackSize> stack;
   stack.Push(tree);
-  while (!stack.Empty()) {
+  do {
     ParameterTree* const node = stack.Pop();
     if (node->partition() != kPartitionNone) {
       for (int i = 3; i >= 0; --i) {
@@ -2157,7 +2478,7 @@ bool Tile::DecodeSuperBlock(ParameterTree* const tree,
                    node->row4x4(), node->column4x4());
       return false;
     }
-  }
+  } while (!stack.Empty());
   return true;
 }
 
@@ -2189,222 +2510,87 @@ void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
   }
 }
 
-void Tile::BuildBitMask(int row4x4, int column4x4, BlockSize block_size) {
-  if (!post_filter_.DoDeblock()) return;
-  if (block_size <= kBlock64x64) {
-    BuildBitMaskHelper(row4x4, column4x4, block_size, true, true);
-  } else {
-    const int block_width4x4 = kNum4x4BlocksWide[block_size];
-    const int block_height4x4 = kNum4x4BlocksHigh[block_size];
-    for (int y = 0; y < block_height4x4; y += 16) {
-      for (int x = 0; x < block_width4x4; x += 16) {
-        BuildBitMaskHelper(row4x4 + y, column4x4 + x, kBlock64x64, x == 0,
-                           y == 0);
-      }
-    }
-  }
-}
-
-void Tile::BuildBitMaskHelper(int row4x4, int column4x4, BlockSize block_size,
-                              const bool is_vertical_block_border,
-                              const bool is_horizontal_block_border) {
-  const int block_width4x4 = kNum4x4BlocksWide[block_size];
-  const int block_height4x4 = kNum4x4BlocksHigh[block_size];
-  BlockParameters& bp = *block_parameters_holder_.Find(row4x4, column4x4);
-  const bool skip = bp.skip && bp.is_inter;
-  LoopFilterMask* const masks = post_filter_.masks();
-  const int unit_id = DivideBy16(row4x4) * masks->num_64x64_blocks_per_row() +
-                      DivideBy16(column4x4);
-
-  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
-    // For U and V planes, do not build bit masks if level == 0.
-    if (plane > kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) {
-      continue;
-    }
-    // Build bit mask for vertical edges.
-    const int subsampling_x = subsampling_x_[plane];
-    const int subsampling_y = subsampling_y_[plane];
-    const int plane_width =
-        RightShiftWithRounding(frame_header_.width, subsampling_x);
-    const int column_limit =
-        std::min({column4x4 + block_width4x4, frame_header_.columns4x4,
-                  DivideBy4(plane_width + 3) << subsampling_x});
-    const int plane_height =
-        RightShiftWithRounding(frame_header_.height, subsampling_y);
-    const int row_limit =
-        std::min({row4x4 + block_height4x4, frame_header_.rows4x4,
-                  DivideBy4(plane_height + 3) << subsampling_y});
-    const int row_start = GetDeblockPosition(row4x4, subsampling_y);
-    const int column_start = GetDeblockPosition(column4x4, subsampling_x);
-    if (row_start >= row_limit || column_start >= column_limit) {
-      continue;
-    }
-    const int vertical_step = 1 << subsampling_y;
-    const int horizontal_step = 1 << subsampling_x;
-    const BlockParameters& bp =
-        *block_parameters_holder_.Find(row_start, column_start);
-    const int horizontal_level_index =
-        kDeblockFilterLevelIndex[plane][kLoopFilterTypeHorizontal];
-    const int vertical_level_index =
-        kDeblockFilterLevelIndex[plane][kLoopFilterTypeVertical];
-    const uint8_t vertical_level =
-        bp.deblock_filter_level[vertical_level_index];
-
-    for (int row = row_start; row < row_limit; row += vertical_step) {
-      for (int column = column_start; column < column_limit;) {
-        const TransformSize tx_size = (plane == kPlaneY)
-                                          ? inter_transform_sizes_[row][column]
-                                          : bp.uv_transform_size;
-        // (1). Don't filter frame boundary.
-        // (2). For tile boundary, we don't know whether the previous tile is
-        // available or not, thus we handle it after all tiles are decoded.
-        const bool is_vertical_border =
-            (column == column_start) && is_vertical_block_border;
-        if (column == GetDeblockPosition(column4x4_start_, subsampling_x) ||
-            (skip && !is_vertical_border)) {
-          column += kNum4x4BlocksWide[tx_size] << subsampling_x;
-          continue;
-        }
-
-        // bp_left is the parameter of the left prediction block which
-        // is guaranteed to be inside the tile.
-        const BlockParameters& bp_left =
-            *block_parameters_holder_.Find(row, column - horizontal_step);
-        const uint8_t left_level =
-            is_vertical_border
-                ? bp_left.deblock_filter_level[vertical_level_index]
-                : vertical_level;
-        // We don't have to check if the left block is skipped or not,
-        // because if the current transform block is on the edge of the coding
-        // block, is_vertical_border is true; if it's not on the edge,
-        // left skip is equal to skip.
-        if (vertical_level != 0 || left_level != 0) {
-          const TransformSize left_tx_size =
-              (plane == kPlaneY)
-                  ? inter_transform_sizes_[row][column - horizontal_step]
-                  : bp_left.uv_transform_size;
-          const LoopFilterTransformSizeId transform_size_id =
-              GetTransformSizeIdWidth(tx_size, left_tx_size);
-          const int r = row & (kNum4x4InLoopFilterMaskUnit - 1);
-          const int c = column & (kNum4x4InLoopFilterMaskUnit - 1);
-          const int shift = LoopFilterMask::GetShift(r, c);
-          const int index = LoopFilterMask::GetIndex(r);
-          const auto mask = static_cast<uint64_t>(1) << shift;
-          masks->SetLeft(mask, unit_id, plane, transform_size_id, index);
-          const uint8_t current_level =
-              (vertical_level == 0) ? left_level : vertical_level;
-          masks->SetLevel(current_level, unit_id, plane,
-                          kLoopFilterTypeVertical,
-                          LoopFilterMask::GetLevelOffset(r, c));
-        }
-        column += kNum4x4BlocksWide[tx_size] << subsampling_x;
-      }
-    }
-
-    // Build bit mask for horizontal edges.
-    const uint8_t horizontal_level =
-        bp.deblock_filter_level[horizontal_level_index];
-    for (int column = column_start; column < column_limit;
-         column += horizontal_step) {
-      for (int row = row_start; row < row_limit;) {
-        const TransformSize tx_size = (plane == kPlaneY)
-                                          ? inter_transform_sizes_[row][column]
-                                          : bp.uv_transform_size;
-
-        // (1). Don't filter frame boundary.
-        // (2). For tile boundary, we don't know whether the previous tile is
-        // available or not, thus we handle it after all tiles are decoded.
-        const bool is_horizontal_border =
-            (row == row_start) && is_horizontal_block_border;
-        if (row == GetDeblockPosition(row4x4_start_, subsampling_y) ||
-            (skip && !is_horizontal_border)) {
-          row += kNum4x4BlocksHigh[tx_size] << subsampling_y;
-          continue;
-        }
-
-        // bp_top is the parameter of the top prediction block which is
-        // guaranteed to be inside the tile.
-        const BlockParameters& bp_top =
-            *block_parameters_holder_.Find(row - vertical_step, column);
-        const uint8_t top_level =
-            is_horizontal_border
-                ? bp_top.deblock_filter_level[horizontal_level_index]
-                : horizontal_level;
-        // We don't have to check it the top block is skippped or not,
-        // because if the current transform block is on the edge of the coding
-        // block, is_horizontal_border is true; if it's not on the edge,
-        // top skip is equal to skip.
-        if (horizontal_level != 0 || top_level != 0) {
-          const TransformSize top_tx_size =
-              (plane == kPlaneY)
-                  ? inter_transform_sizes_[row - vertical_step][column]
-                  : bp_top.uv_transform_size;
-          const LoopFilterTransformSizeId transform_size_id =
-              static_cast<LoopFilterTransformSizeId>(
-                  std::min({kTransformHeightLog2[tx_size] - 2,
-                            kTransformHeightLog2[top_tx_size] - 2, 2}));
-          const int r = row & (kNum4x4InLoopFilterMaskUnit - 1);
-          const int c = column & (kNum4x4InLoopFilterMaskUnit - 1);
-          const int shift = LoopFilterMask::GetShift(r, c);
-          const int index = LoopFilterMask::GetIndex(r);
-          const auto mask = static_cast<uint64_t>(1) << shift;
-          masks->SetTop(mask, unit_id, plane, transform_size_id, index);
-          const uint8_t current_level =
-              (horizontal_level == 0) ? top_level : horizontal_level;
-          masks->SetLevel(current_level, unit_id, plane,
-                          kLoopFilterTypeHorizontal,
-                          LoopFilterMask::GetLevelOffset(r, c));
-        }
-        row += kNum4x4BlocksHigh[tx_size] << subsampling_y;
-      }
-    }
+void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
+  if (frame_header_.refresh_frame_flags == 0 ||
+      IsIntraFrame(frame_header_.frame_type)) {
+    return;
   }
-}
+  // Iterate over odd rows/columns beginning at the first odd row/column for the
+  // block. It is done this way because motion field mvs are only needed at an
+  // 8x8 granularity.
+  const int row_start4x4 = block.row4x4 | 1;
+  const int row_limit4x4 =
+      std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+  if (row_start4x4 >= row_limit4x4) return;
+  const int column_start4x4 = block.column4x4 | 1;
+  const int column_limit4x4 =
+      std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+  if (column_start4x4 >= column_limit4x4) return;
 
-void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
   // The largest reference MV component that can be saved.
   constexpr int kRefMvsLimit = (1 << 12) - 1;
   const BlockParameters& bp = *block.bp;
-  ReferenceFrameType reference_frame_to_store = kReferenceFrameNone;
-  MotionVector mv_to_store = {};
+  ReferenceInfo* reference_info = current_frame_.reference_info();
   for (int i = 1; i >= 0; --i) {
-    if (bp.reference_frame[i] > kReferenceFrameIntra &&
-        std::abs(bp.mv[i].mv[MotionVector::kRow]) <= kRefMvsLimit &&
-        std::abs(bp.mv[i].mv[MotionVector::kColumn]) <= kRefMvsLimit &&
-        GetRelativeDistance(
-            reference_order_hint_
-                [frame_header_.reference_frame_index[bp.reference_frame[i] -
-                                                     kReferenceFrameLast]],
-            frame_header_.order_hint, sequence_header_.enable_order_hint,
-            sequence_header_.order_hint_bits) < 0) {
-      reference_frame_to_store = bp.reference_frame[i];
-      mv_to_store = bp.mv[i];
-      break;
-    }
-  }
-  // Iterate over odd rows/columns beginning at the first odd row/column for the
-  // block. It is done this way because motion field mvs are only needed at a
-  // 8x8 granularity.
-  const int row_start = block.row4x4 | 1;
-  const int row_limit = std::min(block.row4x4 + kNum4x4BlocksHigh[block.size],
-                                 frame_header_.rows4x4);
-  const int column_start = block.column4x4 | 1;
-  const int column_limit =
-      std::min(block.column4x4 + kNum4x4BlocksWide[block.size],
-               frame_header_.columns4x4);
-  for (int row = row_start; row < row_limit; row += 2) {
-    const int row_index = DivideBy2(row);
-    ReferenceFrameType* const reference_frame_row_start =
-        current_frame_.motion_field_reference_frame(row_index,
-                                                    DivideBy2(column_start));
-    static_assert(sizeof(reference_frame_to_store) == sizeof(int8_t), "");
-    memset(reference_frame_row_start, reference_frame_to_store,
-           DivideBy2(column_limit - column_start + 1));
-    if (reference_frame_to_store <= kReferenceFrameIntra) continue;
-    for (int column = column_start; column < column_limit; column += 2) {
+    const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
+    // Must make a local copy so that StoreMotionFieldMvs() knows there is no
+    // overlap between load and store.
+    const MotionVector mv_to_store = bp.mv.mv[i];
+    const int mv_row = std::abs(mv_to_store.mv[MotionVector::kRow]);
+    const int mv_column = std::abs(mv_to_store.mv[MotionVector::kColumn]);
+    if (reference_frame_to_store > kReferenceFrameIntra &&
+        // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two
+        // absolute values and then compare with kRefMvsLimit to save a branch.
+        // The next line is equivalent to:
+        // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
+        (mv_row | mv_column) <= kRefMvsLimit &&
+        reference_info->relative_distance_from[reference_frame_to_store] < 0) {
+      const int row_start8x8 = DivideBy2(row_start4x4);
+      const int row_limit8x8 = DivideBy2(row_limit4x4);
+      const int column_start8x8 = DivideBy2(column_start4x4);
+      const int column_limit8x8 = DivideBy2(column_limit4x4);
+      const int rows = row_limit8x8 - row_start8x8;
+      const int columns = column_limit8x8 - column_start8x8;
+      const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
+      ReferenceFrameType* const reference_frame_row_start =
+          &reference_info
+               ->motion_field_reference_frame[row_start8x8][column_start8x8];
       MotionVector* const mv =
-          current_frame_.motion_field_mv(row_index, DivideBy2(column));
-      *mv = mv_to_store;
+          &reference_info->motion_field_mv[row_start8x8][column_start8x8];
+
+      // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined
+      // and simplifies std::fill() for these cases.
+      if (columns <= 1) {
+        // Don't change the above condition to (columns == 1).
+        // Condition (columns <= 1) may help the compiler simplify the inlining
+        // of the general case of StoreMotionFieldMvs() by eliminating the
+        // (columns == 0) case.
+        assert(columns == 1);
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            1, reference_frame_row_start, mv);
+      } else if (columns == 2) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            2, reference_frame_row_start, mv);
+      } else if (columns == 4) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            4, reference_frame_row_start, mv);
+      } else if (columns == 8) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            8, reference_frame_row_start, mv);
+      } else if (columns == 16) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            16, reference_frame_row_start, mv);
+      } else if (columns < 16) {
+        // This always-true condition (columns < 16) may help the compiler
+        // simplify the inlining of the following function.
+        // This general case is rare and usually only happens to the blocks
+        // which contain the right boundary of the frame.
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            columns, reference_frame_row_start, mv);
+      } else {
+        assert(false);
+      }
+      return;
     }
   }
 }