Snap for 8821814 from eee6aacd0da152622c837c25d87908eebece5f28 to mainline-go-uwb-release aml_go_uwb_330912000 android13-mainline-go-uwb-release

Change-Id: I0ab584bd5d845b0072708bc6991954abdc7fd2af
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2022-07-12 16:58:33 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2022-07-12 16:58:33 +0000
commit: 3ffef2f45795a505468f8bfe0f7000c8276033ea (patch)
tree: c9445be70dbfa7de3e9a3d43358d10e5582e5097
parent: a9d9cdc6db5e02f60d4676b391b4599204d38278 (diff)
parent: eee6aacd0da152622c837c25d87908eebece5f28 (diff)
download: libgav1-android13-mainline-go-uwb-release.tar.gz
85 files changed, 6117 insertions, 3927 deletions
diff --git a/Android.bp b/Android.bp
index d47cb2b..ee4852d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -79,6 +79,7 @@ cc_library_static {
         "libgav1/src/dsp/arm/intrapred_smooth_neon.cc",
         "libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc",
         "libgav1/src/dsp/arm/inverse_transform_neon.cc",
+        "libgav1/src/dsp/arm/loop_filter_10bit_neon.cc",
         "libgav1/src/dsp/arm/loop_filter_neon.cc",
         "libgav1/src/dsp/arm/loop_restoration_10bit_neon.cc",
         "libgav1/src/dsp/arm/loop_restoration_neon.cc",
diff --git a/README.version b/README.version
index 53d5b62..13d4b14 100644
--- a/README.version
+++ b/README.version
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/codecs/libgav1
-Version: v0.17.0
+Version: v0.18.0
 BugComponent: 324837
 Local Modifications:
 None
diff --git a/libgav1/CMakeLists.txt b/libgav1/CMakeLists.txt
index 4029de1..52b1b32 100644
--- a/libgav1/CMakeLists.txt
+++ b/libgav1/CMakeLists.txt
@@ -48,6 +48,8 @@ libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
                VALUE ON)
 libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
                "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_EXAMPLES HELPSTRING "Enables examples." VALUE
+               ON)
 libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
 libgav1_option(
   NAME LIBGAV1_VERBOSE HELPSTRING
@@ -101,6 +103,12 @@ libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
 
 # Controls use of std::mutex and absl::Mutex in ThreadPool.
 libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+if((DEFINED
+    LIBGAV1_THREADPOOL_USE_STD_MUTEX
+    AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+   OR NOT (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX OR ANDROID OR IOS))
+  set(use_absl_threading TRUE)
+endif()
 
 if(LIBGAV1_VERBOSE)
   libgav1_dump_cmake_flag_variables()
@@ -124,18 +132,22 @@ endif()
 libgav1_set_test_flags()
 
 set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
-if(NOT EXISTS "${libgav1_abseil}")
-  message(
-    FATAL_ERROR
-      "Abseil not found. This dependency is required by the"
-      " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
-      " not defined. To continue, download the Abseil repository to"
-      " third_party/abseil-cpp:\n  git \\\n    -C ${libgav1_root} \\\n"
-      "    clone \\\n"
-      "    https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+if(EXISTS "${libgav1_abseil}")
+  set(ABSL_PROPAGATE_CXX_STD ON)
+  add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}"
+                   EXCLUDE_FROM_ALL)
+else()
+  if(use_absl_threading OR LIBGAV1_ENABLE_EXAMPLES OR LIBGAV1_ENABLE_TESTS)
+    message(
+      FATAL_ERROR
+        "Abseil not found. This dependency is required by the"
+        " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+        " not defined. To continue, download the Abseil repository to"
+        " third_party/abseil-cpp:\n  git \\\n    -C ${libgav1_root} \\\n"
+        "    clone \\\n"
+        "    https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+  endif()
 endif()
-set(ABSL_PROPAGATE_CXX_STD ON)
-add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
 
 libgav1_reset_target_lists()
 libgav1_add_dsp_targets()
diff --git a/libgav1/README.md b/libgav1/README.md
index 6744291..04c6a94 100644
--- a/libgav1/README.md
+++ b/libgav1/README.md
@@ -1,7 +1,7 @@
 # libgav1 -- an AV1 decoder
 
-libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More
-information on the AV1 video format can be found at
+libgav1 is a Main profile (0), High profile (1) & Professional profile (2)
+compliant AV1 decoder. More information on the AV1 video format can be found at
 [aomedia.org](https://aomedia.org).
 
 [TOC]
diff --git a/libgav1/cmake/libgav1_build_definitions.cmake b/libgav1/cmake/libgav1_build_definitions.cmake
index 0d00bb6..3885dcd 100644
--- a/libgav1/cmake/libgav1_build_definitions.cmake
+++ b/libgav1/cmake/libgav1_build_definitions.cmake
@@ -142,8 +142,10 @@ macro(libgav1_set_build_definitions)
 
   if(NOT LIBGAV1_MAX_BITDEPTH)
     set(LIBGAV1_MAX_BITDEPTH 10)
-  elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10)
-    libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.")
+  elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8
+         AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10
+         AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 12)
+    libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.")
   endif()
 
   list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
diff --git a/libgav1/cmake/libgav1_install.cmake b/libgav1/cmake/libgav1_install.cmake
index b7f6006..e2c79b9 100644
--- a/libgav1/cmake/libgav1_install.cmake
+++ b/libgav1/cmake/libgav1_install.cmake
@@ -48,8 +48,10 @@ macro(libgav1_setup_install_target)
       FILES ${libgav1_api_includes}
       DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
 
-    install(TARGETS gav1_decode DESTINATION
-                    "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+    if(LIBGAV1_ENABLE_EXAMPLES)
+      install(TARGETS gav1_decode DESTINATION
+                      "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+    endif()
     install(TARGETS libgav1_static DESTINATION
                     "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
     if(BUILD_SHARED_LIBS)
diff --git a/libgav1/cmake/toolchains/android.cmake b/libgav1/cmake/toolchains/android.cmake
index 492957b..b550397 100644
--- a/libgav1/cmake/toolchains/android.cmake
+++ b/libgav1/cmake/toolchains/android.cmake
@@ -30,9 +30,9 @@ if(NOT ANDROID_ABI)
   set(ANDROID_ABI arm64-v8a)
 endif()
 
-# Force arm mode for 32-bit targets (instead of the default thumb) to improve
-# performance.
-if(NOT ANDROID_ARM_MODE)
+# Force arm mode for 32-bit arm targets (instead of the default thumb) to
+# improve performance.
+if(ANDROID_ABI MATCHES "^armeabi" AND NOT ANDROID_ARM_MODE)
   set(ANDROID_ARM_MODE arm)
 endif()
 
diff --git a/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake
index 7448f54..7d58ce1 100644
--- a/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake
+++ b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake
@@ -27,10 +27,13 @@ endif()
 if(NOT CMAKE_C_COMPILER)
   set(CMAKE_C_COMPILER ${CROSS}gcc)
 endif()
-set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm")
+# Note: -march=armv7-a+fp is an alternative to -mfpu with newer versions of
+# gcc:
+# https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=dff2abcbee65dbb4b7ca3ade0f7622ffdc0af391
+set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
 if(NOT CMAKE_CXX_COMPILER)
   set(CMAKE_CXX_COMPILER ${CROSS}g++)
 endif()
-set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm")
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/libgav1/examples/libgav1_examples.cmake b/libgav1/examples/libgav1_examples.cmake
index 1f949f3..a3ec156 100644
--- a/libgav1/examples/libgav1_examples.cmake
+++ b/libgav1/examples/libgav1_examples.cmake
@@ -17,6 +17,13 @@ if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
 endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
 set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
 
+if(NOT LIBGAV1_ENABLE_EXAMPLES)
+  macro(libgav1_add_examples_targets)
+
+  endmacro()
+  return()
+endif()
+
 set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
                                 "${libgav1_examples}/file_reader.h"
                                 "${libgav1_examples}/file_reader_constants.cc"
diff --git a/libgav1/src/buffer_pool.cc b/libgav1/src/buffer_pool.cc
index c1a5606..582f13c 100644
--- a/libgav1/src/buffer_pool.cc
+++ b/libgav1/src/buffer_pool.cc
@@ -156,19 +156,15 @@ bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
 }
 
 RefCountedBufferPtr BufferPool::GetFreeBuffer() {
-  // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all happen
-  // from the same thread serially, but the GetFreeBuffer() call in
-  // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
-  // time. So this function has to be thread safe.
-  // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
-  // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
-  // need not be thread safe.
   std::unique_lock<std::mutex> lock(mutex_);
   for (auto buffer : buffers_) {
     if (!buffer->in_use_) {
       buffer->in_use_ = true;
       buffer->progress_row_ = -1;
       buffer->frame_state_ = kFrameStateUnknown;
+      buffer->hdr_cll_set_ = false;
+      buffer->hdr_mdcv_set_ = false;
+      buffer->itut_t35_set_ = false;
       lock.unlock();
       return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
     }
diff --git a/libgav1/src/buffer_pool.h b/libgav1/src/buffer_pool.h
index d9eba6d..d4e50e0 100644
--- a/libgav1/src/buffer_pool.h
+++ b/libgav1/src/buffer_pool.h
@@ -33,6 +33,7 @@
 #include "src/symbol_decoder_context.h"
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
 #include "src/utils/reference_info.h"
 #include "src/utils/segmentation.h"
 #include "src/utils/segmentation_map.h"
@@ -134,6 +135,36 @@ class RefCountedBuffer : public MaxAlignedAllocable {
   int temporal_id() const { return temporal_id_; }
   void set_temporal_id(int value) { temporal_id_ = value; }
 
+  ObuMetadataHdrCll hdr_cll() const { return hdr_cll_; }
+  void set_hdr_cll(const ObuMetadataHdrCll& hdr_cll) {
+    hdr_cll_set_ = true;
+    hdr_cll_ = hdr_cll;
+  }
+  bool hdr_cll_set() const { return hdr_cll_set_; }
+
+  ObuMetadataHdrMdcv hdr_mdcv() const { return hdr_mdcv_; }
+  void set_hdr_mdcv(const ObuMetadataHdrMdcv& hdr_mdcv) {
+    hdr_mdcv_set_ = true;
+    hdr_mdcv_ = hdr_mdcv;
+  }
+  bool hdr_mdcv_set() const { return hdr_mdcv_set_; }
+
+  ObuMetadataItutT35 itut_t35() const { return itut_t35_; }
+  bool set_itut_t35(const ObuMetadataItutT35& itut_t35,
+                    const uint8_t* const payload) {
+    itut_t35_ = itut_t35;
+    if (itut_t35.payload_size > 0) {
+      if (!itut_t35_payload_.Resize(itut_t35.payload_size)) return false;
+      memcpy(itut_t35_payload_.get(), payload, itut_t35.payload_size);
+      itut_t35_.payload_bytes = itut_t35_payload_.get();
+    } else {
+      itut_t35_.payload_bytes = nullptr;
+    }
+    itut_t35_set_ = true;
+    return true;
+  }
+  bool itut_t35_set() const { return itut_t35_set_; }
+
   SegmentationMap* segmentation_map() { return &segmentation_map_; }
   const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
 
@@ -317,6 +348,14 @@ class RefCountedBuffer : public MaxAlignedAllocable {
   int spatial_id_ = 0;
   int temporal_id_ = 0;
 
+  ObuMetadataHdrCll hdr_cll_ = {};
+  bool hdr_cll_set_ = false;  // Set to true when set_hdr_cll() is called.
+  ObuMetadataHdrMdcv hdr_mdcv_ = {};
+  bool hdr_mdcv_set_ = false;  // Set to true when set_hdr_mdcv() is called.
+  ObuMetadataItutT35 itut_t35_ = {};
+  DynamicBuffer<uint8_t> itut_t35_payload_;
+  bool itut_t35_set_ = false;  // Set to true when set_itut_t35() is called.
+
   // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
   SegmentationMap segmentation_map_;
 
diff --git a/libgav1/src/decoder_impl.cc b/libgav1/src/decoder_impl.cc
index dbb9e81..e8de64a 100644
--- a/libgav1/src/decoder_impl.cc
+++ b/libgav1/src/decoder_impl.cc
@@ -1171,6 +1171,24 @@ StatusCode DecoderImpl::CopyFrameToOutputBuffer(
   buffer_.spatial_id = frame->spatial_id();
   buffer_.temporal_id = frame->temporal_id();
   buffer_.buffer_private_data = frame->buffer_private_data();
+  if (frame->hdr_cll_set()) {
+    buffer_.has_hdr_cll = 1;
+    buffer_.hdr_cll = frame->hdr_cll();
+  } else {
+    buffer_.has_hdr_cll = 0;
+  }
+  if (frame->hdr_mdcv_set()) {
+    buffer_.has_hdr_mdcv = 1;
+    buffer_.hdr_mdcv = frame->hdr_mdcv();
+  } else {
+    buffer_.has_hdr_mdcv = 0;
+  }
+  if (frame->itut_t35_set()) {
+    buffer_.has_itut_t35 = 1;
+    buffer_.itut_t35 = frame->itut_t35();
+  } else {
+    buffer_.has_itut_t35 = 0;
+  }
   output_frame_ = frame;
   return kStatusOk;
 }
@@ -1602,7 +1620,7 @@ StatusCode DecoderImpl::ApplyFilmGrain(
          (*film_grain_frame)->buffer()->stride(kPlaneV));
   const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
 #if LIBGAV1_MAX_BITDEPTH >= 10
-  if (displayable_frame->buffer()->bitdepth() > 8) {
+  if (displayable_frame->buffer()->bitdepth() == 10) {
     FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
                              displayable_frame->buffer()->is_monochrome(),
                              color_matrix_is_identity,
@@ -1625,6 +1643,30 @@ StatusCode DecoderImpl::ApplyFilmGrain(
     return kStatusOk;
   }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+  if (displayable_frame->buffer()->bitdepth() == 12) {
+    FilmGrain<12> film_grain(displayable_frame->film_grain_params(),
+                             displayable_frame->buffer()->is_monochrome(),
+                             color_matrix_is_identity,
+                             displayable_frame->buffer()->subsampling_x(),
+                             displayable_frame->buffer()->subsampling_y(),
+                             displayable_frame->upscaled_width(),
+                             displayable_frame->frame_height(), thread_pool);
+    if (!film_grain.AddNoise(
+            displayable_frame->buffer()->data(kPlaneY),
+            displayable_frame->buffer()->stride(kPlaneY),
+            displayable_frame->buffer()->data(kPlaneU),
+            displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+            (*film_grain_frame)->buffer()->data(kPlaneY),
+            (*film_grain_frame)->buffer()->stride(kPlaneY),
+            (*film_grain_frame)->buffer()->data(kPlaneU),
+            (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+      LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+      return kStatusOutOfMemory;
+    }
+    return kStatusOk;
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
   FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
                           displayable_frame->buffer()->is_monochrome(),
                           color_matrix_is_identity,
diff --git a/libgav1/src/decoder_impl.h b/libgav1/src/decoder_impl.h
index b52ecdf..b75417d 100644
--- a/libgav1/src/decoder_impl.h
+++ b/libgav1/src/decoder_impl.h
@@ -141,8 +141,9 @@ class DecoderImpl : public Allocable {
                           int64_t user_private_data, void* buffer_private_data);
   StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
   static constexpr int GetMaxBitdepth() {
-    static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10,
-                  "LIBGAV1_MAX_BITDEPTH must be 8 or 10.");
+    static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10 ||
+                      LIBGAV1_MAX_BITDEPTH == 12,
+                  "LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.");
     return LIBGAV1_MAX_BITDEPTH;
   }
 
diff --git a/libgav1/src/dsp/arm/common_neon.h b/libgav1/src/dsp/arm/common_neon.h
index 9c46525..c0af2c1 100644
--- a/libgav1/src/dsp/arm/common_neon.h
+++ b/libgav1/src/dsp/arm/common_neon.h
@@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
   return dst;
 }
 
+inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return vreinterpretq_u16_u8(
+      MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
+}
+
 inline uint8x8_t Load1MsanU8(const uint8_t* const source,
                              const ptrdiff_t over_read_in_bytes) {
   return MaskOverreads(vld1_u8(source), over_read_in_bytes);
@@ -325,20 +331,6 @@ inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
       vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
 }
 
-inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source,
-                                  const ptrdiff_t over_read_in_bytes) {
-  // Relative source index of elements (2 bytes each):
-  // dst.val[0]: 00 02 04 06 08 10 12 14
-  // dst.val[1]: 01 03 05 07 09 11 13 15
-  uint16x8x2_t dst = vld2q_u16(source);
-  dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ(
-      vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1));
-  dst.val[1] = vreinterpretq_u16_u8(
-      MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]),
-                     (over_read_in_bytes >> 1) + (over_read_in_bytes % 4)));
-  return dst;
-}
-
 inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
                                 const ptrdiff_t over_read_in_bytes) {
   return vreinterpretq_u32_u8(MaskOverreadsQ(
@@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) {
   vst1q_u16(static_cast<uint16_t*>(buf), val);
 }
 
+inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
+#if LIBGAV1_MSAN
+  // The memory shadow is incorrect for vst4q_u16, only marking the first 16
+  // bytes of the destination as initialized. To avoid missing truly
+  // uninitialized memory, check the input vectors first, before marking the
+  // whole 64 bytes initialized. If any input vector contains unused values, it
+  // should pass through MaskOverreadsQ first.
+  __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
+  __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
+  __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
+  __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
+  vst4q_s16(static_cast<int16_t*>(buf), src);
+  __msan_unpoison(buf, sizeof(int16x8x4_t));
+#else
+  vst4q_s16(static_cast<int16_t*>(buf), src);
+#endif  // LIBGAV1_MSAN
+}
+
 //------------------------------------------------------------------------------
 // Pointer helpers.
 
@@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
 //------------------------------------------------------------------------------
 // Saturation helpers.
 
-inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) {
+inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
+                          const int16x4_t high) {
   return vmin_s16(vmax_s16(val, low), high);
 }
 
@@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
   return vminq_s16(vmaxq_s16(val, low), high);
 }
 
-inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) {
+inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
   const int16x8_t low = vdupq_n_s16(0);
   const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
 
@@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
 // Output:
 // b0.val[0]: 00 01 02 03 16 17 18 19
 // b0.val[1]: 04 05 06 07 20 21 22 23
-inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
   int16x8x2_t b0;
   b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                            vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
   return b0;
 }
 
-inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
   uint16x8x2_t b0;
   b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                            vreinterpret_u16_u32(vget_low_u32(a1)));
@@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
 // 10 11 12 13
 // 20 21 22 23
 // 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
 inline void Transpose4x4(uint16x4_t a[4]) {
   // b:
   // 00 10 02 12
diff --git a/libgav1/src/dsp/arm/convolve_10bit_neon.cc b/libgav1/src/dsp/arm/convolve_10bit_neon.cc
index b7205df..389f029 100644
--- a/libgav1/src/dsp/arm/convolve_10bit_neon.cc
+++ b/libgav1/src/dsp/arm/convolve_10bit_neon.cc
@@ -45,12 +45,12 @@ namespace {
 //   Pixel output range:                [       0,     1023]
 //   Compound output range:             [    3988,    61532]
 
-template <int filter_index>
+template <int num_taps>
 int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
                            const int16x4_t* const taps) {
   const auto* ssrc = reinterpret_cast<const int16x8_t*>(src);
   int32x4x2_t sum;
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -65,7 +65,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -84,7 +84,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -106,12 +106,12 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
   return sum;
 }
 
-template <int filter_index>
+template <int num_taps>
 int32x4_t SumOnePassTaps(const uint16x4_t* const src,
                          const int16x4_t* const taps) {
   const auto* ssrc = reinterpret_cast<const int16x4_t*>(src);
   int32x4_t sum;
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     sum = vmull_s16(ssrc[0], taps[0]);
     sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -119,7 +119,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
     sum = vmlal_s16(sum, ssrc[3], taps[3]);
     sum = vmlal_s16(sum, ssrc[4], taps[4]);
     sum = vmlal_s16(sum, ssrc[5], taps[5]);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     sum = vmull_s16(ssrc[0], taps[0]);
     sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -129,7 +129,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
     sum = vmlal_s16(sum, ssrc[5], taps[5]);
     sum = vmlal_s16(sum, ssrc[6], taps[6]);
     sum = vmlal_s16(sum, ssrc[7], taps[7]);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     sum = vmull_s16(ssrc[0], taps[0]);
     sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -143,7 +143,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
   return sum;
 }
 
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
 void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
                                  const ptrdiff_t src_stride,
                                  void* LIBGAV1_RESTRICT const dest,
@@ -162,15 +162,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
         const uint16x8_t src_long_hi = vld1q_u16(s + 8);
         uint16x8_t v_src[8];
         int32x4x2_t v_sum;
-        if (filter_index < 2) {
+        if (num_taps == 6) {
           v_src[0] = src_long;
           v_src[1] = vextq_u16(src_long, src_long_hi, 1);
           v_src[2] = vextq_u16(src_long, src_long_hi, 2);
           v_src[3] = vextq_u16(src_long, src_long_hi, 3);
           v_src[4] = vextq_u16(src_long, src_long_hi, 4);
           v_src[5] = vextq_u16(src_long, src_long_hi, 5);
-          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
-        } else if (filter_index == 2) {
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+        } else if (num_taps == 8) {
           v_src[0] = src_long;
           v_src[1] = vextq_u16(src_long, src_long_hi, 1);
           v_src[2] = vextq_u16(src_long, src_long_hi, 2);
@@ -179,17 +179,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
           v_src[5] = vextq_u16(src_long, src_long_hi, 5);
           v_src[6] = vextq_u16(src_long, src_long_hi, 6);
           v_src[7] = vextq_u16(src_long, src_long_hi, 7);
-          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
-        } else if (filter_index == 3) {
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+        } else if (num_taps == 2) {
           v_src[0] = src_long;
           v_src[1] = vextq_u16(src_long, src_long_hi, 1);
-          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
-        } else {  // filter_index > 3
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+        } else {  // 4 taps
           v_src[0] = src_long;
           v_src[1] = vextq_u16(src_long, src_long_hi, 1);
           v_src[2] = vextq_u16(src_long, src_long_hi, 2);
           v_src[3] = vextq_u16(src_long, src_long_hi, 3);
-          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+          v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
         }
 
         const int16x4_t d0 =
@@ -213,15 +213,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
       const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
       uint16x8_t v_src[8];
       int32x4x2_t v_sum;
-      if (filter_index < 2) {
+      if (num_taps == 6) {
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
         v_src[2] = vextq_u16(src_long, src_long_hi, 2);
         v_src[3] = vextq_u16(src_long, src_long_hi, 3);
         v_src[4] = vextq_u16(src_long, src_long_hi, 4);
         v_src[5] = vextq_u16(src_long, src_long_hi, 5);
-        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
-      } else if (filter_index == 2) {
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+      } else if (num_taps == 8) {
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
         v_src[2] = vextq_u16(src_long, src_long_hi, 2);
@@ -230,17 +230,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
         v_src[5] = vextq_u16(src_long, src_long_hi, 5);
         v_src[6] = vextq_u16(src_long, src_long_hi, 6);
         v_src[7] = vextq_u16(src_long, src_long_hi, 7);
-        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
-      } else if (filter_index == 3) {
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+      } else if (num_taps == 2) {
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
-        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
-      } else {  // filter_index > 3
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+      } else {  // 4 taps
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
         v_src[2] = vextq_u16(src_long, src_long_hi, 2);
         v_src[3] = vextq_u16(src_long, src_long_hi, 3);
-        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+        v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
       }
       if (is_compound) {
         const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
@@ -276,7 +276,7 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
   } while (--y != 0);
 }
 
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
 void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
                             const ptrdiff_t src_stride,
                             void* LIBGAV1_RESTRICT const dest,
@@ -291,14 +291,14 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
     int32x4_t v_sum;
     const uint16x8_t src_long = vld1q_u16(src);
     v_src[0] = vget_low_u16(src_long);
-    if (filter_index == 3) {
+    if (num_taps == 2) {
       v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
-      v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+      v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
     } else {
       v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
       v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
       v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
-      v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+      v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
     }
     if (is_compound || is_2d) {
       const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
@@ -321,7 +321,7 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
   } while (--y != 0);
 }
 
-template <int filter_index, bool is_2d>
+template <int num_taps, bool is_2d>
 void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
                             const ptrdiff_t src_stride,
                             void* LIBGAV1_RESTRICT const dest,
@@ -336,7 +336,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
     const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride));
     const int16x8x2_t input = vzipq_s16(input0, input1);
     int32x4_t v_sum;
-    if (filter_index == 3) {
+    if (num_taps == 2) {
       v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
       v_sum = vmlal_s16(v_sum,
                         vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
@@ -387,7 +387,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
     assert(height % 2 == 1);
     const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src));
     int32x4_t v_sum;
-    if (filter_index == 3) {
+    if (num_taps == 2) {
       v_sum = vmull_s16(vget_low_s16(input), v_tap[3]);
       v_sum =
           vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]);
@@ -406,17 +406,17 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
   }
 }
 
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
 void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t pred_stride, const int width,
                       const int height, const int16x4_t* const v_tap) {
-  assert(width < 8 || filter_index <= 3);
+  assert(width < 8 || num_taps != 4);
   // Don't simplify the redundant if conditions with the template parameters,
   // which helps the compiler generate compact code.
-  if (width >= 8 && filter_index <= 3) {
-    FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>(
+  if (width >= 8 && num_taps != 4) {
+    FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>(
         src, src_stride, dest, pred_stride, width, height, v_tap);
     return;
   }
@@ -424,17 +424,17 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
   // Horizontal passes only needs to account for number of taps 2 and 4 when
   // |width| <= 4.
   assert(width <= 4);
-  assert(filter_index >= 3 && filter_index <= 5);
-  if (filter_index >= 3 && filter_index <= 5) {
+  assert(num_taps == 2 || num_taps == 4);
+  if (num_taps == 2 || num_taps == 4) {
     if (width == 4) {
-      FilterHorizontalWidth4<filter_index, is_compound, is_2d>(
+      FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
           src, src_stride, dest, pred_stride, height, v_tap);
       return;
     }
     assert(width == 2);
     if (!is_compound) {
-      FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
-                                                  pred_stride, height, v_tap);
+      FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest,
+                                              pred_stride, height, v_tap);
     }
   }
 }
@@ -455,22 +455,17 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
   }
 
   if (filter_index == 2) {  // 8 tap.
-    FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride,
+    FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride,
                                             width, height, v_tap);
-  } else if (filter_index == 1) {  // 6 tap.
-    FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst,
+  } else if (filter_index < 2) {  // 6 tap.
+    FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst,
                                             dst_stride, width, height, v_tap);
-  } else if (filter_index == 0) {  // 6 tap.
-    FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst,
-                                            dst_stride, width, height, v_tap);
-  } else if (filter_index == 4) {  // 4 tap.
+  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
     FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
                                             dst_stride, width, height, v_tap);
-  } else if (filter_index == 5) {  // 4 tap.
-    FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst,
-                                            dst_stride, width, height, v_tap);
   } else {  // 2 tap.
-    FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst,
+    FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
                                             dst_stride, width, height, v_tap);
   }
 }
@@ -510,13 +505,12 @@ void ConvolveCompoundHorizontal_NEON(
                                          filter_index);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
                     const ptrdiff_t src_stride,
                     void* LIBGAV1_RESTRICT const dst,
                     const ptrdiff_t dst_stride, const int width,
                     const int height, const int16x4_t* const taps) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   auto* const dst16 = static_cast<uint16_t*>(dst);
@@ -555,7 +549,7 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
       srcs[next_row] = vld1q_u16(src_x);
       src_x += src_stride;
 
-      const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+      const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
       if (is_compound) {
         const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
         const int16x4_t d0 =
@@ -593,13 +587,12 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
   } while (x < width);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int height,
                        const int16x4_t* const taps) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -633,8 +626,8 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
     srcs[num_taps] = vld1_u16(src);
     src += src_stride;
 
-    const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
-    const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps);
+    const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+    const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps);
     if (is_compound) {
       const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
       const int16x4_t d1 =
@@ -673,13 +666,12 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
   } while (y != 0);
 }
 
-template <int filter_index>
+template <int num_taps>
 void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int height,
                        const int16x4_t* const taps) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -718,7 +710,7 @@ void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
     src += src_stride;
     srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2);
 
-    const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+    const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
     const uint16x4_t d0 =
         vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
     Store2<0>(dst16, d0);
@@ -1180,13 +1172,13 @@ void ConvolveVertical_NEON(
 
   if (filter_index == 0) {  // 6 tap.
     if (width == 2) {
-      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+      FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else if (width == 4) {
-      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+      FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else {
-      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
                         taps + 1);
     }
   } else if ((static_cast<int>(filter_index == 1) &
@@ -1196,33 +1188,33 @@ void ConvolveVertical_NEON(
                static_cast<int>(vertical_filter_id == 9) |
                static_cast<int>(vertical_filter_id == 15))) != 0) {  // 6 tap.
     if (width == 2) {
-      FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+      FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else if (width == 4) {
-      FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+      FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else {
-      FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
                         taps + 1);
     }
   } else if (filter_index == 2) {  // 8 tap.
     if (width == 2) {
-      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
     } else {
-      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
   } else if (filter_index == 3) {  // 2 tap.
     if (width == 2) {
-      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height,
                            taps + 3);
     } else if (width == 4) {
-      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height,
                            taps + 3);
     } else {
-      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
                         taps + 3);
     }
   } else {
@@ -1240,13 +1232,13 @@ void ConvolveVertical_NEON(
     // treating it as though it has 4.
     if (filter_index == 1) src += src_stride;
     if (width == 2) {
-      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
                            taps + 2);
     } else if (width == 4) {
-      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
                            taps + 2);
     } else {
-      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
                         taps + 2);
     }
   }
@@ -1274,10 +1266,10 @@ void ConvolveCompoundVertical_NEON(
 
   if (filter_index == 0) {  // 6 tap.
     if (width == 4) {
-      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 1);
     } else {
-      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 1);
     }
   } else if ((static_cast<int>(filter_index == 1) &
@@ -1287,26 +1279,26 @@ void ConvolveCompoundVertical_NEON(
                static_cast<int>(vertical_filter_id == 9) |
                static_cast<int>(vertical_filter_id == 15))) != 0) {  // 6 tap.
     if (width == 4) {
-      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 1);
     } else {
-      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 1);
     }
   } else if (filter_index == 2) {  // 8 tap.
     if (width == 4) {
-      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps);
     } else {
-      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
     }
   } else if (filter_index == 3) {  // 2 tap.
     if (width == 4) {
-      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 3);
     } else {
-      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 3);
     }
   } else {
@@ -1323,10 +1315,10 @@ void ConvolveCompoundVertical_NEON(
     // treating it as though it has 4.
     if (filter_index == 1) src += src_stride;
     if (width == 4) {
-      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 2);
     } else {
-      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 2);
     }
   }
@@ -1980,7 +1972,7 @@ inline void ConvolveKernelHorizontal2Tap(
                                  PermuteSrcVals(src_bytes, src_lookup[1])};
 
       vst1_s16(intermediate,
-               vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps),
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps),
                             kInterRoundBitsHorizontal - 1));
       src_y = AddByteStride(src_y, src_stride);
       intermediate += kIntermediateStride;
@@ -2034,13 +2026,12 @@ inline void ConvolveKernelHorizontal2Tap(
       const uint16x4_t src_high[2] = {vget_high_u16(src[0]),
                                       vget_high_u16(src[1])};
 
-      vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(
-                                                src_low, taps_low),
-                                            kInterRoundBitsHorizontal - 1));
-      vst1_s16(
-          intermediate_x + 4,
-          vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high),
-                       kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
       // Avoid right shifting the stride.
       src_x = AddByteStride(src_x, src_stride);
       intermediate_x += kIntermediateStride;
@@ -2123,7 +2114,7 @@ inline void ConvolveKernelHorizontalPositive4Tap(
                                PermuteSrcVals(src_bytes, src_lookup[3])};
 
     vst1_s16(intermediate,
-             vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps),
+             vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
                           kInterRoundBitsHorizontal - 1));
     src_y = AddByteStride(src_y, src_stride);
     intermediate += kIntermediateStride;
@@ -2202,7 +2193,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
                                PermuteSrcVals(src_bytes, src_lookup[3])};
 
     vst1_s16(intermediate,
-             vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps),
+             vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
                           kInterRoundBitsHorizontal - 1));
     src_y = AddByteStride(src_y, src_stride);
     intermediate += kIntermediateStride;
@@ -2297,13 +2288,12 @@ inline void ConvolveKernelHorizontalSigned6Tap(
         src_high[i] = vget_high_u16(src_i);
       }
 
-      vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(
-                                                src_low, taps_low),
-                                            kInterRoundBitsHorizontal - 1));
-      vst1_s16(
-          intermediate_x + 4,
-          vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high),
-                       kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
       // Avoid right shifting the stride.
       src_x = AddByteStride(src_x, src_stride);
       intermediate_x += kIntermediateStride;
@@ -2401,13 +2391,12 @@ inline void ConvolveKernelHorizontalMixed6Tap(
         src_high[i] = vget_high_u16(src_i);
       }
 
-      vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(
-                                                src_low, taps_low),
-                                            kInterRoundBitsHorizontal - 1));
-      vst1_s16(
-          intermediate_x + 4,
-          vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high),
-                       kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
       // Avoid right shifting the stride.
       src_x = AddByteStride(src_x, src_stride);
       intermediate_x += kIntermediateStride;
@@ -2505,13 +2494,12 @@ inline void ConvolveKernelHorizontalSigned8Tap(
         src_high[i] = vget_high_u16(src_i);
       }
 
-      vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(
-                                                src_low, taps_low),
-                                            kInterRoundBitsHorizontal - 1));
-      vst1_s16(
-          intermediate_x + 4,
-          vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high),
-                       kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(intermediate_x + 4,
+               vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high),
+                            kInterRoundBitsHorizontal - 1));
       // Avoid right shifting the stride.
       src_x = AddByteStride(src_x, src_stride);
       intermediate_x += kIntermediateStride;
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
index 7d287c8..6087276 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -36,44 +36,48 @@ constexpr int kInterPostRoundBit = 4;
 namespace low_bitdepth {
 namespace {
 
-inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0,
                                          const int16x8_t pred1,
-                                         const int16x4_t weights[2]) {
-  // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
-  const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
-  const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
-  const int32x4_t blended_lo =
-      vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
-  const int32x4_t blended_hi =
-      vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
-
-  return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
-                      vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
+                                         const int16x8_t weight) {
+  // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0
+  // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >>
+  //    8(=kInterPostRoundBit + 4)
+  // The formula is manipulated to avoid lengthening to 32 bits.
+  // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+  // = (p0 - p1) * w0 + 16 * p1
+  // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+  const int16x8_t diff = vsubq_s16(pred0, pred1);
+  // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4)
+  const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight);
+  // ((p0 - p1) * w0 >> 4) + p1
+  const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1);
+  // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+  return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit);
 }
 
-template <int width, int height>
+template <int width>
 inline void DistanceWeightedBlendSmall_NEON(
     const int16_t* LIBGAV1_RESTRICT prediction_0,
-    const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
-    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT prediction_1, const int height,
+    const int16x8_t weight, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   constexpr int step = 16 / width;
 
-  for (int y = 0; y < height; y += step) {
+  int y = height;
+  do {
     const int16x8_t src_00 = vld1q_s16(prediction_0);
     const int16x8_t src_10 = vld1q_s16(prediction_1);
     prediction_0 += 8;
     prediction_1 += 8;
-    const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+    const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight);
 
     const int16x8_t src_01 = vld1q_s16(prediction_0);
     const int16x8_t src_11 = vld1q_s16(prediction_1);
     prediction_0 += 8;
     prediction_1 += 8;
-    const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+    const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight);
 
-    const uint8x8_t result0 = vqmovun_s16(res0);
-    const uint8x8_t result1 = vqmovun_s16(res1);
     if (width == 4) {
       StoreLo4(dst, result0);
       dst += dest_stride;
@@ -90,12 +94,13 @@ inline void DistanceWeightedBlendSmall_NEON(
       vst1_u8(dst, result1);
       dst += dest_stride;
     }
-  }
+    y -= step;
+  } while (y != 0);
 }
 
 inline void DistanceWeightedBlendLarge_NEON(
     const int16_t* LIBGAV1_RESTRICT prediction_0,
-    const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
+    const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight,
     const int width, const int height, void* LIBGAV1_RESTRICT const dest,
     const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
@@ -106,16 +111,15 @@ inline void DistanceWeightedBlendLarge_NEON(
     do {
       const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
       const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
-      const int16x8_t res_lo =
-          ComputeWeightedAverage8(src0_lo, src1_lo, weights);
+      const uint8x8_t res_lo =
+          ComputeWeightedAverage8(src0_lo, src1_lo, weight);
 
       const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
       const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
-      const int16x8_t res_hi =
-          ComputeWeightedAverage8(src0_hi, src1_hi, weights);
+      const uint8x8_t res_hi =
+          ComputeWeightedAverage8(src0_hi, src1_hi, weight);
 
-      const uint8x16_t result =
-          vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi));
+      const uint8x16_t result = vcombine_u8(res_lo, res_hi);
       vst1q_u8(dst + x, result);
       x += 16;
     } while (x < width);
@@ -128,52 +132,25 @@ inline void DistanceWeightedBlendLarge_NEON(
 inline void DistanceWeightedBlend_NEON(
     const void* LIBGAV1_RESTRICT prediction_0,
     const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0,
-    const uint8_t weight_1, const int width, const int height,
+    const uint8_t /*weight_1*/, const int width, const int height,
     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
-  // TODO(johannkoenig): Investigate the branching. May be fine to call with a
-  // variable height.
+  // Upscale the weight for vqdmulh.
+  const int16x8_t weight = vdupq_n_s16(weight_0 << 11);
   if (width == 4) {
-    if (height == 4) {
-      DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest,
-                                            dest_stride);
-    } else if (height == 8) {
-      DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest,
-                                            dest_stride);
-    } else {
-      assert(height == 16);
-      DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest,
-                                             dest_stride);
-    }
+    DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest,
+                                       dest_stride);
     return;
   }
 
   if (width == 8) {
-    switch (height) {
-      case 4:
-        DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest,
-                                              dest_stride);
-        return;
-      case 8:
-        DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest,
-                                              dest_stride);
-        return;
-      case 16:
-        DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest,
-                                               dest_stride);
-        return;
-      default:
-        assert(height == 32);
-        DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest,
-                                               dest_stride);
-
-        return;
-    }
+    DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest,
+                                       dest_stride);
+    return;
   }
 
-  DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest,
+  DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest,
                                   dest_stride);
 }
 
diff --git a/libgav1/src/dsp/arm/film_grain_neon.cc b/libgav1/src/dsp/arm/film_grain_neon.cc
index 0b1b481..76e1151 100644
--- a/libgav1/src/dsp/arm/film_grain_neon.cc
+++ b/libgav1/src/dsp/arm/film_grain_neon.cc
@@ -18,23 +18,21 @@
 #if LIBGAV1_ENABLE_NEON
 #include <arm_neon.h>
 
-#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <new>
 
 #include "src/dsp/arm/common_neon.h"
-#include "src/dsp/arm/film_grain_neon.h"
-#include "src/dsp/common.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
 #include "src/utils/common.h"
 #include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
 #include "src/utils/memory.h"
+#include "src/utils/types.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -52,10 +50,8 @@ inline int16x8_t GetSignedSource8(const uint8_t* src) {
   return ZeroExtend(vld1_u8(src));
 }
 
-inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) {
-  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
-  // causing test vector failures.
-  return ZeroExtend(Load1MsanU8(src, 0));
+inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) {
+  return ZeroExtend(Load1MsanU8(src, 8 - valid_range));
 }
 
 inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
@@ -69,11 +65,8 @@ inline int16x8_t GetSignedSource8(const uint16_t* src) {
   return vreinterpretq_s16_u16(vld1q_u16(src));
 }
 
-inline int16x8_t GetSignedSource8Msan(const uint16_t* src,
-                                      int /*valid_range*/) {
-  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
-  // causing test vector failures.
-  return vreinterpretq_s16_u16(Load1QMsanU16(src, 0));
+inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) {
+  return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range));
 }
 
 inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
@@ -198,17 +191,13 @@ inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
 }
 
 inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
-                                     int subsampling_x, int /*valid_range*/) {
+                                     int subsampling_x, int valid_range) {
   if (subsampling_x != 0) {
-    // TODO(b/194217060): restore |valid_range| usage after correcting call
-    // sites causing test vector failures.
-    const uint8x16_t src = Load1QMsanU8(luma, 0);
-
+    const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range);
+    // MemorySanitizer registers vpaddlq_u8 as a use of the memory.
     return vrshrq_n_u16(vpaddlq_u8(src), 1);
   }
-  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
-  // causing test vector failures.
-  return vmovl_u8(Load1MsanU8(luma, 0));
+  return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range);
 }
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
@@ -252,16 +241,13 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
 }
 
 inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
-                                     int subsampling_x, int /*valid_range*/) {
+                                     int subsampling_x, int valid_range) {
   if (subsampling_x != 0) {
-    // TODO(b/194217060): restore |valid_range| usage after correcting call
-    // sites causing test vector failures.
-    const uint16x8x2_t src = Load2QMsanU16(luma, 0);
-    return vrhaddq_u16(src.val[0], src.val[1]);
+    const uint16x8x2_t src = vld2q_u16(luma);
+    const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]);
+    return MaskOverreadsQ(result, 16 - valid_range);
   }
-  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
-  // causing test vector failures.
-  return Load1QMsanU16(luma, 0);
+  return Load1QMsanU16(luma, 16 - valid_range);
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
@@ -614,8 +600,7 @@ void InitializeScalingLookupTable_NEON(int num_points,
   }
   static_assert(sizeof(scaling_lut[0]) == 2, "");
   Memset(scaling_lut, point_scaling[0],
-         std::max(static_cast<int>(point_value[0]), 1)
-             << (bitdepth - kBitdepth8));
+         (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8));
   const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
   const int32x4_t rounding = vdupq_n_s32(32768);
   for (int i = 0; i < num_points - 1; ++i) {
@@ -666,7 +651,7 @@ void InitializeScalingLookupTable_NEON(int num_points,
         const int16x8x4_t result = {
             start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
             vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
-        vst4q_s16(&scaling_lut[x_base], result);
+        Store4QMsanS16(&scaling_lut[x_base], result);
       } else {
         vst1q_s16(&scaling_lut[x_base], full_interp);
       }
@@ -696,13 +681,29 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
 }
 
 template <int bitdepth, typename Pixel>
-inline int16x8_t GetScalingFactors(
-    const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+                                   const Pixel* source) {
   int16_t start_vals[8];
   static_assert(bitdepth <= kBitdepth10,
                 "NEON Film Grain is not yet implemented for 12bpp.");
+#if LIBGAV1_MSAN
+  memset(start_vals, 0, sizeof(start_vals));
+#endif
   for (int i = 0; i < 8; ++i) {
-    assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+    assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
+    start_vals[i] = scaling_lut[source[i]];
+  }
+  return vld1q_s16(start_vals);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+                                   const Pixel* source, const int valid_range) {
+  int16_t start_vals[8] = {};  // Zero the lanes at or past valid_range.
+  static_assert(bitdepth <= kBitdepth10,
+                "NEON Film Grain is not yet implemented for 12bpp.");
+  for (int i = 0; i < valid_range; ++i) {
+    assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
     start_vals[i] = scaling_lut[source[i]];
   }
   return vld1q_s16(start_vals);
@@ -743,10 +744,11 @@ void BlendNoiseWithImageLuma_NEON(
   const int16x8_t scaling_shift_vect = vdupq_n_s16(
       (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
 
+  const int safe_width = width & ~15;
   int y = 0;
   do {
     int x = 0;
-    do {
+    for (; x + 8 <= safe_width; x += 8) {
       // This operation on the unsigned input is safe in 8bpp because the vector
       // is widened before it is reinterpreted.
       const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
@@ -767,8 +769,8 @@ void BlendNoiseWithImageLuma_NEON(
       // This operation on the unsigned input is safe in 8bpp because the vector
       // is widened before it is reinterpreted.
       const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
-      const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>(
-          scaling_lut_y, &in_y_row[std::min(x, width)]);
+      const int16x8_t scaling1 =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
       noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
 
       noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
@@ -778,8 +780,41 @@ void BlendNoiseWithImageLuma_NEON(
       // function for just that case, though the gain would be very small.
       StoreUnsigned8(&out_y_row[x],
                      vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling)));
-      x += 8;
-    } while (x < width);
+    }
+
+    if (x < width) {
+      assert(width - x < 16);
+      if (x < width - 8) {
+        const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+        const int16x8_t scaling =
+            GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+        int16x8_t noise =
+            GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+        noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+        const int16x8_t combined = vaddq_s16(orig, noise);
+        // In 8bpp, when params_.clip_to_restricted_range == false, we can
+        // replace clipping with vqmovun_s16, but it's not likely to be worth
+        // copying the function for just that case, since the gain would be
+        // very small.
+        StoreUnsigned8(&out_y_row[x],
+                       vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+        x += 8;
+      }
+      const int valid_range_pixels = width - x;
+      const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]);
+      const int16x8_t orig =
+          GetSignedSource8Msan(&in_y_row[x], valid_range_bytes);
+      const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+          scaling_lut_y, &in_y_row[x], valid_range_pixels);
+      int16x8_t noise =
+          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+      noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+
+      const int16x8_t combined = vaddq_s16(orig, noise);
+      StoreUnsigned8(&out_y_row[x],
+                     vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+    }
     in_y_row += source_stride_y;
     out_y_row += dest_stride_y;
   } while (++y < height);
@@ -787,13 +822,9 @@ void BlendNoiseWithImageLuma_NEON(
 
 template <int bitdepth, typename GrainType, typename Pixel>
 inline int16x8_t BlendChromaValsWithCfl(
-    const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
-    const int16_t* LIBGAV1_RESTRICT scaling_lut,
     const Pixel* LIBGAV1_RESTRICT chroma_cursor,
     const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
-    const int16x8_t scaling_shift_vect) {
-  const int16x8_t scaling =
-      GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+    const int16x8_t scaling, const int16x8_t scaling_shift_vect) {
   const int16x8_t orig = GetSignedSource8(chroma_cursor);
   int16x8_t noise = GetSignedSource8(noise_image_cursor);
   noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
@@ -812,7 +843,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
   const int16x8_t floor = vdupq_n_s16(min_value);
   const int16x8_t ceiling = vdupq_n_s16(max_chroma);
   Pixel luma_buffer[16];
-  memset(luma_buffer, 0, sizeof(luma_buffer));
   // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
   // for 16 bit signed integers. In higher bitdepths, however, we have to
   // expand to 32 to protect the sign bit.
@@ -831,40 +861,45 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
   int y = 0;
   do {
     int x = 0;
-    do {
+    for (; x + 8 <= safe_chroma_width; x += 8) {
       const int luma_x = x << subsampling_x;
       const uint16x8_t average_luma =
           GetAverageLuma(&in_y_row[luma_x], subsampling_x);
       StoreUnsigned8(average_luma_buffer, average_luma);
 
+      const int16x8_t scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
       const int16x8_t blended =
           BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
-              average_luma_buffer, scaling_lut, &in_chroma_row[x],
-              &(noise_image[y + start_height][x]), scaling_shift_vect);
+              &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+              scaling_shift_vect);
 
       // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
       // clipping with vqmovun_s16, but it's not likely to be worth copying the
       // function for just that case.
       StoreUnsigned8(&out_chroma_row[x],
                      vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
-      x += 8;
-    } while (x < safe_chroma_width);
+    }
 
     if (x < chroma_width) {
       const int luma_x = x << subsampling_x;
       const int valid_range_pixels = width - luma_x;
+      const int valid_range_chroma_pixels = chroma_width - x;
       const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+      assert(valid_range_pixels < 16);
       memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
       luma_buffer[valid_range_pixels] = in_y_row[width - 1];
       const uint16x8_t average_luma = GetAverageLumaMsan(
-          luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]));
+          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1);
 
       StoreUnsigned8(average_luma_buffer, average_luma);
 
+      const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+          scaling_lut, average_luma_buffer, valid_range_chroma_pixels);
       const int16x8_t blended =
           BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
-              average_luma_buffer, scaling_lut, &in_chroma_row[x],
-              &(noise_image[y + start_height][x]), scaling_shift_vect);
+              &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+              scaling_shift_vect);
       // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
       // clipping with vqmovun_s16, but it's not likely to be worth copying the
       // function for just that case.
@@ -915,7 +950,8 @@ inline int16x8_t BlendChromaValsNoCfl(
     const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
     const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
     const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
-    const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
+    const int16x8_t& offset, int luma_multiplier, int chroma_multiplier,
+    bool restrict_scaling_lookup, int valid_range_pixels = 0) {
   uint8_t merged_buffer[8];
   const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
   const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
@@ -925,8 +961,12 @@ inline int16x8_t BlendChromaValsNoCfl(
   // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
   const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
   vst1_u8(merged_buffer, merged);
+
   const int16x8_t scaling =
-      GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
+      restrict_scaling_lookup
+          ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer,
+                                                   valid_range_pixels)
+          : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
   int16x8_t noise = GetSignedSource8(noise_image_cursor);
   noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
   return vaddq_s16(orig, noise);
@@ -952,34 +992,28 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
   const int chroma_width = (width + subsampling_x) >> subsampling_x;
   const int safe_chroma_width = chroma_width & ~7;
   uint8_t luma_buffer[16];
-#if LIBGAV1_MSAN
-  // Quiet msan warnings.
-  memset(luma_buffer, 0, sizeof(luma_buffer));
-#endif
   const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
 
   start_height >>= subsampling_y;
   int y = 0;
   do {
     int x = 0;
-    do {
+    for (; x + 8 <= safe_chroma_width; x += 8) {
       const int luma_x = x << subsampling_x;
-      const int valid_range = width - luma_x;
+      const int valid_range_chroma_pixels = chroma_width - x;
 
       const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
-      const int16x8_t average_luma = vreinterpretq_s16_u16(
-          GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range));
+      const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+          &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1));
       const int16x8_t blended = BlendChromaValsNoCfl(
           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
           average_luma, scaling_shift_vect, offset, luma_multiplier,
-          chroma_multiplier);
+          chroma_multiplier, /*restrict_scaling_lookup=*/false);
       // In 8bpp, when params_.clip_to_restricted_range == false, we can
       // replace clipping with vqmovun_s16, but the gain would be small.
       StoreUnsigned8(&out_chroma_row[x],
                      vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
-
-      x += 8;
-    } while (x < safe_chroma_width);
+    }
 
     if (x < chroma_width) {
       // Begin right edge iteration. Same as the normal iterations, but the
@@ -988,19 +1022,20 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
       const int luma_x = x << subsampling_x;
       const int valid_range_pixels = width - luma_x;
       const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+      assert(valid_range_pixels < 16);
       memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
       luma_buffer[valid_range_pixels] = in_y_row[width - 1];
-      const int valid_range_chroma_bytes =
-          (chroma_width - x) * sizeof(in_chroma_row[0]);
+      const int valid_range_chroma_pixels = chroma_width - x;
 
       const int16x8_t orig_chroma =
-          GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
+          GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels);
       const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
-          luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
+          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
       const int16x8_t blended = BlendChromaValsNoCfl(
           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
           average_luma, scaling_shift_vect, offset, luma_multiplier,
-          chroma_multiplier);
+          chroma_multiplier, /*restrict_scaling_lookup=*/true,
+          valid_range_chroma_pixels);
       StoreUnsigned8(&out_chroma_row[x],
                      vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
       // End of right edge iteration.
@@ -1267,7 +1302,8 @@ inline int16x8_t BlendChromaValsNoCfl(
     const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
     const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
     const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
-    const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) {
+    const int32x4_t& offset, int luma_multiplier, int chroma_multiplier,
+    bool restrict_scaling_lookup, int valid_range_pixels = 0) {
   uint16_t merged_buffer[8];
   const int32x4_t weighted_luma_low =
       vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
@@ -1287,7 +1323,11 @@ inline int16x8_t BlendChromaValsNoCfl(
   vst1q_u16(merged_buffer,
             vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel));
   const int16x8_t scaling =
-      GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer);
+      restrict_scaling_lookup
+          ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer,
+                                                     valid_range_pixels)
+          : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut,
+                                                     merged_buffer);
   const int16x8_t noise = GetSignedSource8(noise_image_cursor);
   const int16x8_t scaled_noise =
       ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect);
@@ -1311,11 +1351,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
   const int chroma_width = (width + subsampling_x) >> subsampling_x;
   const int safe_chroma_width = chroma_width & ~7;
   uint16_t luma_buffer[16];
-#if LIBGAV1_MSAN
-  // TODO(b/194217060): This can be removed if the range calculations below are
-  // fixed.
-  memset(luma_buffer, 0, sizeof(luma_buffer));
-#endif
   // Offset is added before downshifting in order to take advantage of
   // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
   const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));
@@ -1324,7 +1359,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
   int y = 0;
   do {
     int x = 0;
-    do {
+    for (; x + 8 <= safe_chroma_width; x += 8) {
       const int luma_x = x << subsampling_x;
       const int16x8_t average_luma = vreinterpretq_s16_u16(
           GetAverageLuma(&in_y_row[luma_x], subsampling_x));
@@ -1332,12 +1367,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
       const int16x8_t blended = BlendChromaValsNoCfl(
           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
           average_luma, scaling_shift_vect, offset, luma_multiplier,
-          chroma_multiplier);
+          chroma_multiplier, /*restrict_scaling_lookup=*/false);
       StoreUnsigned8(&out_chroma_row[x],
                      vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
-
-      x += 8;
-    } while (x < safe_chroma_width);
+    }
 
     if (x < chroma_width) {
       // Begin right edge iteration. Same as the normal iterations, but the
@@ -1346,19 +1379,22 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
       const int luma_x = x << subsampling_x;
       const int valid_range_pixels = width - luma_x;
       const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+      assert(valid_range_pixels < 16);
       memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
       luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+      const int valid_range_chroma_pixels = chroma_width - x;
       const int valid_range_chroma_bytes =
           (chroma_width - x) * sizeof(in_chroma_row[0]);
       const int16x8_t orig_chroma =
           GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
 
       const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
-          luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
+          luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
       const int16x8_t blended = BlendChromaValsNoCfl(
           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
           average_luma, scaling_shift_vect, offset, luma_multiplier,
-          chroma_multiplier);
+          chroma_multiplier, /*restrict_scaling_lookup=*/true,
+          valid_range_chroma_pixels);
       StoreUnsigned8(&out_chroma_row[x],
                      vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
       // End of right edge iteration.
@@ -1442,10 +1478,8 @@ void Init10bpp() {
   dsp->film_grain.initialize_scaling_lut =
       InitializeScalingLookupTable_NEON<kBitdepth10>;
 
-  // TODO(b/194442742): reenable this function after segfault under armv7 ASan
-  // is fixed.
-  // dsp->film_grain.blend_noise_luma =
-  //     BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
   dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
   dsp->film_grain.blend_noise_chroma[1] =
       BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
diff --git a/libgav1/src/dsp/arm/film_grain_neon.h b/libgav1/src/dsp/arm/film_grain_neon.h
index 3ba2eef..09596e2 100644
--- a/libgav1/src/dsp/arm/film_grain_neon.h
+++ b/libgav1/src/dsp/arm/film_grain_neon.h
@@ -39,9 +39,7 @@ void FilmGrainInit_NEON();
 #define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
 #define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
 #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
-// TODO(b/194442742): reenable this function after segfault under armv7 ASan is
-// fixed.
-// #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
 #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
 #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
 #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
diff --git a/libgav1/src/dsp/arm/intrapred_directional_neon.cc b/libgav1/src/dsp/arm/intrapred_directional_neon.cc
index 3cad4a6..e9bdcf0 100644
--- a/libgav1/src/dsp/arm/intrapred_directional_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_directional_neon.cc
@@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH(
   } while (++y < height);
 }
 
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
-    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-// 7.11.2.4 (8) 90 < angle > 180
-// The strategy for these functions (4xH and 8+xH) is to know how many blocks
-// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
-// then handle only blocks that take from |left_ptr|. Additionally, a fast
-// index-shuffle approach is used for pred values from |left_column| in sections
-// that permit it.
+//  7.11.2.4 (8) 90 < angle < 180
+//  The strategy for these functions (4xH and 8+xH) is to know how many blocks
+//  can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+//  then handle only blocks that take from |left_ptr|. Additionally, a fast
+//  index-shuffle approach is used for pred values from |left_column| in
+//  sections that permit it.
 inline void DirectionalZone2_4xH(
     uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
     const uint8_t* LIBGAV1_RESTRICT const top_row,
@@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH(
   assert(xstep >= 3);
   const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
 
-  // For steep angles, the source pixels from |left_column| may not fit in a
-  // 16-byte load for shuffling.
-  // TODO(petersonab): Find a more precise formula for this subject to x.
-  // TODO(johannkoenig): Revisit this for |width| == 4.
-  const int max_shuffle_height =
-      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
-
   // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
   int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
 
@@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH(
   // blocks that have a mixture of values computed from top or left. The final
   // stage covers blocks that are only computed from the left.
   if (min_top_only_x > 0) {
-    // Round down to the nearest multiple of 8.
-    // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
-    const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+    // Round down to the nearest multiple of 8 (or 4, if height is 4).
+    const int max_top_only_y =
+        std::min((1 << 6) / xstep, height) & ~(min_height - 1);
     DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
                             upsampled_top);
 
@@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH(
     // All rows from |min_left_only_y| down for this set of columns only need
     // |left_column| to compute.
     const int min_left_only_y = std::min((4 << 6) / xstep, height);
-    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
-    // high. This means that max_shuffle_height is unbounded and xstep_bounds
-    // will overflow in 16 bits. This is prevented by stopping the first
-    // blending loop at min_left_only_y for such cases, which means we skip over
-    // the second blending loop as well.
-    const int left_shuffle_stop_y =
-        std::min(max_shuffle_height, min_left_only_y);
     int xstep_bounds = xstep_bounds_base + xstep_y;
     int top_x = -xstep - xstep_y;
 
     // +8 increment is OK because if height is 4 this only goes once.
-    for (; y < left_shuffle_stop_y;
+    for (; y < min_left_only_y;
          y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
       DirectionalZone2FromLeftCol_WxH<4>(
           dst, stride, min_height,
@@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH(
                                    upsample_top_shift);
     }
 
-    // Pick up from the last y-value, using the slower but secure method for
-    // left prediction.
-    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
-    for (; y < min_left_only_y;
-         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
-      DirectionalZone3_WxH<4>(
-          dst, stride, min_height,
-          left_column + ((y - left_base_increment) << upsample_left_shift),
-          base_left_y, -ystep, upsample_left_shift);
-
-      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
-                                   xstep_bounds, top_x, xstep,
-                                   upsample_top_shift);
-    }
     // Loop over y for left_only rows.
+    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
     for (; y < height; y += 8, dst += stride8) {
       DirectionalZone3_WxH<4>(
           dst, stride, min_height,
@@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH(
   }
 }
 
-// Process a multiple of 8 |width|.
-inline void DirectionalZone2_8(
+template <bool shuffle_left_column>
+inline void DirectionalZone2_8xH(
     uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
     const uint8_t* LIBGAV1_RESTRICT const top_row,
-    const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
-    const int height, const int xstep, const int ystep,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const int x, const int left_offset,
+    const int xstep_bounds_base, const int16x8_t left_y,
     const bool upsampled_top, const bool upsampled_left) {
   const int upsample_left_shift = static_cast<int>(upsampled_left);
   const int upsample_top_shift = static_cast<int>(upsampled_top);
 
-  // Helper vector.
-  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
-
   // Loop incrementers for moving by block (8x8). This function handles blocks
   // with height 4 as well. They are calculated in one pass so these variables
   // do not get used.
   const ptrdiff_t stride8 = stride << 3;
   const int xstep8 = xstep << 3;
-  const int ystep8 = ystep << 3;
 
-  // Process Wx4 blocks.
+  // Cover 8x4 case.
   const int min_height = (height == 4) ? 4 : 8;
 
-  // All columns from |min_top_only_x| to the right will only need |top_row| to
-  // compute and can therefore call the Zone1 functions. This assumes |xstep| is
-  // at least 3.
-  assert(xstep >= 3);
-  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, a single y-loop, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  uint8_t* dst_x = dst + x;
+  // Round down to the nearest multiple of 8 (or 4, if height is 4).
+  const int max_top_only_y =
+      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+  DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+                          top_row + (x << upsample_top_shift), -xstep,
+                          upsampled_top);
+
+  if (max_top_only_y == height) return;
+
+  int y = max_top_only_y;
+  dst_x += stride * y;
+  const int xstep_y = xstep * y;
+
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute. Round up to the nearest 8.
+  const int min_left_only_y =
+      Align(std::min(((x + 8) << 6) / xstep, height), 8);
+  int xstep_bounds = xstep_bounds_base + xstep_y;
+  int top_x = -xstep - xstep_y;
+
+  const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+  for (; y < min_left_only_y;
+       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+    if (shuffle_left_column) {
+      DirectionalZone2FromLeftCol_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsample_left_shift);
+    } else {
+      DirectionalZone3_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep, upsample_left_shift);
+    }
+
+    DirectionalZone1Blend_WxH<8>(
+        dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+        xstep_bounds, top_x, xstep, upsample_top_shift);
+  }
+
+  // Loop over y for left_only rows.
+  for (; y < height; y += 8, dst_x += stride8) {
+    DirectionalZone3_WxH<8>(
+        dst_x, stride, min_height,
+        left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+        -ystep, upsample_left_shift);
+  }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_WxH(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  const int ystep8 = ystep << 3;
 
   // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
   int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -677,90 +696,43 @@ inline void DirectionalZone2_8(
   // left_y vector omits the portion which is covered under the left_column
   // offset. Following values need the full ystep as a relative offset.
   const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
   int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
 
+  // For ystep > 90, at least two sets of 8 columns can be fully computed from
+  // top_row only.
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+  // Analysis finds that, for most angles (ystep < 132), all segments that use
+  // both top_row and left_column can compute from left_column using byte
+  // shuffles from a single vector. For steeper angles, the shuffle is also
+  // fully reliable when x >= 32.
+  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
   // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
   // The first stage, before the first y-loop, covers blocks that are only
   // computed from the top row. The second stage, comprising two y-loops, covers
   // blocks that have a mixture of values computed from top or left. The final
   // stage covers blocks that are only computed from the left.
   int x = 0;
-  // For steep angles, the source pixels from |left_column| may not fit in a
-  // 16-byte load for shuffling. |d| represents the number of pixels that can
-  // fit in one contiguous vector when stepping by |ystep|. For a given x
-  // position, the left column values can be obtained by VTBL as long as the
-  // values at row[x + d] and beyond come from the top row. However, this does
-  // not guarantee that the vector will also contain all of the values needed
-  // from top row.
-  const int d = 16 / ((ystep >> 6) + 1);
+  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
+                                xstep, ystep, x, left_offset, xstep_bounds_base,
+                                left_y, upsampled_top, upsampled_left);
+  }
   for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
            xstep_bounds_base -= (8 << 6),
            left_y = vsubq_s16(left_y, increment_left8),
            left_offset -= left_base_increment8) {
-    uint8_t* dst_x = dst + x;
-    const int max_shuffle_height =
-        std::min(((x + d) << 6) / xstep, height) & ~7;
-    // Round down to the nearest multiple of 8.
-    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
-    DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
-                            top_row + (x << upsample_top_shift), -xstep,
-                            upsampled_top);
-
-    if (max_top_only_y == height) continue;
-
-    int y = max_top_only_y;
-    dst_x += stride * y;
-    const int xstep_y = xstep * y;
-
-    // All rows from |min_left_only_y| down for this set of columns only need
-    // |left_column| to compute.
-    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
-    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
-    // high. This means that max_shuffle_height is unbounded and xstep_bounds
-    // will overflow in 16 bits. This is prevented by stopping the first
-    // blending loop at min_left_only_y for such cases, which means we skip over
-    // the second blending loop as well.
-    const int left_shuffle_stop_y =
-        std::min(max_shuffle_height, min_left_only_y);
-    int xstep_bounds = xstep_bounds_base + xstep_y;
-    int top_x = -xstep - xstep_y;
-
-    for (; y < left_shuffle_stop_y;
-         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
-      DirectionalZone2FromLeftCol_WxH<8>(
-          dst_x, stride, min_height,
-          left_column + ((left_offset + y) << upsample_left_shift), left_y,
-          upsample_left_shift);
-
-      DirectionalZone1Blend_WxH<8>(
-          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
-          xstep_bounds, top_x, xstep, upsample_top_shift);
-    }
-
-    // Pick up from the last y-value, using the slower but secure method for
-    // left prediction.
-    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
-    for (; y < min_left_only_y;
-         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
-      DirectionalZone3_WxH<8>(
-          dst_x, stride, min_height,
-          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
-          -ystep, upsample_left_shift);
-
-      DirectionalZone1Blend_WxH<8>(
-          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
-          xstep_bounds, top_x, xstep, upsample_top_shift);
-    }
-    // Loop over y for left_only rows.
-    for (; y < height; y += 8, dst_x += stride8) {
-      DirectionalZone3_WxH<8>(
-          dst_x, stride, min_height,
-          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
-          -ystep, upsample_left_shift);
-    }
+    DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
+                               ystep, x, left_offset, xstep_bounds_base, left_y,
+                               upsampled_top, upsampled_left);
   }
-  // TODO(johannkoenig): May be able to remove this branch.
   if (x < width) {
+    const int upsample_top_shift = static_cast<int>(upsampled_top);
     DirectionalZone1_WxH(dst + x, stride, width - x, height,
                          top_row + (x << upsample_top_shift), -xstep,
                          upsampled_top);
@@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON(
     DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
                          upsampled_top, upsampled_left);
   } else {
-    DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
-                       ystep, upsampled_top, upsampled_left);
+    DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
+                         ystep, upsampled_top, upsampled_left);
   }
 }
 
@@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
   return vrshrq_n_u16(sum, 5 /*log2(32)*/);
 }
 
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+                                const uint16x8_t a_weight,
+                                const uint16x8_t b_weight) {
+  const uint16x8_t a_product = vmulq_u16(a, a_weight);
+  const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
 // Each element of |dest| contains values associated with one weight value.
 inline void LoadEdgeVals(uint16x4x2_t* dest,
                          const uint16_t* LIBGAV1_RESTRICT const source,
@@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest,
   }
 }
 
+// For Wx4 blocks, load the source for 2 columns. The source for the second
+// column is held in the high half of each vector.
+inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
+                            const uint16_t* LIBGAV1_RESTRICT const source_low,
+                            const uint16_t* LIBGAV1_RESTRICT const source_high,
+                            const bool upsampled) {
+  if (upsampled) {
+    const uint16x4x2_t low = vld2_u16(source_low);
+    const uint16x4x2_t high = vld2_u16(source_high);
+    dest->val[0] = vcombine_u16(low.val[0], high.val[0]);
+    dest->val[1] = vcombine_u16(low.val[1], high.val[1]);
+  } else {
+    dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
+    dest->val[1] =
+        vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
+  }
+}
+
 template <bool upsampled>
 inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
                                  const ptrdiff_t stride, const int height,
@@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
 }
 
 template <bool upsampled>
+inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const uint16x8_t inverter = vdupq_n_u16(32);
+
+  uint16x8x2_t sampled_left_col;
+  // Compute two columns at a time, then transpose for storage.
+  uint16x8_t result[4];
+
+  // The low half of pre-transpose vectors contains columns 0 through 3.
+  int left_y_low = base_left_y + ystep;
+  int left_offset_low = left_y_low >> index_scale_bits;
+  int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  // The high half of pre-transpose vectors contains columns 4 through 7.
+  int left_y_high = left_y_low + (ystep << 2);
+  int left_offset_high = left_y_high >> index_scale_bits;
+  int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  uint16x8_t weights_0 =
+      vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  uint16x8_t weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  left_y_low += ystep;
+  left_offset_low = left_y_low >> index_scale_bits;
+  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  left_y_high += ystep;
+  left_offset_high = left_y_high >> index_scale_bits;
+  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  left_y_high += ystep;
+  left_y_low += ystep;
+  left_offset_low = left_y_low >> index_scale_bits;
+  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  left_offset_high = left_y_high >> index_scale_bits;
+  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  left_y_low += ystep;
+  left_offset_low = left_y_low >> index_scale_bits;
+  shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+  left_y_high += ystep;
+  left_offset_high = left_y_high >> index_scale_bits;
+  shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+  weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+  weights_1 = vsubq_u16(inverter, weights_0);
+  LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+                  &left[left_offset_high], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            weights_1, weights_0);
+
+  Transpose4x8(result);
+  Store8(dst, result[0]);
+  dst += stride;
+  Store8(dst, result[1]);
+  dst += stride;
+  Store8(dst, result[2]);
+  dst += stride;
+  Store8(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x8_t result[4];
+
+  int left_y = base_left_y + ystep;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  uint16x8x2_t sampled_left_col;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose4x8(result);
+  Store4(dst, vget_low_u16(result[0]));
+  dst += stride;
+  Store4(dst, vget_low_u16(result[1]));
+  dst += stride;
+  Store4(dst, vget_low_u16(result[2]));
+  dst += stride;
+  Store4(dst, vget_low_u16(result[3]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[0]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[1]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[2]));
+  dst += stride;
+  Store4(dst, vget_high_u16(result[3]));
+}
+
+template <bool upsampled>
 inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
                                  const ptrdiff_t stride, const int height,
                                  const uint16_t* LIBGAV1_RESTRICT const left,
                                  const int ystep) {
+  assert(height == 8 || height == 16);
   const int upsample_shift = static_cast<int>(upsampled);
-  int y = 0;
-  do {
-    DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+  DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
+  if (height == 16) {
+    dest += stride << 3;
+    DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift),
                                     ystep);
-    dest += 4 * stride;
-    y += 4;
-  } while (y < height);
+  }
 }
 
 template <bool upsampled>
@@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
                                  const ptrdiff_t stride, const int width,
                                  const uint16_t* LIBGAV1_RESTRICT const left,
                                  const int ystep) {
-  int x = 0;
-  int base_left_y = 0;
-  do {
-    // TODO(petersonab): Establish 8x4 transpose to reserve this function for
-    // 8x4 and 16x4.
-    DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
-                                    base_left_y);
-    base_left_y += 4 * ystep;
-    x += 4;
-  } while (x < width);
+  assert(width <= 16);
+  if (width == 4) {
+    DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
+    return;
+  }
+  DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
+  if (width == 16) {
+    const int base_left_y = ystep << 3;
+    DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
+                                    ystep, base_left_y);
+  }
 }
 
 template <bool upsampled>
@@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON(
     } while (y != 0);
     return;
   }
-  if (width == 4) {
+  if (height == 4) {
     if (upsampled_left) {
-      DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+      DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
     } else {
-      DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+      DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
     }
-  } else if (height == 4) {
+  } else if (width == 4) {
     if (upsampled_left) {
-      DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+      DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
     } else {
-      DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+      DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
     }
   } else {
     if (upsampled_left) {
@@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
   return vrshr_n_u16(sum, 5 /*log2(32)*/);
 }
 
-// Blend two values based on weight pairs that each sum to 32.
-inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
-                                const uint16x8_t a_weight,
-                                const uint16x8_t b_weight) {
-  const uint16x8_t a_product = vmulq_u16(a, a_weight);
-  const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
-
-  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
-}
-
 // Because the source values "move backwards" as the row index increases, the
 // indices derived from ystep are generally negative in localized functions.
 // This is accommodated by making sure the relative indices are within [-15, 0]
@@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH(
   } while (++y < height);
 }
 
-inline void DirectionalZone2FromLeftCol_8xH(
-    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+inline void DirectionalZone2FromLeftCol_8x8(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
     const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
     const bool upsampled) {
   const int upsample_shift = static_cast<int>(upsampled);
@@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
       vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
   const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
 
-  int y = 0;
-  do {
+  for (int y = 0; y < 8; ++y) {
     uint16x8_t src_left, src_right;
     LoadStepwise(
         left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
@@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
 
     Store8(dst, val);
     dst += stride;
-  } while (++y < height);
+  }
 }
 
 template <bool upsampled>
@@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH(
 }
 
 template <bool upsampled>
-inline void DirectionalZone1Blend_8xH(
-    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+inline void DirectionalZone1Blend_8x8(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
     const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
     const int xstep) {
   const int upsample_shift = static_cast<int>(upsampled);
@@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH(
   const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
 
   uint16x8x2_t top_vals;
-  int y = height;
-  do {
+  for (int y = 0; y < 8; ++y) {
     const uint16_t* const src = top_row + (top_x >> scale_bits_x);
     LoadEdgeVals(&top_vals, src, upsampled);
 
@@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH(
     dest += stride;
     zone_bounds += xstep;
     top_x -= xstep;
-  } while (--y != 0);
+  }
 }
 
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices
-// that do not correspond to angle derivatives are left at zero.
-// Notably, in cases with upsampling, the shuffle-invalid height is always
-// greater than the prediction height (which is 8 at maximum).
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
-    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
 // 7.11.2.4 (8) 90 < angle > 180
 // The strategy for these functions (4xH and 8+xH) is to know how many blocks
 // can be processed with just pixels from |top_ptr|, then handle mixed blocks,
@@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH(
   // computed from the top row. The second stage, comprising two y-loops, covers
   // blocks that have a mixture of values computed from top or left. The final
   // stage covers blocks that are only computed from the left.
-  // Round down to the nearest multiple of 8.
-  // TODO(petersonab): Check if rounding to the nearest 4 is okay.
-  const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+  // Round down to the nearest multiple of 8 (or 4, if height is 4).
+  const int max_top_only_y =
+      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
   DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
                                       stride >> 1, max_top_only_y, top_row,
                                       -xstep);
@@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH(
                                              xstep_bounds, top_x, xstep);
   }
 
-  // Loop over y for left-only rows.
-  for (; y < height; y += 8, dst += stride8) {
-    // Angle expected by Zone3 is flipped about the 180 degree vector, which
-    // is the x-axis.
+  // Left-only section. |height| - |y| is assumed equivalent to:
+  // (y == 0) && (height == 4)
+  if (height - y == 4) {
+    DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
+    return;
+  }
+  if (y < height) {
     DirectionalZone3_4xH<upsampled_left>(
-        dst, stride, min_height, left_column + (y << upsample_left_shift),
+        dst, stride, height - y, left_column + (y << upsample_left_shift),
         -ystep);
   }
 }
@@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4(
   }
 }
 
+template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8xH(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const int x, const int left_offset,
+    const int xstep_bounds_base, const int16x8_t left_y) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Loop incrementers for moving by block (8x8). This function handles blocks
+  // with height 4 as well. They are calculated in one pass so these variables
+  // do not get used.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  uint8_t* dst_x = dst + x * sizeof(uint16_t);
+  // Round down to the nearest multiple of 8.
+  const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+  DirectionalZone1_WxH<upsampled_top>(
+      reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+      top_row + (x << upsample_top_shift), -xstep);
+
+  if (max_top_only_y == height) return;
+
+  int y = max_top_only_y;
+  dst_x += stride * y;
+  const int xstep_y = xstep * y;
+
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute. Round up to the nearest 8.
+  const int min_left_only_y =
+      Align(std::min(((x + 8) << 6) / xstep, height), 8);
+  int xstep_bounds = xstep_bounds_base + xstep_y;
+  int top_x = -xstep - xstep_y;
+
+  for (; y < min_left_only_y;
+       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+    if (shuffle_left_column) {
+      DirectionalZone2FromLeftCol_8x8(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsampled_left);
+    } else {
+      DirectionalZone3_8x8<upsampled_left>(
+          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+          -ystep * x);
+    }
+
+    DirectionalZone1Blend_8x8<upsampled_top>(
+        dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
+        xstep);
+  }
+
+  // Loop over y for left_only rows.
+  for (; y < height; y += 8, dst_x += stride8) {
+    DirectionalZone3_8x8<upsampled_left>(
+        dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+        -ystep * x);
+  }
+}
+
 // Process a multiple of 8 |width|.
 template <bool upsampled_top, bool upsampled_left>
-inline void DirectionalZone2_8(
+inline void DirectionalZone2_NEON(
     uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
     const uint16_t* LIBGAV1_RESTRICT const top_row,
     const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
@@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8(
         dst, stride, top_row, left_column, width, xstep, ystep);
     return;
   }
-  const int upsample_left_shift = static_cast<int>(upsampled_left);
   const int upsample_top_shift = static_cast<int>(upsampled_top);
 
   // Helper vector.
   const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
 
-  // Loop increments for moving by block (8x8). This function handles blocks
-  // with height 4 as well. They are calculated in one pass so these variables
-  // do not get used.
-  const ptrdiff_t stride8 = stride << 3;
-  const int xstep8 = xstep << 3;
   const int ystep8 = ystep << 3;
 
   // All columns from |min_top_only_x| to the right will only need |top_row| to
   // compute and can therefore call the Zone1 functions. This assumes |xstep| is
   // at least 3.
   assert(xstep >= 3);
-  const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
-  // For steep angles, the source pixels from |left_column| may not fit in a
-  // 16-byte load for shuffling.
-  // TODO(petersonab): Find a more precise formula for this subject to x.
-  const int max_shuffle_height =
-      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+  const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
+  // Analysis finds that, for most angles (ystep < 132), all segments that use
+  // both top_row and left_column can compute from left_column using byte
+  // shuffles from a single vector. For steeper angles, the shuffle is also
+  // fully reliable when x >= 32.
+  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
 
   // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
   int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8(
   int16x8_t left_y =
       vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
 
-  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
-  // The first stage, before the first y-loop, covers blocks that are only
-  // computed from the top row. The second stage, comprising two y-loops, covers
-  // blocks that have a mixture of values computed from top or left. The final
-  // stage covers blocks that are only computed from the left.
   int x = 0;
+  for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_bounds_base, left_y);
+  }
   for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
            xstep_bounds_base -= (8 << 6),
            left_y = vsubq_s16(left_y, increment_left8),
            left_offset -= left_base_increment8) {
-    uint8_t* dst_x = dst + x * sizeof(uint16_t);
-
-    // Round down to the nearest multiple of 8.
-    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
-    DirectionalZone1_WxH<upsampled_top>(
-        reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
-        top_row + (x << upsample_top_shift), -xstep);
-
-    if (max_top_only_y == height) continue;
-
-    int y = max_top_only_y;
-    dst_x += stride * y;
-    const int xstep_y = xstep * y;
-
-    // All rows from |min_left_only_y| down for this set of columns only need
-    // |left_column| to compute.
-    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
-    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
-    // high. This means that max_shuffle_height is unbounded and xstep_bounds
-    // will overflow in 16 bits. This is prevented by stopping the first
-    // blending loop at min_left_only_y for such cases, which means we skip over
-    // the second blending loop as well.
-    const int left_shuffle_stop_y =
-        std::min(max_shuffle_height, min_left_only_y);
-    int xstep_bounds = xstep_bounds_base + xstep_y;
-    int top_x = -xstep - xstep_y;
-
-    for (; y < left_shuffle_stop_y;
-         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
-      DirectionalZone2FromLeftCol_8xH(
-          dst_x, stride, 8,
-          left_column + ((left_offset + y) << upsample_left_shift), left_y,
-          upsample_left_shift);
-
-      DirectionalZone1Blend_8xH<upsampled_top>(
-          dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
-          top_x, xstep);
-    }
-
-    // Pick up from the last y-value, using the slower but secure method for
-    // left prediction.
-    for (; y < min_left_only_y;
-         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
-      DirectionalZone3_8x8<upsampled_left>(
-          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
-          -ystep * x);
-
-      DirectionalZone1Blend_8xH<upsampled_top>(
-          dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
-          top_x, xstep);
-    }
-    // Loop over y for left_only rows.
-    for (; y < height; y += 8, dst_x += stride8) {
-      DirectionalZone3_8x8<upsampled_left>(
-          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
-          -ystep * x);
-    }
+    DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_bounds_base, left_y);
   }
   // Reached |min_top_only_x|.
   if (x < width) {
@@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON(
   }
   if (upsampled_top) {
     if (upsampled_left) {
-      DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width,
-                                     height, xstep, ystep);
+      DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
+                                        height, xstep, ystep);
     } else {
-      DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width,
-                                      height, xstep, ystep);
+      DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
+                                         height, xstep, ystep);
     }
   } else if (upsampled_left) {
-    DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width,
-                                    height, xstep, ystep);
+    DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
+                                       height, xstep, ystep);
   } else {
-    DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width,
-                                     height, xstep, ystep);
+    DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
+                                        height, xstep, ystep);
   }
 }
 
diff --git a/libgav1/src/dsp/arm/intrapred_neon.cc b/libgav1/src/dsp/arm/intrapred_neon.cc
index cd47a22..d1adbdf 100644
--- a/libgav1/src/dsp/arm/intrapred_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_neon.cc
@@ -407,13 +407,9 @@ inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
 inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
                              const uint16x8_t top_left_dist_low,
                              const uint16x8_t top_left_dist_high) {
-  // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of
-  // using movl(x_dist).
-  const uint8x8_t x_le_top_left_low =
-      vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low));
-  const uint8x8_t x_le_top_left_high =
-      vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high));
-  return vcombine_u8(x_le_top_left_low, x_le_top_left_high);
+  const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+                                               vqmovn_u16(top_left_dist_high));
+  return vcleq_u8(x_dist, top_left_dist);
 }
 
 // Select the closest values and collect them.
diff --git a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
index bcda131..d6c1450 100644
--- a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
@@ -31,7 +31,6 @@
 
 namespace libgav1 {
 namespace dsp {
-
 namespace low_bitdepth {
 namespace {
 
@@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = {
 #include "src/dsp/smooth_weights.inc"
 };
 
-inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
-                                const uint16x4_t weighted_left,
-                                const uint16x4_t weighted_bl,
-                                const uint16x4_t weighted_tr) {
-  const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left);
-  const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr);
-  const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1);
-  return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1);
+// 256 - v = vneg_s8(v)
+inline uint8x8_t NegateS8(const uint8x8_t v) {
+  return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
 }
 
 template <int height>
-inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-                           const void* LIBGAV1_RESTRICT const top_row,
-                           const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
   constexpr int width = 4;
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const auto* const left = static_cast<const uint8_t*>(left_column);
@@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
   const uint8x8_t top_right_v = vdup_n_u8(top_right);
   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
   const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
-  // 256 - weights = vneg_s8(weights)
-  const uint8x8_t scaled_weights_x =
-      vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+  const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
 
   for (int y = 0; y < height; ++y) {
     const uint8x8_t left_v = vdup_n_u8(left[y]);
     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
-    const uint8x8_t scaled_weights_y =
-        vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v)));
-    const uint16x4_t weighted_bl =
-        vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v));
-
-    const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v));
-    const uint16x4_t weighted_left =
-        vget_low_u16(vmull_u8(weights_x_v, left_v));
-    const uint16x4_t weighted_tr =
-        vget_low_u16(vmull_u8(scaled_weights_x, top_right_v));
-    const uint16x4_t result =
-        CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
-
-    StoreLo4(dst, vmovn_u16(vcombine_u16(result, result)));
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+    const uint16x8_t weighted_top_bl =
+        vmlal_u8(weighted_bl, weights_y_v, top_v);
+    const uint16x8_t weighted_left_tr =
+        vmlal_u8(weighted_tr, weights_x_v, left_v);
+    // Maximum value of each parameter: 0xFF00
+    const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+    const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);
+
+    StoreLo4(dst, result);
     dst += stride;
   }
 }
 
-inline uint8x8_t CalculatePred(const uint16x8_t weighted_top,
-                               const uint16x8_t weighted_left,
-                               const uint16x8_t weighted_bl,
-                               const uint16x8_t weighted_tr) {
-  // Maximum value: 0xFF00
-  const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl);
-  // Maximum value: 0xFF00
-  const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr);
-  const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1);
-  return vrshrn_n_u16(pred_2, kSmoothWeightScale);
+inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
+                               const uint16x8_t weighted_left_tr) {
+  // Maximum value of each parameter: 0xFF00
+  const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+  return vrshrn_n_u16(avg, kSmoothWeightScale);
+}
+
+inline uint8x8_t CalculateWeightsAndPred(
+    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+    const uint8x8_t bottom_left, const uint8x8_t weights_x,
+    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+  const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+  const uint16x8_t weighted_top_bl =
+      vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+  const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+  return CalculatePred(weighted_top_bl, weighted_left_tr);
 }
 
 template <int height>
-inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-                           const void* LIBGAV1_RESTRICT const top_row,
-                           const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
   constexpr int width = 8;
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const auto* const left = static_cast<const uint8_t*>(left_column);
@@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
   const uint8x8_t top_right_v = vdup_n_u8(top_right);
   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
   const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
-  // 256 - weights = vneg_s8(weights)
-  const uint8x8_t scaled_weights_x =
-      vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+  const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
 
   for (int y = 0; y < height; ++y) {
     const uint8x8_t left_v = vdup_n_u8(left[y]);
     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
-    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
-    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
-
-    const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
-    const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v);
-    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
     const uint8x8_t result =
-        CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
+        CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v,
+                                weights_x_v, scaled_weights_y, weights_y_v);
 
     vst1_u8(dst, result);
     dst += stride;
@@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred(
     const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
     const uint8x8_t weights_y, const uint8x16_t weights_x,
     const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
-  const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+  const uint16x8_t weighted_top_bl_low =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
   const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
-  const uint16x8_t weighted_tr_low =
-      vmull_u8(vget_low_u8(scaled_weights_x), top_right);
-  const uint8x8_t result_low = CalculatePred(
-      weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low);
+  const uint16x8_t weighted_left_tr_low =
+      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+  const uint8x8_t result_low =
+      CalculatePred(weighted_top_bl_low, weighted_left_tr_low);
 
-  const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+  const uint16x8_t weighted_top_bl_high =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
   const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
-  const uint16x8_t weighted_tr_high =
-      vmull_u8(vget_high_u8(scaled_weights_x), top_right);
-  const uint8x8_t result_high = CalculatePred(
-      weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high);
+  const uint16x8_t weighted_left_tr_high =
+      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+  const uint8x8_t result_high =
+      CalculatePred(weighted_top_bl_high, weighted_left_tr_high);
 
   return vcombine_u8(result_low, result_high);
 }
 
+// 256 - v = vneg_s8(v)
+inline uint8x16_t NegateS8(const uint8x16_t v) {
+  return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
 template <int width, int height>
-inline void Smooth16PlusxN_NEON(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                         const void* LIBGAV1_RESTRICT const top_row,
+                         const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const auto* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t top_right = top[width - 1];
@@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON(
   const uint8x8_t top_right_v = vdup_n_u8(top_right);
   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
 
-  // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop.
-  // This currently has a performance slope similar to Paeth so it does not
-  // appear to be register bound for arm64.
   uint8x16_t weights_x_v[4];
   weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
   if (width > 16) {
@@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON(
   }
 
   uint8x16_t scaled_weights_x[4];
-  scaled_weights_x[0] =
-      vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0])));
+  scaled_weights_x[0] = NegateS8(weights_x_v[0]);
   if (width > 16) {
-    scaled_weights_x[1] =
-        vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1])));
+    scaled_weights_x[1] = NegateS8(weights_x_v[1]);
     if (width == 64) {
-      scaled_weights_x[2] =
-          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2])));
-      scaled_weights_x[3] =
-          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3])));
+      scaled_weights_x[2] = NegateS8(weights_x_v[2]);
+      scaled_weights_x[3] = NegateS8(weights_x_v[3]);
     }
   }
 
   for (int y = 0; y < height; ++y) {
     const uint8x8_t left_v = vdup_n_u8(left[y]);
     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
-    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
     const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
 
     vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
@@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON(
 }
 
 template <int width, int height>
-inline void SmoothVertical4Or8xN_NEON(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+                               ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint8_t*>(top_row);
   const auto* const left = static_cast<const uint8_t*>(left_column);
   const uint8_t bottom_left = left[height - 1];
@@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON(
 
   for (int y = 0; y < height; ++y) {
     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
-    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
 
     const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
-    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
-    const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl);
-    const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+    const uint16x8_t weighted_top_bl =
+        vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);
+    const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale);
 
     if (width == 4) {
-      StoreLo4(dst, pred_scaled);
+      StoreLo4(dst, pred);
     } else {  // width == 8
-      vst1_u8(dst, pred_scaled);
+      vst1_u8(dst, pred);
     }
     dst += stride;
   }
@@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON(
 inline uint8x16_t CalculateVerticalWeightsAndPred(
     const uint8x16_t top, const uint8x8_t weights_y,
     const uint16x8_t weighted_bl) {
-  const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
-  const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
-  const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl);
-  const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl);
+  const uint16x8_t pred_low =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t pred_high =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
   const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
   const uint8x8_t pred_scaled_high =
       vrshrn_n_u16(pred_high, kSmoothWeightScale);
@@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred(
 }
 
 template <int width, int height>
-inline void SmoothVertical16PlusxN_NEON(
+void SmoothVertical16PlusxN_NEON(
     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
     const void* LIBGAV1_RESTRICT const top_row,
     const void* LIBGAV1_RESTRICT const left_column) {
@@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON(
 
   for (int y = 0; y < height; ++y) {
     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
-    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+    const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
     const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
 
     const uint8x16_t pred_0 =
@@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON(
 }
 
 template <int width, int height>
-inline void SmoothHorizontal4Or8xN_NEON(
+void SmoothHorizontal4Or8xN_NEON(
     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
     const void* LIBGAV1_RESTRICT const top_row,
     const void* LIBGAV1_RESTRICT const left_column) {
@@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON(
   const uint8x8_t top_right_v = vdup_n_u8(top_right);
   // Over-reads for 4xN but still within the array.
   const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
-  // 256 - weights = vneg_s8(weights)
-  const uint8x8_t scaled_weights_x =
-      vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+  const uint8x8_t scaled_weights_x = NegateS8(weights_x);
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
 
   for (int y = 0; y < height; ++y) {
     const uint8x8_t left_v = vdup_n_u8(left[y]);
-
-    const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
-    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
-    const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
-    const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+    const uint16x8_t weighted_left_tr =
+        vmlal_u8(weighted_tr, weights_x, left_v);
+    const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale);
 
     if (width == 4) {
-      StoreLo4(dst, pred_scaled);
+      StoreLo4(dst, pred);
     } else {  // width == 8
-      vst1_u8(dst, pred_scaled);
+      vst1_u8(dst, pred);
     }
     dst += stride;
   }
@@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred(
     const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
     const uint8x16_t scaled_weights_x) {
   const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
-  const uint16x8_t weighted_tr_low =
-      vmull_u8(vget_low_u8(scaled_weights_x), top_right);
-  const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
-  const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+  const uint16x8_t weighted_left_tr_low =
+      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+  const uint8x8_t pred_scaled_low =
+      vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale);
 
   const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
-  const uint16x8_t weighted_tr_high =
-      vmull_u8(vget_high_u8(scaled_weights_x), top_right);
-  const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+  const uint16x8_t weighted_left_tr_high =
+      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
   const uint8x8_t pred_scaled_high =
-      vrshrn_n_u16(pred_high, kSmoothWeightScale);
+      vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale);
 
   return vcombine_u8(pred_scaled_low, pred_scaled_high);
 }
 
 template <int width, int height>
-inline void SmoothHorizontal16PlusxN_NEON(
+void SmoothHorizontal16PlusxN_NEON(
     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
     const void* LIBGAV1_RESTRICT const top_row,
     const void* LIBGAV1_RESTRICT const left_column) {
@@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON(
   }
 
   uint8x16_t scaled_weights_x[4];
-  scaled_weights_x[0] =
-      vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+  scaled_weights_x[0] = NegateS8(weights_x[0]);
   if (width > 16) {
-    scaled_weights_x[1] =
-        vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+    scaled_weights_x[1] = NegateS8(weights_x[1]);
     if (width == 64) {
-      scaled_weights_x[2] =
-          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
-      scaled_weights_x[3] =
-          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+      scaled_weights_x[2] = NegateS8(weights_x[2]);
+      scaled_weights_x[3] = NegateS8(weights_x[3]);
     }
   }
 
@@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = {
 #include "src/dsp/smooth_weights.inc"
 };
 
+// 256 - v = vneg_s8(v) per byte; each 16-bit lane must be in [1, 255].
+inline uint16x4_t NegateS8(const uint16x4_t v) {
+  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
 template <int height>
-inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-                           const void* LIBGAV1_RESTRICT const top_row,
-                           const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t top_right = top[3];
@@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
   const uint16x4_t top_v = vld1_u16(top);
   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
   const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
-  const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v);
-
-  // Weighted top right doesn't change with each row.
+  const uint16x4_t scaled_weights_x = NegateS8(weights_x_v);
   const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
 
   for (int y = 0; y < height; ++y) {
@@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
 
 // Common code between 8xH and [16|32|64]xH.
 inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
-                           const uint32x4_t& weighted_corners_low,
-                           const uint32x4_t& weighted_corners_high,
-                           const uint16x4x2_t& top_vals,
-                           const uint16x4x2_t& weights_x, const uint16_t left_y,
+                           const uint32x4_t weighted_corners_low,
+                           const uint32x4_t weighted_corners_high,
+                           const uint16x4x2_t top_vals,
+                           const uint16x4x2_t weights_x, const uint16_t left_y,
                            const uint16_t weight_y) {
   // Each variable in the running summation is named for the last item to be
   // accumulated.
@@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
 }
 
 template <int height>
-inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-                           const void* LIBGAV1_RESTRICT const top_row,
-                           const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t top_right = top[7];
@@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
   const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
                                   vld1_u16(kSmoothWeights + 8)};
-  // Weighted top right doesn't change with each row.
   const uint32x4_t weighted_tr_low =
-      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+      vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
   const uint32x4_t weighted_tr_high =
-      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+      vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
 
   for (int y = 0; y < height; ++y) {
-    // |weighted_bl| is invariant across the row.
     const uint32x4_t weighted_bl =
         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
     const uint32x4_t weighted_corners_low =
@@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
 
 // For width 16 and above.
 template <int width, int height>
-inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-                           const void* LIBGAV1_RESTRICT const top_row,
-                           const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t top_right = top[width - 1];
@@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
 
   auto* dst = static_cast<uint8_t*>(dest);
 
-  const uint16x4_t weight_scaling = vdup_n_u16(256);
   // Precompute weighted values that don't vary with |y|.
   uint32x4_t weighted_tr_low[width >> 3];
   uint32x4_t weighted_tr_high[width >> 3];
   for (int i = 0; i < width >> 3; ++i) {
     const int x = i << 3;
     const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
-    weighted_tr_low[i] =
-        vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right);
+    weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right);
     const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
-    weighted_tr_high[i] =
-        vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right);
+    weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right);
   }
 
   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
   for (int y = 0; y < height; ++y) {
-    // |weighted_bl| is invariant across the row.
     const uint32x4_t weighted_bl =
         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
     auto* dst_x = reinterpret_cast<uint16_t*>(dst);
@@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
 }
 
 template <int height>
-inline void SmoothVertical4xH_NEON(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t bottom_left = left[height - 1];
@@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON(
 }
 
 template <int height>
-inline void SmoothVertical8xH_NEON(
-    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t bottom_left = left[height - 1];
@@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON(
 
   for (int y = 0; y < height; ++y) {
     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
-    // |weighted_bl| is invariant across the row.
     const uint32x4_t weighted_bl =
         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
 
@@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON(
 
 // For width 16 and above.
 template <int width, int height>
-inline void SmoothVerticalWxH_NEON(
-    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t stride,
+                            const void* LIBGAV1_RESTRICT const top_row,
+                            const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t bottom_left = left[height - 1];
@@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON(
 
   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
   for (int y = 0; y < height; ++y) {
-    // |weighted_bl| is invariant across the row.
     const uint32x4_t weighted_bl =
         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
 
@@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON(
 }
 
 template <int height>
-inline void SmoothHorizontal4xH_NEON(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t top_right = top[3];
@@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON(
   auto* dst = static_cast<uint8_t*>(dest);
 
   const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
-  const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x);
+  const uint16x4_t scaled_weights_x = NegateS8(weights_x);
 
   const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
   for (int y = 0; y < height; ++y) {
@@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON(
 }
 
 template <int height>
-inline void SmoothHorizontal8xH_NEON(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t top_right = top[7];
@@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON(
                                   vld1_u16(kSmoothWeights + 8)};
 
   const uint32x4_t weighted_tr_low =
-      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+      vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
   const uint32x4_t weighted_tr_high =
-      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+      vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
 
   for (int y = 0; y < height; ++y) {
     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
@@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON(
 
 // For width 16 and above.
 template <int width, int height>
-inline void SmoothHorizontalWxH_NEON(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
   const auto* const top = static_cast<const uint16_t*>(top_row);
   const auto* const left = static_cast<const uint16_t*>(left_column);
   const uint16_t top_right = top[width - 1];
 
   auto* dst = static_cast<uint8_t*>(dest);
 
-  const uint16x4_t weight_scaling = vdup_n_u16(256);
-
   uint16x4_t weights_x_low[width >> 3];
   uint16x4_t weights_x_high[width >> 3];
   uint32x4_t weighted_tr_low[width >> 3];
@@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON(
   for (int i = 0; i < width >> 3; ++i) {
     const int x = i << 3;
     weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
-    weighted_tr_low[i] =
-        vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right);
+    weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right);
     weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
-    weighted_tr_high[i] =
-        vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right);
+    weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right);
   }
 
   for (int y = 0; y < height; ++y) {
@@ -1141,6 +1113,7 @@ void Init10bpp() {
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
       SmoothHorizontalWxH_NEON<64, 64>;
 }
+
 }  // namespace
 }  // namespace high_bitdepth
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc
index 617accc..e6f0d9d 100644
--- a/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc
+++ b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -282,9 +282,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
   const int32x4_t max = vdupq_n_s32((1 << range) - 1);
   int32x4_t s[4], x[4];
 
-  LoadSrc<4>(dst, step, 0, x);
   if (is_row) {
-    Transpose4x4(x, x);
+    assert(step == 4);
+    int32x4x4_t y = vld4q_s32(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+  } else {
+    LoadSrc<4>(dst, step, 0, x);
   }
 
   // stage 1.
@@ -301,9 +304,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
     for (auto& i : s) {
       i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
     }
-    Transpose4x4(s, s);
+    int32x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+    vst4q_s32(dst, y);
+  } else {
+    StoreDst<4>(dst, step, 0, s);
   }
-  StoreDst<4>(dst, step, 0, s);
 }
 
 template <ButterflyRotationFunc butterfly_rotation,
@@ -937,9 +943,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
   int32x4_t s[8];
   int32x4_t x[4];
 
-  LoadSrc<4>(dst, step, 0, x);
   if (is_row) {
-    Transpose4x4(x, x);
+    assert(step == 4);
+    int32x4x4_t y = vld4q_s32(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+  } else {
+    LoadSrc<4>(dst, step, 0, x);
   }
 
   // stage 1.
@@ -981,9 +990,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
     x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
     x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
     x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
-    Transpose4x4(x, x);
+    int32x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+    vst4q_s32(dst, y);
+  } else {
+    StoreDst<4>(dst, step, 0, x);
   }
-  StoreDst<4>(dst, step, 0, x);
 }
 
 alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
diff --git a/libgav1/src/dsp/arm/inverse_transform_neon.cc b/libgav1/src/dsp/arm/inverse_transform_neon.cc
index 1c2e111..452f14a 100644
--- a/libgav1/src/dsp/arm/inverse_transform_neon.cc
+++ b/libgav1/src/dsp/arm/inverse_transform_neon.cc
@@ -41,50 +41,6 @@ namespace {
 
 //------------------------------------------------------------------------------
 
-// TODO(slavarnway): Move transpose functions to transpose_neon.h or
-// common_neon.h.
-
-LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4],
-                                        int16x8_t out[4]) {
-  // Swap 16 bit elements. Goes from:
-  // a0: 00 01 02 03
-  // a1: 10 11 12 13
-  // a2: 20 21 22 23
-  // a3: 30 31 32 33
-  // to:
-  // b0.val[0]: 00 10 02 12
-  // b0.val[1]: 01 11 03 13
-  // b1.val[0]: 20 30 22 32
-  // b1.val[1]: 21 31 23 33
-  const int16x4_t a0 = vget_low_s16(in[0]);
-  const int16x4_t a1 = vget_low_s16(in[1]);
-  const int16x4_t a2 = vget_low_s16(in[2]);
-  const int16x4_t a3 = vget_low_s16(in[3]);
-
-  const int16x4x2_t b0 = vtrn_s16(a0, a1);
-  const int16x4x2_t b1 = vtrn_s16(a2, a3);
-
-  // Swap 32 bit elements resulting in:
-  // c0.val[0]: 00 10 20 30 04 14 24 34
-  // c0.val[1]: 02 12 22 32 06 16 26 36
-  // c1.val[0]: 01 11 21 31 05 15 25 35
-  // c1.val[1]: 03 13 23 33 07 17 27 37
-  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
-                                  vreinterpret_s32_s16(b1.val[0]));
-  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
-                                  vreinterpret_s32_s16(b1.val[1]));
-
-  const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]);
-  const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]);
-  const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]);
-  const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]);
-
-  out[0] = vcombine_s16(d0, d0);
-  out[1] = vcombine_s16(d1, d1);
-  out[2] = vcombine_s16(d2, d2);
-  out[3] = vcombine_s16(d3, d3);
-}
-
 // Note this is only used in the final stage of Dct32/64 and Adst16 as the in
 // place version causes additional stack usage with clang.
 LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
@@ -580,16 +536,19 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
 
   if (stage_is_rectangular) {
     if (transpose) {
-      int16x8_t input[8];
-      LoadSrc<8, 8>(dst, step, 0, input);
-      Transpose4x8To8x4(input, x);
+      assert(step == 4);
+      int16x8x4_t y = vld4q_s16(dst);
+      for (int i = 0; i < 4; ++i) x[i] = y.val[i];
     } else {
       LoadSrc<16, 4>(dst, step, 0, x);
     }
   } else {
-    LoadSrc<8, 4>(dst, step, 0, x);
     if (transpose) {
-      Transpose4x4(x, x);
+      assert(step == 4);
+      int16x4x4_t y = vld4_s16(dst);
+      for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]);
+    } else {
+      LoadSrc<8, 4>(dst, step, 0, x);
     }
   }
 
@@ -604,17 +563,20 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
 
   if (stage_is_rectangular) {
     if (transpose) {
-      int16x8_t output[8];
-      Transpose8x4To4x8(s, output);
-      StoreDst<8, 8>(dst, step, 0, output);
+      int16x8x4_t y;
+      for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+      vst4q_s16(dst, y);
     } else {
       StoreDst<16, 4>(dst, step, 0, s);
     }
   } else {
     if (transpose) {
-      Transpose4x4(s, s);
+      int16x4x4_t y;
+      for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]);
+      vst4_s16(dst, y);
+    } else {
+      StoreDst<8, 4>(dst, step, 0, s);
     }
-    StoreDst<8, 4>(dst, step, 0, s);
   }
 }
 
@@ -1204,45 +1166,41 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
 
 //------------------------------------------------------------------------------
 // Asymmetric Discrete Sine Transforms (ADST).
-template <bool stage_is_rectangular>
+
 LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
                                       bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  int32x4_t s[8];
-  int16x8_t x[4];
+  int32x4_t s[7];
+  int16x4_t x[4];
 
-  if (stage_is_rectangular) {
-    if (transpose) {
-      int16x8_t input[8];
-      LoadSrc<8, 8>(dst, step, 0, input);
-      Transpose4x8To8x4(input, x);
-    } else {
-      LoadSrc<16, 4>(dst, step, 0, x);
-    }
+  if (transpose) {
+    assert(step == 4);
+    int16x4x4_t y = vld4_s16(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
   } else {
-    LoadSrc<8, 4>(dst, step, 0, x);
-    if (transpose) {
-      Transpose4x4(x, x);
-    }
+    x[0] = vld1_s16(dst);
+    x[1] = vld1_s16(dst + 1 * step);
+    x[2] = vld1_s16(dst + 2 * step);
+    x[3] = vld1_s16(dst + 3 * step);
   }
 
   // stage 1.
-  s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]);
-  s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]);
+  s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]);
+  s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]);
 
   // stage 2.
-  const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2]));
-  const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3]));
+  const int32x4_t a7 = vsubl_s16(x[0], x[2]);
+  const int32x4_t b7 = vaddw_s16(a7, x[3]);
 
   // stage 3.
-  s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]);
-  s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]);
+  s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]);
+  s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]);
   // s[0] = s[0] + s[3]
-  s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]);
+  s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]);
   // s[1] = s[1] - s[4]
-  s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]);
+  s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]);
 
-  s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]);
+  s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]);
   s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
 
   // stage 4.
@@ -1259,24 +1217,20 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
   const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
   const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
 
-  x[0] = vcombine_s16(dst_0, dst_0);
-  x[1] = vcombine_s16(dst_1, dst_1);
-  x[2] = vcombine_s16(dst_2, dst_2);
-  x[3] = vcombine_s16(dst_3, dst_3);
+  x[0] = dst_0;
+  x[1] = dst_1;
+  x[2] = dst_2;
+  x[3] = dst_3;
 
-  if (stage_is_rectangular) {
-    if (transpose) {
-      int16x8_t output[8];
-      Transpose8x4To4x8(x, output);
-      StoreDst<8, 8>(dst, step, 0, output);
-    } else {
-      StoreDst<16, 4>(dst, step, 0, x);
-    }
+  if (transpose) {
+    int16x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+    vst4_s16(dst, y);
   } else {
-    if (transpose) {
-      Transpose4x4(x, x);
-    }
-    StoreDst<8, 4>(dst, step, 0, x);
+    vst1_s16(dst, x[0]);
+    vst1_s16(dst + 1 * step, x[1]);
+    vst1_s16(dst + 2 * step, x[2]);
+    vst1_s16(dst + 3 * step, x[3]);
   }
 }
 
@@ -2705,7 +2659,7 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
   int i = adjusted_tx_height;
   auto* data = src;
   do {
-    Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true);
+    Adst4_NEON(data, /*step=*/4, /*transpose=*/true);
     data += 16;
     i -= 4;
   } while (i != 0);
@@ -2732,7 +2686,7 @@ void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
     int i = tx_width;
     auto* data = src;
     do {
-      Adst4_NEON<false>(data, tx_width, /*transpose=*/false);
+      Adst4_NEON(data, tx_width, /*transpose=*/false);
       data += 4;
       i -= 4;
     } while (i != 0);
diff --git a/libgav1/src/dsp/arm/loop_filter_10bit_neon.cc b/libgav1/src/dsp/arm/loop_filter_10bit_neon.cc
new file mode 100644
index 0000000..a9dd98f
--- /dev/null
+++ b/libgav1/src/dsp/arm/loop_filter_10bit_neon.cc
@@ -0,0 +1,1218 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
+  const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+  return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
+                                 const uint16x4_t q0, const uint16x4_t q1,
+                                 const uint16_t outer_thresh) {
+  const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+  const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+  const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+  const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+  const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+  return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+//   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+//   OuterThreshold()
+inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16x8_t abd_p2p3_q2q3,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterNMasks functions.
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const hev_mask,
+                         uint16x4_t* const needs_filter4_mask) {
+  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  // This includes cases where NeedsFilter4() is not true and so Filter2() will
+  // not be applied.
+  const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+  *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+//   abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+                          const uint16x8_t abd_p0p2_q0q2) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                         const uint16x8_t p0q0, const uint16_t hev_thresh,
+                         const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter6_mask,
+                         uint16x4_t* const is_flat3_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+                                     inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+//   abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+//   abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+                          const uint16x8_t abd_pn1p0_qn1q0,
+                          const uint16x8_t abd_pn2p0_qn2q0) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+  const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                         const uint16x8_t p1q1, const uint16x8_t p0q0,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter8_mask,
+                         uint16x4_t* const is_flat4_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  const uint16x4_t is_flat4 =
+      IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+                   inner_thresh, outer_mask);
+  // |is_flat4_mask| is used to decide where to use the result of Filter8.
+  // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+  // overriding the question of whether to use Filter8. Because Filter4 doesn't
+  // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+  // source value. To be correct, the mask must account for this override.
+  *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+                    const uint16x8_t p1q1, const uint16x4_t hev_mask,
+                    uint16x8_t* const p1q1_result,
+                    uint16x8_t* const p0q0_result) {
+  const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // q0mp0 means "q0 minus p0".
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+  const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int16x4_t p1mq1_saturated =
+      Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+  const int16x4_t hev_option =
+      vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Need to figure out what's going on here because there are some unnecessary
+  // tricks to accommodate 8x8 as smallest 8bpp vector
+
+  // We can not shift with rounding because the clamp comes *before* the shift.
+  // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four =
+      Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t plus_three =
+      Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+  const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+  // a3 = (a1 + 1) >> 1;
+  const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+  const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+  const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // Need to shift the second term or we end up with a2_ma2.
+  const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+  const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+  *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+  *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+  const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+                             vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Offset by 2 uint16_t values to load from first p1 position.
+  auto* dst = static_cast<uint8_t*>(dest) - 4;
+  auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
+  auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
+  auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
+
+  uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+                       vld1_u16(dst_q1)};
+  Transpose4x4(src);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+  // with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  uint16x4_t output[4] = {
+      vget_low_u16(p1q1_output),
+      vget_low_u16(p0q0_output),
+      vget_high_u16(p0q0_output),
+      vget_high_u16(p1q1_output),
+  };
+  Transpose4x4(output);
+
+  vst1_u16(dst_p1, output[0]);
+  vst1_u16(dst_p0, output[1]);
+  vst1_u16(dst_q0, output[2]);
+  vst1_u16(dst_q1, output[3]);
+}
+
+inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                    const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
+                    uint16x8_t* const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions.
+  // The formula is regrouped to allow 3 doubling operations to be combined.
+  //
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                    ^^^^^^^^^^^
+  uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                                ^^^^^^
+  sum = vaddq_u16(sum, p0q0);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //               ^^^^^
+  sum = vshlq_n_u16(sum, 1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //        ^^^^^^                          ^^^^^^
+  // Should dual issue with the left shift.
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+  sum = vaddq_u16(sum, outer_sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //                ^^^^^^^^
+  const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //        ^^^^^^^^
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+
+  const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
+                             vld1_u16(dst_p0), vld1_u16(dst_q0),
+                             vld1_u16(dst_q1), vld1_u16(dst_q2)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // Filter6() does not apply, but Filter4() applies to one or more values.
+    p0q0_output = p0q0;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Left side of the filter window.
+  auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Overread by 2 values. These overreads become the high halves of src_raw[2]
+  // and src_raw[3] after transpose.
+  uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                           vld1q_u16(dst_3)};
+  Transpose4x8(src_raw);
+  // p2, p1, p0, q0, q1, q2
+  const uint16x4_t src[6] = {
+      vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
+      vget_low_u16(src_raw[2]),  vget_low_u16(src_raw[3]),
+      vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+  };
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+  // output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // Filter6() does not apply, but Filter4() applies to one or more values.
+    p0q0_output = p0q0;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x4_t output[4] = {
+      vget_low_u16(p1q1_output),
+      vget_low_u16(p0q0_output),
+      vget_high_u16(p0q0_output),
+      vget_high_u16(p1q1_output),
+  };
+  Transpose4x4(output);
+
+  // dst_n starts at p2, so adjust to p1.
+  vst1_u16(dst_0 + 1, output[0]);
+  vst1_u16(dst_1 + 1, output[1]);
+  vst1_u16(dst_2 + 1, output[2]);
+  vst1_u16(dst_3 + 1, output[3]);
+}
+
+inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                    const uint16x8_t p1q1, const uint16x8_t p0q0,
+                    uint16x8_t* const p2q2_output,
+                    uint16x8_t* const p1q1_output,
+                    uint16x8_t* const p0q0_output) {
+  // Sum p2 and q2 output from opposite directions.
+  // The formula is regrouped to allow 2 doubling operations to be combined.
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                    ^^^^^^^^^^^
+  const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //               ^^^^^
+  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+  // Add two other terms to make dual issue with shift more likely.
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^
+  const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                 ^^^^^^^^^^^^^
+  sum = vaddq_u16(sum, p01q01);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //        ^^^^^^
+  sum = vaddq_u16(sum, p3q3);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                               ^^^^^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p2q2_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+  sum = vsubq_u16(sum, p23q23);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);  // |stride| is in bytes.
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  const uint16x4_t src[8] = {
+      vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
+      vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
+
+  // Adjust thresholds to bitdepth: scale by 4 (<< 2) for 10-bit samples.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+  const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+  const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+  const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Duplicate each 4-lane mask into both halves for the packed p/q selects.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // On aarch64 the early return above guarantees |needs_filter_mask| is
+  // nonzero here. |is_flat4_mask| selects Filter8 over Filter4 per lane, so
+  // when it is all zero no Filter8 output would be selected and Filter8() is
+  // skipped.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() does not apply, but Filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));  // p3/q3 are never stored.
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {  // Reverses lanes 0-3.
+  return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // As loaded, src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+  // To get desired pairs after transpose, one half should be reversed.
+  uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                       vld1q_u16(dst_3)};
+
+  // src[0] = p0q0
+  // src[1] = p1q1
+  // src[2] = p2q2
+  // src[3] = p3q3
+  LoopFilterTranspose4x8(src);
+
+  // Adjust thresholds to bitdepth: scale by 4 (<< 2) for 10-bit samples.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+      vget_high_u16(src[1]), outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = src[0];
+  const uint16x8_t p1q1 = src[1];
+  const uint16x8_t p2q2 = src[2];
+  const uint16x8_t p3q3 = src[3];
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Duplicate each 4-lane mask into both halves for the packed p/q selects.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // On aarch64 the early return above guarantees |needs_filter_mask| is
+  // nonzero here. |is_flat4_mask| selects Filter8 over Filter4 per lane, so
+  // when it is all zero no Filter8 output would be selected and Filter8() is
+  // skipped.
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() does not apply, but Filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
+  // After transpose, |output| will contain rows of the form:
+  // p0 p1 p2 p3 q0 q1 q2 q3
+  Transpose4x8(output);
+
+  // Reverse p values to produce original order:
+  // p3 p2 p1 p0 q0 q1 q2 q3
+  vst1q_u16(dst_0, ReverseLowHalf(output[0]));
+  vst1q_u16(dst_1, ReverseLowHalf(output[1]));
+  vst1q_u16(dst_2, ReverseLowHalf(output[2]));
+  vst1q_u16(dst_3, ReverseLowHalf(output[3]));
+}
+
+inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
+                     const uint16x8_t p4q4, const uint16x8_t p3q3,
+                     const uint16x8_t p2q2, const uint16x8_t p1q1,
+                     const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
+                     uint16x8_t* const p4q4_output,
+                     uint16x8_t* const p3q3_output,
+                     uint16x8_t* const p2q2_output,
+                     uint16x8_t* const p1q1_output,
+                     uint16x8_t* const p0q0_output) {
+  // Sum p5 and q5 output from opposite directions.
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                                                     ^^^^^^^^
+  const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                 ^^^^^^^^^^^^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                               ^^^^^^^^^^^^^^^^^^^
+  uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+  sum = vaddq_u16(sum, p6q6_x7);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                       ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                     ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                 ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //           ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                           ^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //      ^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p5q5_output = vrshrq_n_u16(sum, 4);  // Weights sum to 16: rounded >> 4.
+
+  // Convert to p4 and q4 output (p5/q5 below denote the previous sums):
+  // p4 = p5 - (2 * p6) + p3 + q1
+  // q4 = q5 - (2 * q6) + q3 + p1
+  sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+  *p4q4_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p3 and q3 output:
+  // p3 = p4 - p6 - p5 + p2 + q2
+  // q3 = q4 - q6 - q5 + q2 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+  *p3q3_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p2 and q2 output:
+  // p2 = p3 - p6 - p4 + p1 + q3
+  // q2 = q3 - q6 - q4 + q1 + p3
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+  const uint16x8_t q3p3 = Transpose64(p3q3);
+  sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+  const uint16x8_t q4p4 = Transpose64(p4q4);
+  sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+  const uint16x8_t q5p5 = Transpose64(p5q5);
+  sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+                       int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);  // |stride| is in bytes.
+  auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
+  auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
+  auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+  auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
+  auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
+  auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
+
+  const uint16x4_t src[14] = {
+      vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+      vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+      vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+      vld1_u16(dst_q5), vld1_u16(dst_q6)};
+
+  // Adjust thresholds to bitdepth: scale by 4 (<< 2) for 10-bit samples.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+  const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+  const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+  const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+  const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+  const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+  // Mask to choose between the outputs of Filter8 and Filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // Filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                             vabdq_u16(p0q0, p6q6)));
+  // Duplicate each 4-lane mask into both halves for the packed p/q selects.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // On aarch64 the early return above guarantees |needs_filter_mask| is
+  // nonzero here. |is_flat4_mask| selects Filter8 over Filter4 per lane, so
+  // when it is all zero no Filter8 output would be selected and Filter8() is
+  // skipped.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+
+  vst1_u16(dst_p5, vget_low_u16(p5q5_output));  // p6/q6 are never stored.
+  vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+  vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+  vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+  vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+  vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
+inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
+  uint16x8x2_t acdb;
+#if defined(__aarch64__)
+  // val[0] = {a, c}: low half of |ab| followed by low half of |cd|.
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+  // val[1] = {d, b}: high half of |cd| followed by high half of |ab|.
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+  // val[0] = {a, c}: overwrite the high half of |ab| with the low half of |cd|.
+  acdb.val[0] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+                     vreinterpretq_u64_u16(ab), 1));
+  // val[1] = {d, b}: overwrite the low half of |ab| with the high half of |cd|.
+  acdb.val[1] = vreinterpretq_u16_u64(
+      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+                     vreinterpretq_u64_u16(ab), 0));
+#endif  // defined(__aarch64__)
+  return acdb;
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                     int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Low halves:  p7 p6 p5 p4
+  // High halves: p3 p2 p1 p0
+  uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                         vld1q_u16(dst_3)};
+  // p7 will be the low half of src_p[0]. Not used until the end.
+  Transpose4x8(src_p);
+
+  // Low halves:  q0 q1 q2 q3
+  // High halves: q4 q5 q6 q7
+  uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+                         vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
+  // q7 will be the high half of src_q[3]. Not used until the end.
+  Transpose4x8(src_q);
+
+  // Adjust thresholds to bitdepth: scale by 4 (<< 2) for 10-bit samples.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+      vget_low_u16(src_q[1]), outer_thresh);
+  const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+  const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+  const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+  const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 =
+      vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+  const uint16x8_t p5q5 =
+      vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+  const uint16x8_t p6q6 =
+      vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+  const uint16x8_t p7q7 =
+      vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+  // Mask to choose between the outputs of Filter8 and Filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // Filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                             vabdq_u16(p0q0, p6q6)));
+  // Duplicate each 4-lane mask into both halves for the packed p/q selects.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // On aarch64 the early return above guarantees |needs_filter_mask| is
+  // nonzero here. |is_flat4_mask| selects Filter8 over Filter4 per lane, so
+  // when it is all zero no Filter8 output would be selected and Filter8() is
+  // skipped.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+      // more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+  // To get the correctly ordered rows from the transpose, we need:
+  // p7p3 p6p2 p5p1 p4p0
+  // q0q4 q1q5 q2q6 q3q7
+  const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+  const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+  const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+  const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+  uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+                            p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+  Transpose4x8(output_p);
+  uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+                            p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+  Transpose4x8(output_q);
+
+  // Store each row as its 8 p-side values followed by its 8 q-side values.
+  // p7, p6, q6 and q7 are written back with their unfiltered input values.
+  vst1q_u16(dst_0, output_p[0]);
+  vst1q_u16(dst_0 + 8, output_q[0]);
+  vst1q_u16(dst_1, output_p[1]);
+  vst1q_u16(dst_1 + 8, output_q[1]);
+  vst1q_u16(dst_2, output_p[2]);
+  vst1q_u16(dst_2 + 8, output_q[2]);
+  vst1q_u16(dst_3, output_p[3]);
+  vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+}  // namespace
+
+void LoopFilterInit10bpp_NEON() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);  // Registers all eight 10-bit loop filters below.
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Horizontal4_NEON;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Horizontal6_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Horizontal8_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Horizontal14_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Vertical14_NEON;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit10bpp_NEON() {}  // No-op: NEON or 10-bit support disabled.
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/arm/loop_filter_neon.cc b/libgav1/src/dsp/arm/loop_filter_neon.cc
index 8c03928..a8b236d 100644
--- a/libgav1/src/dsp/arm/loop_filter_neon.cc
+++ b/libgav1/src/dsp/arm/loop_filter_neon.cc
@@ -29,7 +29,6 @@
 
 namespace libgav1 {
 namespace dsp {
-namespace low_bitdepth {
 namespace {
 
 // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
@@ -149,10 +148,6 @@ void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
   needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter4_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -209,10 +204,6 @@ void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
   needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter4_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -346,10 +337,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter6_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -420,10 +407,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter6_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -600,10 +583,6 @@ void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter8_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -679,10 +658,6 @@ void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter8_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -863,10 +838,6 @@ void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter8_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -1031,10 +1002,6 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter8_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -1158,7 +1125,9 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
   vst1q_u8(dst, output_3);
 }
 
-void Init8bpp() {
+}  // namespace
+
+void LoopFilterInit_NEON() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
   dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
@@ -1178,1267 +1147,6 @@ void Init8bpp() {
   dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
       Vertical14_NEON;
 }
-}  // namespace
-}  // namespace low_bitdepth
-
-#if LIBGAV1_MAX_BITDEPTH >= 10
-namespace high_bitdepth {
-namespace {
-
-// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
-inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
-  const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
-  return vorr_u16(vget_low_u16(a), vget_high_u16(a));
-}
-
-// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
-inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
-                                 const uint16x4_t q0, const uint16x4_t q1,
-                                 const uint16_t outer_thresh) {
-  const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
-  const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
-  const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
-  const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
-  const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
-  return vcle_u16(sum, vdup_n_u16(outer_thresh));
-}
-
-// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
-//   OuterThreshold()
-inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
-                               const uint16_t inner_thresh,
-                               const uint16x4_t outer_mask) {
-  const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
-  const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
-  return vand_u16(inner_mask, outer_mask);
-}
-
-// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
-//   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
-//   OuterThreshold()
-inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
-                               const uint16x8_t abd_p1p2_q1q2,
-                               const uint16_t inner_thresh,
-                               const uint16x4_t outer_mask) {
-  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
-  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
-  const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
-  return vand_u16(inner_mask, outer_mask);
-}
-
-// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
-//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
-//   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
-//   OuterThreshold()
-inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
-                               const uint16x8_t abd_p1p2_q1q2,
-                               const uint16x8_t abd_p2p3_q2q3,
-                               const uint16_t inner_thresh,
-                               const uint16x4_t outer_mask) {
-  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
-  const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
-  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
-  const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
-  return vand_u16(inner_mask, outer_mask);
-}
-
-// -----------------------------------------------------------------------------
-// FilterNMasks functions.
-
-inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
-                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
-                         const uint16_t inner_thresh,
-                         uint16x4_t* const hev_mask,
-                         uint16x4_t* const needs_filter4_mask) {
-  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
-  // This includes cases where NeedsFilter4() is not true and so Filter2() will
-  // not be applied.
-  const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
-
-  *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
-
-  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
-  *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
-}
-
-// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
-//   abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
-// |flat_thresh| == 4 for 10 bit decode.
-inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
-                          const uint16x8_t abd_p0p2_q0q2) {
-  constexpr int flat_thresh = 1 << 2;
-  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
-  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
-  return vand_u16(vget_low_u16(b), vget_high_u16(b));
-}
-
-inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
-                         const uint16x8_t p0q0, const uint16_t hev_thresh,
-                         const uint16x4_t outer_mask,
-                         const uint16_t inner_thresh,
-                         uint16x4_t* const needs_filter6_mask,
-                         uint16x4_t* const is_flat3_mask,
-                         uint16x4_t* const hev_mask) {
-  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
-  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
-  *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
-  *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
-                                     inner_thresh, outer_mask);
-}
-
-// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
-// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
-//   abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
-//   abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh
-// |flat_thresh| == 4 for 10 bit decode.
-inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
-                          const uint16x8_t abd_pn1p0_qn1q0,
-                          const uint16x8_t abd_pn2p0_qn2q0) {
-  constexpr int flat_thresh = 1 << 2;
-  const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
-  const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
-  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
-  return vand_u16(vget_low_u16(c), vget_high_u16(c));
-}
-
-inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
-                         const uint16x8_t p1q1, const uint16x8_t p0q0,
-                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
-                         const uint16_t inner_thresh,
-                         uint16x4_t* const needs_filter8_mask,
-                         uint16x4_t* const is_flat4_mask,
-                         uint16x4_t* const hev_mask) {
-  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
-  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
-  const uint16x4_t is_flat4 =
-      IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
-  *needs_filter8_mask =
-      NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
-                   inner_thresh, outer_mask);
-  // |is_flat4_mask| is used to decide where to use the result of Filter8.
-  // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
-  // overriding the question of whether to use Filter8. Because Filter4 doesn't
-  // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
-  // source value. To be correct, the mask must account for this override.
-  *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
-}
-
-// -----------------------------------------------------------------------------
-// FilterN functions.
-
-// Calculate Filter4() or Filter2() based on |hev_mask|.
-inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
-                    const uint16x8_t p1q1, const uint16x4_t hev_mask,
-                    uint16x8_t* const p1q1_result,
-                    uint16x8_t* const p0q0_result) {
-  const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
-  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
-  // q0mp0 means "q0 minus p0".
-  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
-  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
-
-  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
-  const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
-  const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
-  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
-  const int16x4_t p1mq1_saturated =
-      Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
-  const int16x4_t hev_option =
-      vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
-
-  const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
-
-  // Need to figure out what's going on here because there are some unnecessary
-  // tricks to accommodate 8x8 as smallest 8bpp vector
-
-  // We can not shift with rounding because the clamp comes *before* the
-  // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 =
-  // Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
-  const int16x4_t plus_four =
-      Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
-  const int16x4_t plus_three =
-      Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
-  const int16x4_t a1 = vshr_n_s16(plus_four, 3);
-  const int16x4_t a2 = vshr_n_s16(plus_three, 3);
-
-  // a3 = (a1 + 1) >> 1;
-  const int16x4_t a3 = vrshr_n_s16(a1, 1);
-
-  const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
-  const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
-
-  // Need to shift the second term or we end up with a2_ma2.
-  const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
-  const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
-  *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
-  *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
-}
-
-void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
-                      int outer_thresh, int inner_thresh, int hev_thresh) {
-  auto* const dst = static_cast<uint8_t*>(dest);
-  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
-  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
-  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
-  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
-
-  const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
-                             vld1_u16(dst_q0), vld1_u16(dst_q1)};
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask =
-      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter4_mask;
-  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
-  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
-  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
-               &needs_filter4_mask);
-
-#if defined(__aarch64__)
-  // This provides a good speedup for the unit test, but may not come up often
-  // enough to warrant it.
-  if (vaddv_u16(needs_filter4_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  const uint64x1_t needs_filter4_mask64 =
-      vreinterpret_u64_u16(needs_filter4_mask);
-  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t needs_filter4_mask_8 =
-      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
-
-  uint16x8_t f_p1q1;
-  uint16x8_t f_p0q0;
-  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
-
-  // Already integrated the Hev mask when calculating the filtered values.
-  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
-
-  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
-  // with |needs_filter4_mask| previously.
-  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
-  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
-
-  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-}
-
-void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
-                    int inner_thresh, int hev_thresh) {
-  // Offset by 2 uint16_t values to load from first p1 position.
-  auto* dst = static_cast<uint8_t*>(dest) - 4;
-  auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
-  auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
-  auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
-  auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
-
-  uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
-                       vld1_u16(dst_q1)};
-  Transpose4x4(src);
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask =
-      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter4_mask;
-  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
-  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
-  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
-               &needs_filter4_mask);
-
-#if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
-  if (vaddv_u16(needs_filter4_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  const uint64x1_t needs_filter4_mask64 =
-      vreinterpret_u64_u16(needs_filter4_mask);
-  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t needs_filter4_mask_8 =
-      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
-
-  uint16x8_t f_p1q1;
-  uint16x8_t f_p0q0;
-  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
-
-  // Already integrated the Hev mask when calculating the filtered values.
-  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
-
-  // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
-  // with |needs_filter4_mask| previously.
-  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
-  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
-
-  uint16x4_t output[4] = {
-      vget_low_u16(p1q1_output),
-      vget_low_u16(p0q0_output),
-      vget_high_u16(p0q0_output),
-      vget_high_u16(p1q1_output),
-  };
-  Transpose4x4(output);
-
-  vst1_u16(dst_p1, output[0]);
-  vst1_u16(dst_p0, output[1]);
-  vst1_u16(dst_q0, output[2]);
-  vst1_u16(dst_q1, output[3]);
-}
-
-inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
-                    const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
-                    uint16x8_t* const p0q0_output) {
-  // Sum p1 and q1 output from opposite directions.
-  // The formula is regrouped to allow 3 doubling operations to be combined.
-  //
-  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
-  //      ^^^^^^^^
-  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
-  //                                 ^^^^^^^^
-  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
-  //                    ^^^^^^^^^^^
-  uint16x8_t sum = vaddq_u16(p2q2, p1q1);
-
-  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
-  //                                ^^^^^^
-  sum = vaddq_u16(sum, p0q0);
-
-  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
-  //               ^^^^^
-  sum = vshlq_n_u16(sum, 1);
-
-  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
-  //        ^^^^^^                          ^^^^^^
-  // Should dual issue with the left shift.
-  const uint16x8_t q0p0 = Transpose64(p0q0);
-  const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
-  sum = vaddq_u16(sum, outer_sum);
-
-  *p1q1_output = vrshrq_n_u16(sum, 3);
-
-  // Convert to p0 and q0 output:
-  // p0 = p1 - (2 * p2) + q0 + q1
-  // q0 = q1 - (2 * q2) + p0 + p1
-  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
-  //                ^^^^^^^^
-  const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
-  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
-  //        ^^^^^^^^
-  sum = vsubq_u16(sum, p2q2_double);
-  const uint16x8_t q1p1 = Transpose64(p1q1);
-  sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
-
-  *p0q0_output = vrshrq_n_u16(sum, 3);
-}
-
-void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
-                      int outer_thresh, int inner_thresh, int hev_thresh) {
-  auto* const dst = static_cast<uint8_t*>(dest);
-  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
-  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
-  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
-  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
-  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
-  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-
-  const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
-                             vld1_u16(dst_p0), vld1_u16(dst_q0),
-                             vld1_u16(dst_q1), vld1_u16(dst_q2)};
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask =
-      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter_mask;
-  uint16x4_t is_flat3_mask;
-  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
-  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
-  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
-  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
-               &needs_filter_mask, &is_flat3_mask, &hev_mask);
-
-#if defined(__aarch64__)
-  if (vaddv_u16(needs_filter_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
-  const uint16x8_t needs_filter_mask_8 =
-      vcombine_u16(needs_filter_mask, needs_filter_mask);
-
-  uint16x8_t f4_p1q1;
-  uint16x8_t f4_p0q0;
-  // ZIP1 p0q0, p1q1 may perform better here.
-  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
-  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-  uint16x8_t p0q0_output, p1q1_output;
-  // Because we did not return after testing |needs_filter_mask| we know it is
-  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
-  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
-  // output is not used.
-  uint16x8_t f6_p1q1, f6_p0q0;
-  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
-  if (vget_lane_u64(need_filter6, 0) == 0) {
-    // Filter6() does not apply, but Filter4() applies to one or more values.
-    p0q0_output = p0q0;
-    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
-  } else {
-    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
-    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
-    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-  }
-
-  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-}
-
-void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
-                    int inner_thresh, int hev_thresh) {
-  // Left side of the filter window.
-  auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
-  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
-  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
-  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
-  // Overread by 2 values. These overreads become the high halves of src_raw[2]
-  // and src_raw[3] after transpose.
-  uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
-                           vld1q_u16(dst_3)};
-  Transpose4x8(src_raw);
-  // p2, p1, p0, q0, q1, q2
-  const uint16x4_t src[6] = {
-      vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
-      vget_low_u16(src_raw[2]),  vget_low_u16(src_raw[3]),
-      vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
-  };
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask =
-      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter_mask;
-  uint16x4_t is_flat3_mask;
-  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
-  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
-  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
-  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
-               &needs_filter_mask, &is_flat3_mask, &hev_mask);
-
-#if defined(__aarch64__)
-  if (vaddv_u16(needs_filter_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
-  const uint16x8_t needs_filter_mask_8 =
-      vcombine_u16(needs_filter_mask, needs_filter_mask);
-
-  uint16x8_t f4_p1q1;
-  uint16x8_t f4_p0q0;
-  // ZIP1 p0q0, p1q1 may perform better here.
-  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
-  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-  uint16x8_t p0q0_output, p1q1_output;
-  // Because we did not return after testing |needs_filter_mask| we know it is
-  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
-  // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
-  // output is not used.
-  uint16x8_t f6_p1q1, f6_p0q0;
-  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
-  if (vget_lane_u64(need_filter6, 0) == 0) {
-    // Filter6() does not apply, but Filter4() applies to one or more values.
-    p0q0_output = p0q0;
-    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
-  } else {
-    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
-    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
-    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-  }
-
-  uint16x4_t output[4] = {
-      vget_low_u16(p1q1_output),
-      vget_low_u16(p0q0_output),
-      vget_high_u16(p0q0_output),
-      vget_high_u16(p1q1_output),
-  };
-  Transpose4x4(output);
-
-  // dst_n starts at p2, so adjust to p1.
-  vst1_u16(dst_0 + 1, output[0]);
-  vst1_u16(dst_1 + 1, output[1]);
-  vst1_u16(dst_2 + 1, output[2]);
-  vst1_u16(dst_3 + 1, output[3]);
-}
-
-inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
-                    const uint16x8_t p1q1, const uint16x8_t p0q0,
-                    uint16x8_t* const p2q2_output,
-                    uint16x8_t* const p1q1_output,
-                    uint16x8_t* const p0q0_output) {
-  // Sum p2 and q2 output from opposite directions.
-  // The formula is regrouped to allow 2 doubling operations to be combined.
-  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
-  //      ^^^^^^^^
-  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
-  //                                ^^^^^^^^
-  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-  //                    ^^^^^^^^^^^
-  const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
-
-  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-  //               ^^^^^
-  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
-
-  // Add two other terms to make dual issue with shift more likely.
-  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-  //                                   ^^^^^^^^^^^
-  const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
-
-  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-  //                                 ^^^^^^^^^^^^^
-  sum = vaddq_u16(sum, p01q01);
-
-  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-  //        ^^^^^^
-  sum = vaddq_u16(sum, p3q3);
-
-  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-  //                                               ^^^^^^
-  const uint16x8_t q0p0 = Transpose64(p0q0);
-  sum = vaddq_u16(sum, q0p0);
-
-  *p2q2_output = vrshrq_n_u16(sum, 3);
-
-  // Convert to p1 and q1 output:
-  // p1 = p2 - p3 - p2 + p1 + q1
-  // q1 = q2 - q3 - q2 + q0 + p1
-  sum = vsubq_u16(sum, p23q23);
-  const uint16x8_t q1p1 = Transpose64(p1q1);
-  sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
-
-  *p1q1_output = vrshrq_n_u16(sum, 3);
-
-  // Convert to p0 and q0 output:
-  // p0 = p1 - p3 - p1 + p0 + q2
-  // q0 = q1 - q3 - q1 + q0 + p2
-  sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
-  const uint16x8_t q2p2 = Transpose64(p2q2);
-  sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
-
-  *p0q0_output = vrshrq_n_u16(sum, 3);
-}
-
-void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
-                      int outer_thresh, int inner_thresh, int hev_thresh) {
-  auto* const dst = static_cast<uint8_t*>(dest);
-  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
-  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
-  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
-  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
-  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
-  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
-  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
-  const uint16x4_t src[8] = {
-      vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
-      vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask =
-      OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter_mask;
-  uint16x4_t is_flat4_mask;
-  const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
-  const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
-  const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
-  const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
-  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
-               &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
-  if (vaddv_u16(needs_filter_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t needs_filter_mask_8 =
-      vcombine_u16(needs_filter_mask, needs_filter_mask);
-
-  uint16x8_t f4_p1q1;
-  uint16x8_t f4_p0q0;
-  // ZIP1 p0q0, p1q1 may perform better here.
-  const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
-  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
-  // Because we did not return after testing |needs_filter_mask| we know it is
-  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
-  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
-  // output is not used.
-  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
-  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
-  if (vget_lane_u64(need_filter8, 0) == 0) {
-    // Filter8() does not apply, but Filter4() applies to one or more values.
-    p2q2_output = p2q2;
-    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
-  } else {
-    const uint16x8_t is_flat4_mask_8 =
-        vcombine_u16(is_flat4_mask, is_flat4_mask);
-    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
-    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
-    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
-    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-  }
-
-  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
-  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
-}
-
-inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
-  return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
-}
-
-void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
-                    int inner_thresh, int hev_thresh) {
-  auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
-  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
-  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
-  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
-  // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
-  // To get desired pairs after transpose, one half should be reversed.
-  uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
-                       vld1q_u16(dst_3)};
-
-  // src[0] = p0q0
-  // src[1] = p1q1
-  // src[2] = p2q2
-  // src[3] = p3q3
-  LoopFilterTranspose4x8(src);
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask = OuterThreshold(
-      vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
-      vget_high_u16(src[1]), outer_thresh);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter_mask;
-  uint16x4_t is_flat4_mask;
-  const uint16x8_t p0q0 = src[0];
-  const uint16x8_t p1q1 = src[1];
-  const uint16x8_t p2q2 = src[2];
-  const uint16x8_t p3q3 = src[3];
-  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
-               &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
-  if (vaddv_u16(needs_filter_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t needs_filter_mask_8 =
-      vcombine_u16(needs_filter_mask, needs_filter_mask);
-
-  uint16x8_t f4_p1q1;
-  uint16x8_t f4_p0q0;
-  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
-  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
-  // Because we did not return after testing |needs_filter_mask| we know it is
-  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
-  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
-  // output is not used.
-  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
-  if (vget_lane_u64(need_filter8, 0) == 0) {
-    // Filter8() does not apply, but Filter4() applies to one or more values.
-    p2q2_output = p2q2;
-    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
-  } else {
-    const uint16x8_t is_flat4_mask_8 =
-        vcombine_u16(is_flat4_mask, is_flat4_mask);
-    uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
-    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
-    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
-    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
-    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-  }
-
-  uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
-  // After transpose, |output| will contain rows of the form:
-  // p0 p1 p2 p3 q0 q1 q2 q3
-  Transpose4x8(output);
-
-  // Reverse p values to produce original order:
-  // p3 p2 p1 p0 q0 q1 q2 q3
-  vst1q_u16(dst_0, ReverseLowHalf(output[0]));
-  vst1q_u16(dst_1, ReverseLowHalf(output[1]));
-  vst1q_u16(dst_2, ReverseLowHalf(output[2]));
-  vst1q_u16(dst_3, ReverseLowHalf(output[3]));
-}
-inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
-                     const uint16x8_t p4q4, const uint16x8_t p3q3,
-                     const uint16x8_t p2q2, const uint16x8_t p1q1,
-                     const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
-                     uint16x8_t* const p4q4_output,
-                     uint16x8_t* const p3q3_output,
-                     uint16x8_t* const p2q2_output,
-                     uint16x8_t* const p1q1_output,
-                     uint16x8_t* const p0q0_output) {
-  // Sum p5 and q5 output from opposite directions.
-  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
-  //      ^^^^^^^^
-  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
-  //                                                     ^^^^^^^^
-  const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
-
-  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
-  //                 ^^^^^^^^^^^^^^^^^^^
-  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
-  //                               ^^^^^^^^^^^^^^^^^^^
-  uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
-  sum = vaddq_u16(sum, p6q6_x7);
-
-  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
-  //                                       ^^^^^^^
-  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
-  //                     ^^^^^^^
-  sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
-
-  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
-  //                                                 ^^^^^^^
-  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
-  //           ^^^^^^^
-  sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
-
-  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
-  //                                                           ^^
-  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
-  //      ^^
-  const uint16x8_t q0p0 = Transpose64(p0q0);
-  sum = vaddq_u16(sum, q0p0);
-
-  *p5q5_output = vrshrq_n_u16(sum, 4);
-
-  // Convert to p4 and q4 output:
-  // p4 = p5 - (2 * p6) + p3 + q1
-  // q4 = q5 - (2 * q6) + q3 + p1
-  sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
-  const uint16x8_t q1p1 = Transpose64(p1q1);
-  sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
-
-  *p4q4_output = vrshrq_n_u16(sum, 4);
-
-  // Convert to p3 and q3 output:
-  // p3 = p4 - p6 - p5 + p2 + q2
-  // q3 = q4 - q6 - q5 + q2 + p2
-  sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
-  const uint16x8_t q2p2 = Transpose64(p2q2);
-  sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
-
-  *p3q3_output = vrshrq_n_u16(sum, 4);
-
-  // Convert to p2 and q2 output:
-  // p2 = p3 - p6 - p4 + p1 + q3
-  // q2 = q3 - q6 - q4 + q1 + p3
-  sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
-  const uint16x8_t q3p3 = Transpose64(p3q3);
-  sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
-
-  *p2q2_output = vrshrq_n_u16(sum, 4);
-
-  // Convert to p1 and q1 output:
-  // p1 = p2 - p6 - p3 + p0 + q4
-  // q1 = q2 - q6 - q3 + q0 + p4
-  sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
-  const uint16x8_t q4p4 = Transpose64(p4q4);
-  sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
-
-  *p1q1_output = vrshrq_n_u16(sum, 4);
-
-  // Convert to p0 and q0 output:
-  // p0 = p1 - p6 - p2 + q0 + q5
-  // q0 = q1 - q6 - q2 + p0 + p5
-  sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
-  const uint16x8_t q5p5 = Transpose64(p5q5);
-  sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
-
-  *p0q0_output = vrshrq_n_u16(sum, 4);
-}
-
-void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
-                       int outer_thresh, int inner_thresh, int hev_thresh) {
-  auto* const dst = static_cast<uint8_t*>(dest);
-  auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
-  auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
-  auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
-  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
-  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
-  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
-  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
-  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
-  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
-  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-  auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
-  auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
-  auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
-
-  const uint16x4_t src[14] = {
-      vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
-      vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
-      vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
-      vld1_u16(dst_q5), vld1_u16(dst_q6)};
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask =
-      OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter_mask;
-  uint16x4_t is_flat4_mask;
-  const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
-  const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
-  const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
-  const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
-  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
-               &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
-  if (vaddv_u16(needs_filter_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-  const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
-  const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
-  const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
-  // Mask to choose between the outputs of Filter8 and Filter14.
-  // As with the derivation of |is_flat4_mask|, the question of whether to use
-  // Filter14 is only raised where |is_flat4_mask| is true.
-  const uint16x4_t is_flat4_outer_mask = vand_u16(
-      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
-                             vabdq_u16(p0q0, p6q6)));
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t needs_filter_mask_8 =
-      vcombine_u16(needs_filter_mask, needs_filter_mask);
-
-  uint16x8_t f4_p1q1;
-  uint16x8_t f4_p0q0;
-  // ZIP1 p0q0, p1q1 may perform better here.
-  const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
-  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
-      p5q5_output;
-  // Because we did not return after testing |needs_filter_mask| we know it is
-  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
-  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
-  // output is not used.
-  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
-  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
-  if (vget_lane_u64(need_filter8, 0) == 0) {
-    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
-    // more values.
-    p5q5_output = p5q5;
-    p4q4_output = p4q4;
-    p3q3_output = p3q3;
-    p2q2_output = p2q2;
-    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
-  } else {
-    const uint16x8_t use_filter8_mask =
-        vcombine_u16(is_flat4_mask, is_flat4_mask);
-    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
-    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
-    if (vget_lane_u64(need_filter14, 0) == 0) {
-      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
-      // more values.
-      p5q5_output = p5q5;
-      p4q4_output = p4q4;
-      p3q3_output = p3q3;
-      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
-      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
-      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
-      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-    } else {
-      // All filters may contribute values to final outputs.
-      const uint16x8_t use_filter14_mask =
-          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
-      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
-      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
-               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
-      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
-      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
-      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
-      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
-      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
-      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
-      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
-      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
-      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
-      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
-      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-    }
-  }
-
-  vst1_u16(dst_p5, vget_low_u16(p5q5_output));
-  vst1_u16(dst_p4, vget_low_u16(p4q4_output));
-  vst1_u16(dst_p3, vget_low_u16(p3q3_output));
-  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
-  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
-  vst1_u16(dst_q3, vget_high_u16(p3q3_output));
-  vst1_u16(dst_q4, vget_high_u16(p4q4_output));
-  vst1_u16(dst_q5, vget_high_u16(p5q5_output));
-}
-
-inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
-  uint16x8x2_t acdb;
-#if defined(__aarch64__)
-  // a[b] <- [c]d
-  acdb.val[0] = vreinterpretq_u16_u64(
-      vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
-  // [a]b <- c[d]
-  acdb.val[1] = vreinterpretq_u16_u64(
-      vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
-#else
-  // a[b] <- [c]d
-  acdb.val[0] = vreinterpretq_u16_u64(
-      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
-                     vreinterpretq_u64_u16(ab), 1));
-  // [a]b <- c[d]
-  acdb.val[1] = vreinterpretq_u16_u64(
-      vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
-                     vreinterpretq_u64_u16(ab), 0));
-#endif  // defined(__aarch64__)
-  return acdb;
-}
-
-void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
-                     int inner_thresh, int hev_thresh) {
-  auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
-  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
-  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
-  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
-  // Low halves:  p7 p6 p5 p4
-  // High halves: p3 p2 p1 p0
-  uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
-                         vld1q_u16(dst_3)};
-  // p7 will be the low half of src_p[0]. Not used until the end.
-  Transpose4x8(src_p);
-
-  // Low halves:  q0 q1 q2 q3
-  // High halves: q4 q5 q6 q7
-  uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
-                         vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
-  // q7 will be the high half of src_q[3]. Not used until the end.
-  Transpose4x8(src_q);
-
-  // Adjust thresholds to bitdepth.
-  outer_thresh <<= 2;
-  inner_thresh <<= 2;
-  hev_thresh <<= 2;
-  const uint16x4_t outer_mask = OuterThreshold(
-      vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
-      vget_low_u16(src_q[1]), outer_thresh);
-  const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
-  const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
-  const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
-  const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
-  uint16x4_t hev_mask;
-  uint16x4_t needs_filter_mask;
-  uint16x4_t is_flat4_mask;
-  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
-               &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
-  if (vaddv_u16(needs_filter_mask) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#else   // !defined(__aarch64__)
-  // This might be faster than vaddv (latency 3) because mov to general register
-  // has latency 2.
-  const uint64x1_t needs_filter_mask64 =
-      vreinterpret_u64_u16(needs_filter_mask);
-  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
-    // None of the values will be filtered.
-    return;
-  }
-#endif  // defined(__aarch64__)
-  const uint16x8_t p4q4 =
-      vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
-  const uint16x8_t p5q5 =
-      vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
-  const uint16x8_t p6q6 =
-      vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
-  const uint16x8_t p7q7 =
-      vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
-  // Mask to choose between the outputs of Filter8 and Filter14.
-  // As with the derivation of |is_flat4_mask|, the question of whether to use
-  // Filter14 is only raised where |is_flat4_mask| is true.
-  const uint16x4_t is_flat4_outer_mask = vand_u16(
-      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
-                             vabdq_u16(p0q0, p6q6)));
-  // Copy the masks to the high bits for packed comparisons later.
-  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-  const uint16x8_t needs_filter_mask_8 =
-      vcombine_u16(needs_filter_mask, needs_filter_mask);
-
-  uint16x8_t f4_p1q1;
-  uint16x8_t f4_p0q0;
-  const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
-  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
-  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
-      p5q5_output;
-  // Because we did not return after testing |needs_filter_mask| we know it is
-  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
-  // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
-  // output is not used.
-  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
-  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
-  if (vget_lane_u64(need_filter8, 0) == 0) {
-    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
-    // more values.
-    p5q5_output = p5q5;
-    p4q4_output = p4q4;
-    p3q3_output = p3q3;
-    p2q2_output = p2q2;
-    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
-    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
-  } else {
-    const uint16x8_t use_filter8_mask =
-        vcombine_u16(is_flat4_mask, is_flat4_mask);
-    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
-    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
-    if (vget_lane_u64(need_filter14, 0) == 0) {
-      // Filter14() does not apply, but Filter8() and Filter4() apply to one or
-      // more values.
-      p5q5_output = p5q5;
-      p4q4_output = p4q4;
-      p3q3_output = p3q3;
-      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
-      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
-      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
-      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-    } else {
-      // All filters may contribute values to final outputs.
-      const uint16x8_t use_filter14_mask =
-          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
-      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
-      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
-               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
-      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
-      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
-      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
-      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
-      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
-      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
-      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
-      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
-      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
-      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
-      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
-      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
-    }
-  }
-  // To get the correctly ordered rows from the transpose, we need:
-  // p7p3 p6p2 p5p1 p4p0
-  // q0q4 q1q5 q2q6 q3q7
-  const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
-  const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
-  const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
-  const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
-  uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
-                            p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
-  Transpose4x8(output_p);
-  uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
-                            p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
-  Transpose4x8(output_q);
-
-  // Reverse p values to produce original order:
-  // p3 p2 p1 p0 q0 q1 q2 q3
-  vst1q_u16(dst_0, output_p[0]);
-  vst1q_u16(dst_0 + 8, output_q[0]);
-  vst1q_u16(dst_1, output_p[1]);
-  vst1q_u16(dst_1 + 8, output_q[1]);
-  vst1q_u16(dst_2, output_p[2]);
-  vst1q_u16(dst_2 + 8, output_q[2]);
-  vst1q_u16(dst_3, output_p[3]);
-  vst1q_u16(dst_3 + 8, output_q[3]);
-}
-
-void Init10bpp() {
-  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
-  assert(dsp != nullptr);
-  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
-      Horizontal4_NEON;
-  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
-  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
-      Horizontal6_NEON;
-  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
-  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
-      Horizontal8_NEON;
-  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
-  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
-      Horizontal14_NEON;
-  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
-      Vertical14_NEON;
-}
-
-}  // namespace
-}  // namespace high_bitdepth
-#endif  // LIBGAV1_MAX_BITDEPTH >= 10
-
-void LoopFilterInit_NEON() {
-  low_bitdepth::Init8bpp();
-#if LIBGAV1_MAX_BITDEPTH >= 10
-  high_bitdepth::Init10bpp();
-#endif
-}
 
 }  // namespace dsp
 }  // namespace libgav1
diff --git a/libgav1/src/dsp/arm/loop_filter_neon.h b/libgav1/src/dsp/arm/loop_filter_neon.h
index 540defc..531cd0d 100644
--- a/libgav1/src/dsp/arm/loop_filter_neon.h
+++ b/libgav1/src/dsp/arm/loop_filter_neon.h
@@ -26,6 +26,7 @@ namespace dsp {
 // Initializes Dsp::loop_filters, see the defines below for specifics. This
 // function is not thread-safe.
 void LoopFilterInit_NEON();
+void LoopFilterInit10bpp_NEON();
 
 }  // namespace dsp
 }  // namespace libgav1
diff --git a/libgav1/src/dsp/arm/loop_restoration_neon.cc b/libgav1/src/dsp/arm/loop_restoration_neon.cc
index 2db137f..cd8552e 100644
--- a/libgav1/src/dsp/arm/loop_restoration_neon.cc
+++ b/libgav1/src/dsp/arm/loop_restoration_neon.cc
@@ -1504,7 +1504,6 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
   const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
   uint8x16_t s[2][2], mas[2];
   uint16x8_t sq[2][4], bs[3];
-  // TODO(b/194217060): Future msan load.
   s[0][0] = vld1q_u8(src0);
   s[1][0] = vld1q_u8(src1);
 
@@ -1599,7 +1598,6 @@ inline void BoxSumFilterPreProcess(
   const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
   uint8x16_t s[2][2], ma3[2][2], ma5[2];
   uint16x8_t sq[2][4], b3[2][3], b5[3];
-  // TODO(b/194217060): Future msan load.
   s[0][0] = vld1q_u8(src0);
   s[1][0] = vld1q_u8(src1);
 
@@ -1801,7 +1799,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
                                   uint8_t* const dst) {
   uint8x16_t s[2], mas[2];
   uint16x8_t sq[4], bs[4];
-  // TODO(b/194217060): Future msan load.
   s[0] = vld1q_u8(src0);
 
   BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0],
@@ -1812,7 +1809,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
     uint16x8_t ma[2];
     uint8x16_t masx[3];
     uint32x4x2_t b[2];
-    // TODO(b/194217060): Future msan load.
     s[1] = vld1q_u8(src0 + x + 16);
 
     BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas,
@@ -1856,7 +1852,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
   const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
   uint8x16_t s[2], mas[2];
   uint16x8_t sq[4], bs[3];
-  // TODO(b/194217060): Future msan load.
   s[0] = vld1q_u8(src0);
 
   BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
@@ -1915,7 +1910,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
   const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
   uint8x16_t s[2][2], ma3[2][2], ma5[2];
   uint16x8_t sq[2][4], b3[2][3], b5[3];
-  // TODO(b/194217060): Future msan load.
   s[0][0] = vld1q_u8(src0);
   s[1][0] = vld1q_u8(src1);
 
@@ -2023,7 +2017,6 @@ inline void BoxFilterLastRow(
   uint8x16_t s[2], ma3[2], ma5[2];
   uint16x8_t sq[4], ma[3], b3[3], b5[3];
   uint32x4x2_t b[3];
-  // TODO(b/194217060): Future msan load.
   s[0] = vld1q_u8(src0);
 
   BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
@@ -2033,7 +2026,6 @@ inline void BoxFilterLastRow(
   do {
     uint8x16_t ma3x[3], ma5x[3];
     int16x8_t p[2];
-    // TODO(b/194217060): Future msan load.
     s[1] = vld1q_u8(src0 + x + 16);
 
     BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3,
diff --git a/libgav1/src/dsp/arm/mask_blend_neon.cc b/libgav1/src/dsp/arm/mask_blend_neon.cc
index 853f949..ecc67f8 100644
--- a/libgav1/src/dsp/arm/mask_blend_neon.cc
+++ b/libgav1/src/dsp/arm/mask_blend_neon.cc
@@ -33,50 +33,40 @@ namespace dsp {
 namespace low_bitdepth {
 namespace {
 
-// TODO(b/150461164): Consider combining with GetInterIntraMask4x2().
-// Compound predictors use int16_t values and need to multiply long because the
-// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by
-// int8_t and accumulate into int32_t instruction.
-template <int subsampling_x, int subsampling_y>
-inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
-  if (subsampling_x == 1) {
-    const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask)));
-    const int16x4_t mask_val1 = vreinterpret_s16_u16(
-        vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y))));
-    int16x8_t final_val;
-    if (subsampling_y == 1) {
-      const int16x4_t next_mask_val0 =
-          vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride)));
-      const int16x4_t next_mask_val1 =
-          vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3)));
-      final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1),
-                            vcombine_s16(next_mask_val0, next_mask_val1));
-    } else {
-      final_val = vreinterpretq_s16_u16(
-          vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1))));
-    }
-    return vrshrq_n_s16(final_val, subsampling_y + 1);
+template <int subsampling_y>
+inline uint8x8_t GetMask4x2(const uint8_t* mask) {
+  if (subsampling_y == 1) {
+    const uint8x16x2_t mask_val = vld2q_u8(mask);
+    const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+    const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz));
+    const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz));
+
+    const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23);
+    // Use a halving add to work around the case where all |mask| values are 64.
+    return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]),
+                               vreinterpret_u8_u32(row_02_13.val[1])),
+                      1);
   }
-  assert(subsampling_y == 0 && subsampling_x == 0);
-  const uint8x8_t mask_val0 = Load4(mask);
-  const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
-  return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+  // subsampling_x == 1
+  const uint8x8x2_t mask_val = vld2_u8(mask);
+  return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
 }
 
 template <int subsampling_x, int subsampling_y>
-inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+inline uint8x8_t GetMask8(const uint8_t* mask) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const uint8x16x2_t mask_val = vld2q_u8(mask);
+    const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+    // Use a halving add to work around the case where all |mask| values are 64.
+    return vrshr_n_u8(
+        vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1);
+  }
   if (subsampling_x == 1) {
-    int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask)));
-    if (subsampling_y == 1) {
-      const int16x8_t next_mask_val =
-          vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride)));
-      mask_val = vaddq_s16(mask_val, next_mask_val);
-    }
-    return vrshrq_n_s16(mask_val, 1 + subsampling_y);
+    const uint8x8x2_t mask_val = vld2_u8(mask);
+    return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
   }
   assert(subsampling_y == 0 && subsampling_x == 0);
-  const uint8x8_t mask_val = vld1_u8(mask);
-  return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+  return vld1_u8(mask);
 }
 
 inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
@@ -109,89 +99,162 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
   StoreHi4(dst + dst_stride, result);
 }
 
-template <int subsampling_x, int subsampling_y>
+template <int subsampling_y>
 inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
                                  const int16_t* LIBGAV1_RESTRICT pred_1,
                                  const uint8_t* LIBGAV1_RESTRICT mask,
-                                 const ptrdiff_t mask_stride,
                                  uint8_t* LIBGAV1_RESTRICT dst,
                                  const ptrdiff_t dst_stride) {
+  constexpr int subsampling_x = 1;
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
   const int16x8_t mask_inverter = vdupq_n_s16(64);
-  int16x8_t pred_mask_0 =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  // Compound predictors use int16_t values and need to multiply long because
+  // the Convolve range * 64 is 20 bits. Unfortunately there is no multiply
+  // int16_t by int8_t and accumulate into int32_t instruction.
+  int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
   int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
   WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                         dst_stride);
-  // TODO(b/150461164): Arm tends to do better with load(val); val += stride
-  // It may be possible to turn this into a loop with a templated height.
-  pred_0 += 4 << 1;
-  pred_1 += 4 << 1;
-  mask += mask_stride << (1 + subsampling_y);
-  dst += dst_stride << 1;
+  pred_0 += 4 << subsampling_x;
+  pred_1 += 4 << subsampling_x;
+  mask += mask_stride << (subsampling_x + subsampling_y);
+  dst += dst_stride << subsampling_x;
 
-  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
   pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
   WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                         dst_stride);
 }
 
-template <int subsampling_x, int subsampling_y>
+template <int subsampling_y>
 inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
                                  const int16_t* LIBGAV1_RESTRICT pred_1,
                                  const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
-                                 const ptrdiff_t mask_stride, const int height,
+                                 const int height,
                                  uint8_t* LIBGAV1_RESTRICT dst,
                                  const ptrdiff_t dst_stride) {
   const uint8_t* mask = mask_ptr;
   if (height == 4) {
-    MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
-        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+    MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride);
     return;
   }
+  constexpr int subsampling_x = 1;
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
   const int16x8_t mask_inverter = vdupq_n_s16(64);
   int y = 0;
   do {
     int16x8_t pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+        vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask)));
     int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
 
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                           dst_stride);
-    pred_0 += 4 << 1;
-    pred_1 += 4 << 1;
-    mask += mask_stride << (1 + subsampling_y);
-    dst += dst_stride << 1;
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
 
-    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
     pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                           dst_stride);
-    pred_0 += 4 << 1;
-    pred_1 += 4 << 1;
-    mask += mask_stride << (1 + subsampling_y);
-    dst += dst_stride << 1;
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
 
-    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
     pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                           dst_stride);
-    pred_0 += 4 << 1;
-    pred_1 += 4 << 1;
-    mask += mask_stride << (1 + subsampling_y);
-    dst += dst_stride << 1;
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
 
-    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
     pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                           dst_stride);
-    pred_0 += 4 << 1;
-    pred_1 += 4 << 1;
-    mask += mask_stride << (1 + subsampling_y);
-    dst += dst_stride << 1;
+    pred_0 += 4 << subsampling_x;
+    pred_1 += 4 << subsampling_x;
+    mask += mask_stride << (subsampling_x + subsampling_y);
+    dst += dst_stride << subsampling_x;
     y += 8;
   } while (y < height);
 }
 
+inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0,
+                              const int16_t* LIBGAV1_RESTRICT pred_1,
+                              const int16x8_t pred_mask_0,
+                              const int16x8_t pred_mask_1) {
+  // First 8 values.
+  const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+  const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+  // int res = (mask_value * prediction_0[x] +
+  //      (64 - mask_value) * prediction_1[x]) >> 6;
+  const int32x4_t weighted_pred_lo =
+      vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+  const int32x4_t weighted_pred_hi =
+      vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+  const int32x4_t weighted_combo_lo = vmlal_s16(
+      weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+  const int32x4_t weighted_combo_hi = vmlal_s16(
+      weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1));
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth8) - 1));
+  return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+                                     vshrn_n_s32(weighted_combo_hi, 6)),
+                        4);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                                 const int height,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = height;
+  do {
+    const int16x8_t pred_mask_0 =
+        ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask));
+    // 64 - mask
+    const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    const uint8x8_t result =
+        CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1);
+    vst1_u8(dst, result);
+    dst += dst_stride;
+    mask += 8 << (subsampling_x + subsampling_y);
+    pred_0 += 8;
+    pred_1 += 8;
+  } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const uint8x16x2_t mask_val0 = vld2q_u8(mask);
+    const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride);
+    const uint8x16_t combined_horz0 =
+        vaddq_u8(mask_val0.val[0], mask_val0.val[1]);
+    const uint8x16_t combined_horz1 =
+        vaddq_u8(mask_val1.val[0], mask_val1.val[1]);
+    // Use a halving add to work around the case where all |mask| values are 64.
+    return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1);
+  }
+  if (subsampling_x == 1) {
+    const uint8x16x2_t mask_val = vld2q_u8(mask);
+    return vrhaddq_u8(mask_val.val[0], mask_val.val[1]);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  return vld1q_u8(mask);
+}
+
 template <int subsampling_x, int subsampling_y>
 inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
                            const void* LIBGAV1_RESTRICT prediction_1,
@@ -204,8 +267,13 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   if (width == 4) {
-    MaskBlending4xH_NEON<subsampling_x, subsampling_y>(
-        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+    MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst,
+                                        dst_stride);
+    return;
+  }
+  if (width == 8) {
+    MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, mask_ptr,
+                                                       height, dst, dst_stride);
     return;
   }
   const uint8_t* mask = mask_ptr;
@@ -214,35 +282,24 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
   do {
     int x = 0;
     do {
-      const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+      const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>(
           mask + (x << subsampling_x), mask_stride);
+      const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0));
+      const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0));
       // 64 - mask
-      const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
-      const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x);
-      const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x);
+      const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo);
+      const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi);
+
       uint8x8_t result;
-      // int res = (mask_value * prediction_0[x] +
-      //      (64 - mask_value) * prediction_1[x]) >> 6;
-      const int32x4_t weighted_pred_0_lo =
-          vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
-      const int32x4_t weighted_pred_0_hi =
-          vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
-      const int32x4_t weighted_combo_lo =
-          vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1),
-                    vget_low_s16(pred_val_1));
-      const int32x4_t weighted_combo_hi =
-          vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
-                    vget_high_s16(pred_val_1));
-
-      // dst[x] = static_cast<Pixel>(
-      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
-      //           (1 << kBitdepth8) - 1));
-      result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
-                                           vshrn_n_s32(weighted_combo_hi, 6)),
-                              4);
+      result =
+          CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo);
       vst1_u8(dst + x, result);
 
-      x += 8;
+      result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi,
+                            pred_mask_1_hi);
+      vst1_u8(dst + x + 8, result);
+
+      x += 16;
     } while (x < width);
     dst += dst_stride;
     pred_0 += width;
@@ -251,63 +308,19 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
   } while (++y < height);
 }
 
-// TODO(b/150461164): This is much faster for inter_intra (input is Pixel
-// values) but regresses compound versions (input is int16_t). Try to
-// consolidate these.
 template <int subsampling_x, int subsampling_y>
 inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
                                       ptrdiff_t mask_stride) {
   if (subsampling_x == 1) {
-    const uint8x8_t mask_val =
-        vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y)));
-    if (subsampling_y == 1) {
-      const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride),
-                                               vld1_u8(mask + mask_stride * 3));
-
-      // Use a saturating add to work around the case where all |mask| values
-      // are 64. Together with the rounding shift this ensures the correct
-      // result.
-      const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val);
-      return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
-    }
-
-    return vrshr_n_u8(mask_val, /*subsampling_x=*/1);
+    return GetMask4x2<subsampling_y>(mask);
   }
-
+  // When using intra or difference weighted masks, the function doesn't use
+  // subsampling, so |mask_stride| may be 4 or 8.
   assert(subsampling_y == 0 && subsampling_x == 0);
   const uint8x8_t mask_val0 = Load4(mask);
-  // TODO(b/150461164): Investigate the source of |mask| and see if the stride
-  // can be removed.
-  // TODO(b/150461164): The unit tests start at 8x8. Does this get run?
   return Load4<1>(mask + mask_stride, mask_val0);
 }
 
-template <int subsampling_x, int subsampling_y>
-inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
-                                    ptrdiff_t mask_stride) {
-  if (subsampling_x == 1) {
-    const uint8x16_t mask_val = vld1q_u8(mask);
-    const uint8x8_t mask_paired =
-        vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val));
-    if (subsampling_y == 1) {
-      const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride);
-      const uint8x8_t next_mask_paired =
-          vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val));
-
-      // Use a saturating add to work around the case where all |mask| values
-      // are 64. Together with the rounding shift this ensures the correct
-      // result.
-      const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired);
-      return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
-    }
-
-    return vrshr_n_u8(mask_paired, /*subsampling_x=*/1);
-  }
-
-  assert(subsampling_y == 0 && subsampling_x == 0);
-  return vld1_u8(mask);
-}
-
 inline void InterIntraWriteMaskBlendLine8bpp4x2(
     const uint8_t* LIBGAV1_RESTRICT const pred_0,
     uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
@@ -374,6 +387,32 @@ inline void InterIntraMaskBlending8bpp4xH_NEON(
 }
 
 template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp8xH_NEON(
+    const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+    const ptrdiff_t mask_stride, const int height) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  int y = height;
+  do {
+    const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask);
+    // 64 - mask
+    const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+    const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+    const uint8x8_t pred_val_1 = vld1_u8(pred_1);
+    const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+    // weighted_pred0 + weighted_pred1
+    const uint16x8_t weighted_combo =
+        vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+    const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+    vst1_u8(pred_1, result);
+
+    pred_0 += 8;
+    pred_1 += pred_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
 inline void InterIntraMaskBlend8bpp_NEON(
     const uint8_t* LIBGAV1_RESTRICT prediction_0,
     uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
@@ -385,30 +424,46 @@ inline void InterIntraMaskBlend8bpp_NEON(
         height);
     return;
   }
+  if (width == 8) {
+    InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>(
+        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+        height);
+    return;
+  }
   const uint8_t* mask = mask_ptr;
-  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x16_t mask_inverter = vdupq_n_u8(64);
   int y = 0;
   do {
     int x = 0;
     do {
-      // TODO(b/150461164): Consider a 16 wide specialization (at least for the
-      // unsampled version) to take advantage of vld1q_u8().
-      const uint8x8_t pred_mask_1 =
-          GetInterIntraMask8<subsampling_x, subsampling_y>(
-              mask + (x << subsampling_x), mask_stride);
+      const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
       // 64 - mask
-      const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
-      const uint8x8_t pred_val_0 = vld1_u8(prediction_0);
+      const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1);
+      const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0);
+      prediction_0 += 8;
+      const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0);
       prediction_0 += 8;
-      const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x);
-      const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+      // Ensure armv7 build combines the load.
+      const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x);
+      const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1);
+      const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1);
+      const uint16x8_t weighted_pred_0_lo =
+          vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo);
       // weighted_pred0 + weighted_pred1
-      const uint16x8_t weighted_combo =
-          vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
-      const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
-      vst1_u8(prediction_1 + x, result);
+      const uint16x8_t weighted_combo_lo =
+          vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo);
+      const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6);
+      vst1_u8(prediction_1 + x, result_lo);
+      const uint16x8_t weighted_pred_0_hi =
+          vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi);
+      // weighted_pred0 + weighted_pred1
+      const uint16x8_t weighted_combo_hi = vmlal_u8(
+          weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi);
+      const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6);
+      vst1_u8(prediction_1 + x + 8, result_hi);
 
-      x += 8;
+      x += 16;
     } while (x < width);
     prediction_1 += prediction_stride_1;
     mask += mask_stride << subsampling_y;
diff --git a/libgav1/src/dsp/arm/obmc_neon.cc b/libgav1/src/dsp/arm/obmc_neon.cc
index 659ed8e..271bbaa 100644
--- a/libgav1/src/dsp/arm/obmc_neon.cc
+++ b/libgav1/src/dsp/arm/obmc_neon.cc
@@ -52,6 +52,17 @@ inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
   StoreLo4(pred, result);
 }
 
+inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred,
+                           const uint8x8_t obmc_pred_val,
+                           const uint8x8_t pred_mask,
+                           const uint8x8_t obmc_pred_mask) {
+  const uint8x8_t pred_val = vld1_u8(pred);
+  const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+  const uint8x8_t result =
+      vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  vst1_u8(pred, result);
+}
+
 inline void OverlapBlendFromLeft2xH_NEON(
     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
@@ -99,24 +110,25 @@ inline void OverlapBlendFromLeft4xH_NEON(
 
 inline void OverlapBlendFromLeft8xH_NEON(
     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
-    const ptrdiff_t obmc_prediction_stride) {
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+  constexpr int obmc_prediction_stride = 8;
   // 64 - mask
   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
   int y = 0;
   do {
-    const uint8x8_t pred_val = vld1_u8(pred);
-    const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
-    const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
-    const uint8x8_t result =
-        vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+    const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+    WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
 
-    vst1_u8(pred, result);
+    WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask,
+                   obmc_pred_mask);
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-  } while (++y != height);
+
+    obmc_pred += obmc_prediction_stride << 1;
+    y += 2;
+  } while (y != height);
 }
 
 void OverlapBlendFromLeft_NEON(
@@ -140,8 +152,7 @@ void OverlapBlendFromLeft_NEON(
     return;
   }
   if (width == 8) {
-    OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
-                                 obmc_prediction_stride);
+    OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred);
     return;
   }
   const uint8x16_t mask_inverter = vdupq_n_u8(64);
@@ -262,26 +273,31 @@ inline void OverlapBlendFromTop4xH_NEON(
 
 inline void OverlapBlendFromTop8xH_NEON(
     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
-    const ptrdiff_t obmc_prediction_stride) {
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
+  constexpr int obmc_prediction_stride = 8;
   const uint8x8_t mask_inverter = vdup_n_u8(64);
   const uint8_t* mask = kObmcMask + height - 2;
   const int compute_height = height - (height >> 2);
   int y = 0;
   do {
-    const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+    const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]);
     // 64 - mask
-    const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
-    const uint8x8_t pred_val = vld1_u8(pred);
-    const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
-    const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
-    const uint8x8_t result =
-        vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+    const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0);
+    const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
 
-    vst1_u8(pred, result);
+    WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0,
+                   obmc_pred_mask0);
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-  } while (++y != compute_height);
+    ++y;
+
+    const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]);
+    // 64 - mask
+    const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1);
+    WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1,
+                   obmc_pred_mask1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+  } while (++y < compute_height);
 }
 
 void OverlapBlendFromTop_NEON(
@@ -301,8 +317,7 @@ void OverlapBlendFromTop_NEON(
   }
 
   if (width == 8) {
-    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
-                                obmc_prediction_stride);
+    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred);
     return;
   }
 
@@ -371,26 +386,23 @@ constexpr uint16_t kObmcMask[62] = {
     33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
     59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
 
-inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred,
-                                const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+inline uint16x4_t BlendObmc2Or4(uint16_t* const pred,
+                                const uint16x4_t obmc_pred_val,
                                 const uint16x4_t pred_mask,
                                 const uint16x4_t obmc_pred_mask) {
-  const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
-  const uint16x4_t obmc_pred_val =
-      vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+  const uint16x4_t pred_val = vld1_u16(pred);
   const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val);
   const uint16x4_t result =
       vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
   return result;
 }
 
-inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
-                             const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred,
+                             const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
                              const uint16x8_t pred_mask,
                              const uint16x8_t obmc_pred_mask) {
-  const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
-  const uint16x8_t obmc_pred_val =
-      vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+  const uint16x8_t pred_val = vld1q_u16(pred);
+  const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
   const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
   const uint16x8_t result =
       vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
@@ -398,27 +410,29 @@ inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
 }
 
 inline void OverlapBlendFromLeft2xH_NEON(
-    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
-    const ptrdiff_t obmc_prediction_stride) {
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+  constexpr int obmc_prediction_stride = 2;
   const uint16x4_t mask_inverter = vdup_n_u16(64);
   // Second two lanes unused.
   const uint16x4_t pred_mask = vld1_u16(kObmcMask);
   const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
   int y = 0;
   do {
+    const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred);
     const uint16x4_t result_0 =
-        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0);
+        BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask);
+    Store2<0>(pred, result_0);
 
-    pred += prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
     obmc_pred += obmc_prediction_stride;
 
+    const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred);
     const uint16x4_t result_1 =
-        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1);
+        BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask);
+    Store2<0>(pred, result_1);
 
-    pred += prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
     obmc_pred += obmc_prediction_stride;
 
     y += 2;
@@ -426,26 +440,26 @@ inline void OverlapBlendFromLeft2xH_NEON(
 }
 
 inline void OverlapBlendFromLeft4xH_NEON(
-    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
-    const ptrdiff_t obmc_prediction_stride) {
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+  constexpr int obmc_prediction_stride = 4;
   const uint16x4_t mask_inverter = vdup_n_u16(64);
   const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
   // 64 - mask
   const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
   int y = 0;
   do {
-    const uint16x4_t result_0 =
-        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-
-    const uint16x4_t result_1 =
-        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+    const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val),
+                                              pred_mask, obmc_pred_mask);
+    vst1_u16(pred, result_0);
+    pred = AddByteStride(pred, prediction_stride);
+
+    const uint16x4_t result_1 = BlendObmc2Or4(
+        pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask);
+    vst1_u16(pred, result_1);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
 
     y += 2;
   } while (y != height);
@@ -456,52 +470,47 @@ void OverlapBlendFromLeft_NEON(
     const int width, const int height,
     const void* LIBGAV1_RESTRICT const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
-  auto* pred = static_cast<uint8_t*>(prediction);
-  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
   assert(width >= 2);
   assert(height >= 4);
 
   if (width == 2) {
-    OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
-                                 obmc_prediction_stride);
+    OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred);
     return;
   }
   if (width == 4) {
-    OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
-                                 obmc_prediction_stride);
+    OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred);
     return;
   }
   const uint16x8_t mask_inverter = vdupq_n_u16(64);
   const uint16_t* mask = kObmcMask + width - 2;
   int x = 0;
   do {
-    pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x);
-    obmc_pred = reinterpret_cast<const uint8_t*>(
-        static_cast<const uint16_t*>(obmc_prediction) + x);
+    uint16_t* pred_x = pred + x;
+    const uint16_t* obmc_pred_x = obmc_pred + x;
     const uint16x8_t pred_mask = vld1q_u16(mask + x);
     // 64 - mask
     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
     int y = 0;
     do {
       const uint16x8_t result =
-          BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask);
-      vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+          BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask);
+      vst1q_u16(pred_x, result);
 
-      pred += prediction_stride;
-      obmc_pred += obmc_prediction_stride;
+      pred_x = AddByteStride(pred_x, prediction_stride);
+      obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride);
     } while (++y < height);
     x += 8;
   } while (x < width);
 }
 
 template <int lane>
-inline uint16x4_t BlendObmcFromTop4(
-    uint8_t* LIBGAV1_RESTRICT const pred,
-    const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
-    const uint16x8_t obmc_pred_mask) {
-  const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
-  const uint16x4_t obmc_pred_val =
-      vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred,
+                                    const uint16x4_t obmc_pred_val,
+                                    const uint16x8_t pred_mask,
+                                    const uint16x8_t obmc_pred_mask) {
+  const uint16x4_t pred_val = vld1_u16(pred);
   const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
   const uint16x4_t result = vrshr_n_u16(
       VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
@@ -510,12 +519,11 @@ inline uint16x4_t BlendObmcFromTop4(
 
 template <int lane>
 inline uint16x8_t BlendObmcFromTop8(
-    uint8_t* LIBGAV1_RESTRICT const pred,
-    const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
-    const uint16x8_t obmc_pred_mask) {
-  const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
-  const uint16x8_t obmc_pred_val =
-      vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+    uint16_t* LIBGAV1_RESTRICT const pred,
+    const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
+    const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) {
+  const uint16x8_t pred_val = vld1q_u16(pred);
+  const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
   const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
   const uint16x8_t result = vrshrq_n_u16(
       VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
@@ -523,41 +531,43 @@ inline uint16x8_t BlendObmcFromTop8(
 }
 
 inline void OverlapBlendFromTop4x2Or4_NEON(
-    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
-    const uint8_t* LIBGAV1_RESTRICT obmc_pred,
-    const ptrdiff_t obmc_prediction_stride, const int height) {
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
+  constexpr int obmc_prediction_stride = 4;
   const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
   const uint16x8_t mask_inverter = vdupq_n_u16(64);
   const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
-  uint16x4_t result =
-      BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
-  obmc_pred += obmc_prediction_stride;
+  const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred);
+  uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0),
+                                           pred_mask, obmc_pred_mask);
+  vst1_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
 
   if (height == 2) {
     // Mask value is 64, meaning |pred| is unchanged.
     return;
   }
 
-  result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
-  obmc_pred += obmc_prediction_stride;
+  result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask,
+                                obmc_pred_mask);
+  vst1_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
+  obmc_pred += obmc_prediction_stride << 1;
 
-  result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+  const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred);
+  result =
+      BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask);
+  vst1_u16(pred, result);
 }
 
 inline void OverlapBlendFromTop4xH_NEON(
-    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
-    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
-    const ptrdiff_t obmc_prediction_stride) {
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
   if (height < 8) {
-    OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred,
-                                   obmc_prediction_stride, height);
+    OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height);
     return;
   }
+  constexpr int obmc_prediction_stride = 4;
   const uint16_t* mask = kObmcMask + height - 2;
   const uint16x8_t mask_inverter = vdupq_n_u16(64);
   int y = 0;
@@ -566,36 +576,44 @@ inline void OverlapBlendFromTop4xH_NEON(
   do {
     const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
-    uint16x4_t result =
-        BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-
-    result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-
-    result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-
-    result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-
-    result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-
-    result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    // Load obmc row 0, 1.
+    uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+    uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val),
+                                             pred_mask, obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+
+    result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
+
+    // Load obmc row 2, 3.
+    obmc_pred_val = vld1q_u16(obmc_pred);
+    result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+
+    result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
+
+    // Load obmc row 4, 5.
+    obmc_pred_val = vld1q_u16(obmc_pred);
+    result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+
+    result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+                                  obmc_pred_mask);
+    vst1_u16(pred, result);
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred += obmc_prediction_stride << 1;
 
     // Increment for the right mask index.
     y += 6;
@@ -603,147 +621,147 @@ inline void OverlapBlendFromTop4xH_NEON(
 }
 
 inline void OverlapBlendFromTop8xH_NEON(
-    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
-    const uint8_t* LIBGAV1_RESTRICT obmc_pred,
-    const ptrdiff_t obmc_prediction_stride, const int height) {
+    uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
   const uint16_t* mask = kObmcMask + height - 2;
   const uint16x8_t mask_inverter = vdupq_n_u16(64);
   uint16x8_t pred_mask = vld1q_u16(mask);
   uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
   uint16x8_t result =
       BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  vst1q_u16(pred, result);
   if (height == 2) return;
 
-  pred += prediction_stride;
+  constexpr int obmc_prediction_stride = 8;
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  vst1q_u16(pred, result);
   if (height == 4) return;
 
-  pred += prediction_stride;
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  vst1q_u16(pred, result);
 
   if (height == 8) return;
 
-  pred += prediction_stride;
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   pred_mask = vld1q_u16(&mask[8]);
   obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
 
   result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  vst1q_u16(pred, result);
 
   if (height == 16) return;
 
-  pred += prediction_stride;
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   pred_mask = vld1q_u16(&mask[16]);
   obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
 
   result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
-  pred += prediction_stride;
+  vst1q_u16(pred, result);
+  pred = AddByteStride(pred, prediction_stride);
   obmc_pred += obmc_prediction_stride;
 
   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
-  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  vst1q_u16(pred, result);
 }
 
 void OverlapBlendFromTop_NEON(
@@ -751,20 +769,18 @@ void OverlapBlendFromTop_NEON(
     const int width, const int height,
     const void* LIBGAV1_RESTRICT const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
-  auto* pred = static_cast<uint8_t*>(prediction);
-  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
   assert(width >= 4);
   assert(height >= 2);
 
   if (width == 4) {
-    OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
-                                obmc_prediction_stride);
+    OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred);
     return;
   }
 
   if (width == 8) {
-    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred,
-                                obmc_prediction_stride, height);
+    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height);
     return;
   }
 
@@ -773,19 +789,16 @@ void OverlapBlendFromTop_NEON(
   const uint16x8_t pred_mask = vld1q_u16(mask);
   // 64 - mask
   const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
-#define OBMC_ROW_FROM_TOP(n)                                                 \
-  do {                                                                       \
-    int x = 0;                                                               \
-    do {                                                                     \
-      const uint16x8_t result = BlendObmcFromTop8<n>(                        \
-          reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \
-          reinterpret_cast<const uint8_t*>(                                  \
-              reinterpret_cast<const uint16_t*>(obmc_pred) + x),             \
-          pred_mask, obmc_pred_mask);                                        \
-      vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result);              \
-                                                                             \
-      x += 8;                                                                \
-    } while (x < width);                                                     \
+#define OBMC_ROW_FROM_TOP(n)                                   \
+  do {                                                         \
+    int x = 0;                                                 \
+    do {                                                       \
+      const uint16x8_t result = BlendObmcFromTop8<n>(          \
+          pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \
+      vst1q_u16(pred + x, result);                             \
+                                                               \
+      x += 8;                                                  \
+    } while (x < width);                                       \
   } while (false)
 
   // Compute 1 row.
@@ -797,11 +810,11 @@ void OverlapBlendFromTop_NEON(
   // Compute 3 rows.
   if (height == 4) {
     OBMC_ROW_FROM_TOP(0);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(1);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(2);
     return;
   }
@@ -809,20 +822,20 @@ void OverlapBlendFromTop_NEON(
   // Compute 6 rows.
   if (height == 8) {
     OBMC_ROW_FROM_TOP(0);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(1);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(2);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(3);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(4);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(5);
     return;
   }
@@ -830,42 +843,42 @@ void OverlapBlendFromTop_NEON(
   // Compute 12 rows.
   if (height == 16) {
     OBMC_ROW_FROM_TOP(0);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(1);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(2);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(3);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(4);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(5);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(6);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(7);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
 
     const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
     // 64 - mask
     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
     OBMC_ROW_FROM_TOP(0);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(1);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(2);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(3);
     return;
   }
@@ -879,29 +892,29 @@ void OverlapBlendFromTop_NEON(
     // 64 - mask
     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
     OBMC_ROW_FROM_TOP(0);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(1);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(2);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(3);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(4);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(5);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(6);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
     OBMC_ROW_FROM_TOP(7);
-    pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    pred = AddByteStride(pred, prediction_stride);
+    obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
 
     y += 8;
   } while (y < compute_height);
diff --git a/libgav1/src/dsp/arm/warp_neon.cc b/libgav1/src/dsp/arm/warp_neon.cc
index 71e0a43..da380b1 100644
--- a/libgav1/src/dsp/arm/warp_neon.cc
+++ b/libgav1/src/dsp/arm/warp_neon.cc
@@ -147,14 +147,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
     do {
       const int src_x = (start_x + 4) << subsampling_x;
       const int src_y = (start_y + 4) << subsampling_y;
-      const int dst_x =
-          src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
-      const int dst_y =
-          src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
-      const int x4 = dst_x >> subsampling_x;
-      const int y4 = dst_y >> subsampling_y;
-      const int ix4 = x4 >> kWarpedModelPrecisionBits;
-      const int iy4 = y4 >> kWarpedModelPrecisionBits;
+      const WarpFilterParams filter_params = GetWarpFilterParams(
+          src_x, src_y, subsampling_x, subsampling_y, warp_params);
       // A prediction block may fall outside the frame's boundaries. If a
       // prediction block is calculated using only samples outside the frame's
       // boundary, the filtering can be simplified. We can divide the plane
@@ -207,22 +201,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       // border index (source_width - 1 or 0, respectively). Then for each x,
       // the inner for loop of the horizontal filter is reduced to multiplying
       // the border pixel by the sum of the filter coefficients.
-      if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+      if (filter_params.ix4 - 7 >= source_width - 1 ||
+          filter_params.ix4 + 7 <= 0) {
         // Regions 1 and 2.
         // Points to the left or right border of the first row of |src|.
         const uint8_t* first_row_border =
-            (ix4 + 7 <= 0) ? src : src + source_width - 1;
+            (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
         // In general, for y in [-7, 8), the row number iy4 + y is clipped:
         //   const int row = Clip3(iy4 + y, 0, source_height - 1);
         // In two special cases, iy4 + y is clipped to either 0 or
         // source_height - 1 for all y. In the rest of the cases, iy4 + y is
         // bounded and we can avoid clipping iy4 + y by relying on a reference
         // frame's boundary extension on the top and bottom.
-        if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+        if (filter_params.iy4 - 7 >= source_height - 1 ||
+            filter_params.iy4 + 7 <= 0) {
           // Region 1.
           // Every sample used to calculate the prediction block has the same
           // value. So the whole prediction block has the same value.
-          const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
           const uint8_t row_border_pixel =
               first_row_border[row * source_stride];
 
@@ -256,15 +252,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
           // We may over-read up to 13 pixels above the top source row, or up
           // to 13 pixels below the bottom source row. This is proved in
           // warp.cc.
-          const int row = iy4 + y;
+          const int row = filter_params.iy4 + y;
           int sum = first_row_border[row * source_stride];
           sum <<= (kFilterBits - kInterRoundBitsHorizontal);
           intermediate_result_column[y + 7] = sum;
         }
         // Vertical filter.
         DestType* dst_row = dst + start_x - block_start_x;
-        int sy4 =
-            (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+        int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  MultiplyBy4(delta);
         for (int y = 0; y < 8; ++y) {
           int sy = sy4 - MultiplyBy4(gamma);
 #if defined(__aarch64__)
@@ -341,10 +337,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       // source_height - 1 for all y. In the rest of the cases, iy4 + y is
       // bounded and we can avoid clipping iy4 + y by relying on a reference
       // frame's boundary extension on the top and bottom.
-      if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+      if (filter_params.iy4 - 7 >= source_height - 1 ||
+          filter_params.iy4 + 7 <= 0) {
         // Region 3.
         // Horizontal filter.
-        const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
         const uint8_t* const src_row = src + row * source_stride;
         // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
         // read but is ignored.
@@ -354,11 +351,12 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
         // has left and right borders of at least 13 bytes that extend the
         // frame boundary pixels. We also assume there is at least one extra
         // padding byte after the right border of the last source row.
-        const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+        const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]);
         // Convert src_row_v to int8 (subtract 128).
         const int8x16_t src_row_centered =
             vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
-        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
         for (int y = -7; y < 8; ++y) {
           HorizontalFilter(sx4, alpha, src_row_centered,
                            intermediate_result[y + 7]);
@@ -367,12 +365,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       } else {
         // Region 4.
         // Horizontal filter.
-        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
         for (int y = -7; y < 8; ++y) {
           // We may over-read up to 13 pixels above the top source row, or up
           // to 13 pixels below the bottom source row. This is proved in
           // warp.cc.
-          const int row = iy4 + y;
+          const int row = filter_params.iy4 + y;
           const uint8_t* const src_row = src + row * source_stride;
           // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
           // read but is ignored.
@@ -382,7 +381,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
           // has left and right borders of at least 13 bytes that extend the
           // frame boundary pixels. We also assume there is at least one extra
           // padding byte after the right border of the last source row.
-          const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+          const uint8x16_t src_row_v =
+              vld1q_u8(&src_row[filter_params.ix4 - 7]);
           // Convert src_row_v to int8 (subtract 128).
           const int8x16_t src_row_centered =
               vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
@@ -395,8 +395,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       // Regions 3 and 4.
       // Vertical filter.
       DestType* dst_row = dst + start_x - block_start_x;
-      int sy4 =
-          (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+      int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                MultiplyBy4(delta);
       for (int y = 0; y < 8; ++y) {
         int sy = sy4 - MultiplyBy4(gamma);
         int16x8_t filter[8];
@@ -574,14 +574,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
     do {
       const int src_x = (start_x + 4) << subsampling_x;
       const int src_y = (start_y + 4) << subsampling_y;
-      const int dst_x =
-          src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
-      const int dst_y =
-          src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
-      const int x4 = dst_x >> subsampling_x;
-      const int y4 = dst_y >> subsampling_y;
-      const int ix4 = x4 >> kWarpedModelPrecisionBits;
-      const int iy4 = y4 >> kWarpedModelPrecisionBits;
+      const WarpFilterParams filter_params = GetWarpFilterParams(
+          src_x, src_y, subsampling_x, subsampling_y, warp_params);
       // A prediction block may fall outside the frame's boundaries. If a
       // prediction block is calculated using only samples outside the frame's
       // boundary, the filtering can be simplified. We can divide the plane
@@ -634,22 +628,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       // border index (source_width - 1 or 0, respectively). Then for each x,
       // the inner for loop of the horizontal filter is reduced to multiplying
       // the border pixel by the sum of the filter coefficients.
-      if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+      if (filter_params.ix4 - 7 >= source_width - 1 ||
+          filter_params.ix4 + 7 <= 0) {
         // Regions 1 and 2.
         // Points to the left or right border of the first row of |src|.
         const uint16_t* first_row_border =
-            (ix4 + 7 <= 0) ? src : src + source_width - 1;
+            (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
         // In general, for y in [-7, 8), the row number iy4 + y is clipped:
         //   const int row = Clip3(iy4 + y, 0, source_height - 1);
         // In two special cases, iy4 + y is clipped to either 0 or
         // source_height - 1 for all y. In the rest of the cases, iy4 + y is
         // bounded and we can avoid clipping iy4 + y by relying on a reference
         // frame's boundary extension on the top and bottom.
-        if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+        if (filter_params.iy4 - 7 >= source_height - 1 ||
+            filter_params.iy4 + 7 <= 0) {
           // Region 1.
           // Every sample used to calculate the prediction block has the same
           // value. So the whole prediction block has the same value.
-          const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
           const uint16_t row_border_pixel = first_row_border[row * src_stride];
 
           DestType* dst_row = dst + start_x - block_start_x;
@@ -684,15 +680,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
           // We may over-read up to 13 pixels above the top source row, or up
           // to 13 pixels below the bottom source row. This is proved in
           // warp.cc.
-          const int row = iy4 + y;
+          const int row = filter_params.iy4 + y;
           int sum = first_row_border[row * src_stride];
           sum <<= (kFilterBits - kInterRoundBitsHorizontal);
           intermediate_result_column[y + 7] = sum;
         }
         // Vertical filter.
         DestType* dst_row = dst + start_x - block_start_x;
-        int sy4 =
-            (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+        int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  MultiplyBy4(delta);
         for (int y = 0; y < 8; ++y) {
           int sy = sy4 - MultiplyBy4(gamma);
 #if defined(__aarch64__)
@@ -782,10 +778,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       // source_height - 1 for all y. In the rest of the cases, iy4 + y is
       // bounded and we can avoid clipping iy4 + y by relying on a reference
       // frame's boundary extension on the top and bottom.
-      if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+      if (filter_params.iy4 - 7 >= source_height - 1 ||
+          filter_params.iy4 + 7 <= 0) {
         // Region 3.
         // Horizontal filter.
-        const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
         const uint16_t* const src_row = src + row * src_stride;
         // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
         // read but is ignored.
@@ -795,8 +792,10 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
         // has left and right borders of at least 13 pixels that extend the
         // frame boundary pixels. We also assume there is at least one extra
         // padding pixel after the right border of the last source row.
-        const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]);
-        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        const uint16x8x2_t src_row_v =
+            LoadSrcRow(&src_row[filter_params.ix4 - 7]);
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
         for (int y = -7; y < 8; ++y) {
           HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
           sx4 += beta;
@@ -804,12 +803,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       } else {
         // Region 4.
         // Horizontal filter.
-        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
         for (int y = -7; y < 8; ++y) {
           // We may over-read up to 13 pixels above the top source row, or up
           // to 13 pixels below the bottom source row. This is proved in
           // warp.cc.
-          const int row = iy4 + y;
+          const int row = filter_params.iy4 + y;
           const uint16_t* const src_row = src + row * src_stride;
           // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
           // read but is ignored.
@@ -819,7 +819,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
           // frame has left and right borders of at least 13 pixels that extend
           // the frame boundary pixels. We also assume there is at least one
           // extra padding pixel after the right border of the last source row.
-          const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]);
+          const uint16x8x2_t src_row_v =
+              LoadSrcRow(&src_row[filter_params.ix4 - 7]);
           HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
           sx4 += beta;
         }
@@ -828,8 +829,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
       // Regions 3 and 4.
       // Vertical filter.
       DestType* dst_row = dst + start_x - block_start_x;
-      int sy4 =
-          (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+      int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                MultiplyBy4(delta);
       for (int y = 0; y < 8; ++y) {
         int sy = sy4 - MultiplyBy4(gamma);
         int16x8_t filter[8];
diff --git a/libgav1/src/dsp/average_blend.cc b/libgav1/src/dsp/average_blend.cc
index 273b355..1a37aa1 100644
--- a/libgav1/src/dsp/average_blend.cc
+++ b/libgav1/src/dsp/average_blend.cc
@@ -87,6 +87,21 @@ void Init10bpp() {
 }
 #endif
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_AverageBlend
+  dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
 }  // namespace
 
 void AverageBlendInit_C() {
@@ -94,6 +109,9 @@ void AverageBlendInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/cdef.cc b/libgav1/src/dsp/cdef.cc
index ca2adfd..9dd9287 100644
--- a/libgav1/src/dsp/cdef.cc
+++ b/libgav1/src/dsp/cdef.cc
@@ -32,9 +32,11 @@ namespace {
 #include "src/dsp/cdef.inc"
 
 // Silence unused function warnings when CdefDirection_C is obviated.
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||        \
-    !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
-    (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||           \
+    !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||    \
+    (LIBGAV1_MAX_BITDEPTH >= 10 &&                \
+     !defined(LIBGAV1_Dsp10bpp_CdefDirection)) || \
+    (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefDirection))
 constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
 
 int32_t Square(int32_t x) { return x * x; }
@@ -103,12 +105,15 @@ void CdefDirection_C(const void* LIBGAV1_RESTRICT const source,
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
         // !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||
         // (LIBGAV1_MAX_BITDEPTH >= 10 &&
-        // !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+        //  !defined(LIBGAV1_Dsp10bpp_CdefDirection)) ||
+        // (LIBGAV1_MAX_BITDEPTH == 12 &&
+        //  !defined(LIBGAV1_Dsp12bpp_CdefDirection))
 
 // Silence unused function warnings when CdefFilter_C is obviated.
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||      \
-    !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \
-    (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||                                       \
+    !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||                                  \
+    (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) || \
+    (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefFilters))
 
 int Constrain(int diff, int threshold, int damping) {
   assert(threshold != 0);
@@ -218,7 +223,9 @@ void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src,
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
         // !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||
         // (LIBGAV1_MAX_BITDEPTH >= 10 &&
-        // !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+        //  !defined(LIBGAV1_Dsp10bpp_CdefFilters)) ||
+        // (LIBGAV1_MAX_BITDEPTH == 12 &&
+        //  !defined(LIBGAV1_Dsp12bpp_CdefFilters))
 
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
@@ -294,7 +301,48 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_CdefDirection
+  dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
 #endif
+#ifndef LIBGAV1_Dsp12bpp_CdefFilters
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -303,6 +351,9 @@ void CdefInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/cdef.h b/libgav1/src/dsp/cdef.h
index b820b77..ce23ea5 100644
--- a/libgav1/src/dsp/cdef.h
+++ b/libgav1/src/dsp/cdef.h
@@ -38,6 +38,11 @@
 namespace libgav1 {
 namespace dsp {
 
+enum {
+  kCdefSecondaryTap0 = 2,
+  kCdefSecondaryTap1 = 1,
+};
+
 // Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
 // thread-safe.
 void CdefInit_C();
diff --git a/libgav1/src/dsp/constants.h b/libgav1/src/dsp/constants.h
index 7c1b62c..dd0a4e0 100644
--- a/libgav1/src/dsp/constants.h
+++ b/libgav1/src/dsp/constants.h
@@ -27,25 +27,7 @@
 namespace libgav1 {
 
 enum {
-  // Documentation variables.
-  kBitdepth8 = 8,
-  kBitdepth10 = 10,
-  kBitdepth12 = 12,
-  // Weights are quadratic from '1' to '1 / block_size', scaled by
-  // 2^kSmoothWeightScale.
-  kSmoothWeightScale = 8,
   kCflLumaBufferStride = 32,
-  // InterRound0, Section 7.11.3.2.
-  kInterRoundBitsHorizontal = 3,  // 8 & 10-bit.
-  kInterRoundBitsHorizontal12bpp = 5,
-  kInterRoundBitsCompoundVertical = 7,  // 8, 10 & 12-bit compound prediction.
-  kInterRoundBitsVertical = 11,         // 8 & 10-bit, single prediction.
-  kInterRoundBitsVertical12bpp = 9,
-  // Offset applied to 10bpp and 12bpp predictors to allow storing them in
-  // uint16_t. Removed before blending.
-  kCompoundOffset = (1 << 14) + (1 << 13),
-  kCdefSecondaryTap0 = 2,
-  kCdefSecondaryTap1 = 1,
 };  // anonymous enum
 
 extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8];
diff --git a/libgav1/src/dsp/convolve.cc b/libgav1/src/dsp/convolve.cc
index f11b45e..6989da0 100644
--- a/libgav1/src/dsp/convolve.cc
+++ b/libgav1/src/dsp/convolve.cc
@@ -864,7 +864,93 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+  dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCopy
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveHorizontal
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveVertical
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Convolve2D
+  dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundCopy
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundHorizontal
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
 #endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundVertical
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompound2D
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockCopy
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockHorizontal
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockVertical
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlock2D
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+#endif
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveScale2D
+  dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundScale2D
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -873,6 +959,9 @@ void ConvolveInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/convolve.h b/libgav1/src/dsp/convolve.h
index 5bc0bad..8780bfc 100644
--- a/libgav1/src/dsp/convolve.h
+++ b/libgav1/src/dsp/convolve.h
@@ -17,6 +17,8 @@
 #ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_
 #define LIBGAV1_SRC_DSP_CONVOLVE_H_
 
+#include <cassert>
+
 // Pull in LIBGAV1_DspXXX defines representing the implementation status
 // of each function. The resulting value of each can be used by each module to
 // determine whether an implementation is needed at compile time.
@@ -43,6 +45,35 @@ namespace dsp {
 // thread-safe.
 void ConvolveInit_C();
 
+inline int GetNumTapsInFilter(const int filter_index) {
+  if (filter_index < 2) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4) the large filters are replaced with 4
+  // tap options.
+  // If the original filters were |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp| then it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+  // tap filter.
+  return 4;
+}
+
 }  // namespace dsp
 }  // namespace libgav1
 
diff --git a/libgav1/src/dsp/convolve.inc b/libgav1/src/dsp/convolve.inc
index e0f755e..2e0b270 100644
--- a/libgav1/src/dsp/convolve.inc
+++ b/libgav1/src/dsp/convolve.inc
@@ -12,39 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Constants and utility functions used for convolve implementations.
+// Constants used for convolve implementations.
 // This will be included inside an anonymous namespace on files where these are
 // necessary.
 
-int GetNumTapsInFilter(const int filter_index) {
-  if (filter_index < 2) {
-    // Despite the names these only use 6 taps.
-    // kInterpolationFilterEightTap
-    // kInterpolationFilterEightTapSmooth
-    return 6;
-  }
-
-  if (filter_index == 2) {
-    // kInterpolationFilterEightTapSharp
-    return 8;
-  }
-
-  if (filter_index == 3) {
-    // kInterpolationFilterBilinear
-    return 2;
-  }
-
-  assert(filter_index > 3);
-  // For small sizes (width/height <= 4) the large filters are replaced with 4
-  // tap options.
-  // If the original filters were |kInterpolationFilterEightTap| or
-  // |kInterpolationFilterEightTapSharp| then it becomes
-  // |kInterpolationFilterSwitchable|.
-  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
-  // tap filter.
-  return 4;
-}
-
 constexpr int kIntermediateAllocWidth = kMaxSuperBlockSizeInPixels;
 constexpr int kIntermediateStride = 8;
 constexpr int kHorizontalOffset = 3;
diff --git a/libgav1/src/dsp/distance_weighted_blend.cc b/libgav1/src/dsp/distance_weighted_blend.cc
index 34d10fc..ef83235 100644
--- a/libgav1/src/dsp/distance_weighted_blend.cc
+++ b/libgav1/src/dsp/distance_weighted_blend.cc
@@ -88,7 +88,22 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
 #endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -97,6 +112,9 @@ void DistanceWeightedBlendInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/dsp.cc b/libgav1/src/dsp/dsp.cc
index aac0ca0..97a064f 100644
--- a/libgav1/src/dsp/dsp.cc
+++ b/libgav1/src/dsp/dsp.cc
@@ -78,6 +78,12 @@ dsp::Dsp* GetWritableDspTable(int bitdepth) {
       return &dsp_10bpp;
     }
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+    case 12: {
+      static dsp::Dsp dsp_12bpp;
+      return &dsp_12bpp;
+    }
+#endif
   }
   return nullptr;
 }
@@ -157,6 +163,7 @@ void DspInit() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
     ConvolveInit10bpp_NEON();
     InverseTransformInit10bpp_NEON();
+    LoopFilterInit10bpp_NEON();
     LoopRestorationInit10bpp_NEON();
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 #endif  // LIBGAV1_ENABLE_NEON
diff --git a/libgav1/src/dsp/film_grain.cc b/libgav1/src/dsp/film_grain.cc
index fa12b69..906230d 100644
--- a/libgav1/src/dsp/film_grain.cc
+++ b/libgav1/src/dsp/film_grain.cc
@@ -19,17 +19,16 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <new>
 
-#include "src/dsp/common.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/film_grain_common.h"
 #include "src/utils/array_2d.h"
 #include "src/utils/common.h"
 #include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
 #include "src/utils/memory.h"
+#include "src/utils/types.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -45,7 +44,7 @@ void InitializeScalingLookupTable_C(int num_points, const uint8_t point_value[],
     memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
     return;
   }
-  constexpr int index_shift = bitdepth - kBitdepth8;
+  constexpr int index_shift = (bitdepth == kBitdepth10) ? 2 : 0;
   static_assert(sizeof(scaling_lut[0]) == 2, "");
   Memset(scaling_lut, point_scaling[0],
          std::max(static_cast<int>(point_value[0]), 1) << index_shift);
@@ -866,6 +865,121 @@ void Init10bpp() {
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+
+  // ChromaAutoRegressionFunc
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth12>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma is
+  // false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseStripes
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseImageOverlap
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainInitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth12>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseLuma
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChroma
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChromaWithCfl
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 }  // namespace film_grain
 
@@ -874,6 +988,9 @@ void FilmGrainInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   film_grain::Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  film_grain::Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/film_grain_common.h b/libgav1/src/dsp/film_grain_common.h
index 2e6ad45..3c8d761 100644
--- a/libgav1/src/dsp/film_grain_common.h
+++ b/libgav1/src/dsp/film_grain_common.h
@@ -17,15 +17,7 @@
 #ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
 #define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
 
-#include <cstddef>
 #include <cstdint>
-#include <memory>
-#include <type_traits>
-
-#include "src/dsp/common.h"
-#include "src/utils/array_2d.h"
-#include "src/utils/constants.h"
-#include "src/utils/cpu.h"
 
 namespace libgav1 {
 
diff --git a/libgav1/src/dsp/intra_edge.cc b/libgav1/src/dsp/intra_edge.cc
index fe66db2..9875ef1 100644
--- a/libgav1/src/dsp/intra_edge.cc
+++ b/libgav1/src/dsp/intra_edge.cc
@@ -100,7 +100,26 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
 #endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -109,6 +128,9 @@ void IntraEdgeInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/intrapred.cc b/libgav1/src/dsp/intrapred.cc
index 75af279..3162acc 100644
--- a/libgav1/src/dsp/intrapred.cc
+++ b/libgav1/src/dsp/intrapred.cc
@@ -1422,6 +1422,551 @@ void Init10bpp() {
 }  // NOLINT(readability/fn_size)
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = IntraPredBppDefs<12, uint16_t>;  // Supplies the DcFill entries below.
+
+void Init12bpp() {  // Fills dsp->intra_predictors of the GetWritableDspTable(12) table.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_INTRAPREDICTORS(DefsHbd, Defs12bpp);  // NOTE(review): presumably assigns every entry listed in the #else branch; confirm against the macro.
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+      Defs12bpp::_4x4::DcFill;  // For every size, DcFill is the only entry taken from Defs12bpp.
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DefsHbd::_4x4::DcTop;  // The remaining six predictors of each size come from DefsHbd.
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+      Defs12bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+      Defs12bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+      DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+      Defs12bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+      DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+      DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+      DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+      Defs12bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+      DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+      DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+      Defs12bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+      DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+      DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+      Defs12bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+      DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+      Defs12bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+      DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+      DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+      Defs12bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+      DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+      Defs12bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+      DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+      Defs12bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+      DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+      Defs12bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+      DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+      Defs12bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+      DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+      Defs12bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+      DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+      Defs12bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+      DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+      Defs12bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+      DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+      Defs12bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+      DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+      Defs12bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+      DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+      Defs12bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+      DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      DefsHbd::_64x64::Paeth;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 #undef INIT_INTRAPREDICTORS_WxH
 #undef INIT_INTRAPREDICTORS
 }  // namespace
@@ -1431,6 +1976,9 @@ void IntraPredInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/intrapred_cfl.cc b/libgav1/src/dsp/intrapred_cfl.cc
index 0f7f4f2..798bb73 100644
--- a/libgav1/src/dsp/intrapred_cfl.cc
+++ b/libgav1/src/dsp/intrapred_cfl.cc
@@ -639,6 +639,263 @@ void Init10bpp() {
 }  // NOLINT(readability/fn_size)
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Fills the cfl entries of the GetWritableDspTable(12) table.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(12, uint16_t);  // NOTE(review): presumably assigns every entry listed in the #else branch; confirm against the macro.
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 0, 0>;  // Last two arguments are 0, 0 for every 444 entry.
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 1, 0>;  // Last two arguments are 1, 0 for every 422 entry.
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 1, 1>;  // Last two arguments are 1, 1 for every 420 entry.
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_C<4, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_C<8, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_C<8, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_C<8, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_C<8, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_C<16, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_C<16, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_C<16, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_C<16, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_C<32, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_C<32, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_C<32, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // Cfl predictors are available only for transform sizes with max(width,
+  // height) <= 32. Set all others to nullptr.
+  for (const auto i : kTransformSizesLargerThan32x32) {  // Runs in both configurations; it sits after the #endif above.
+    dsp->cfl_intra_predictors[i] = nullptr;
+    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+      dsp->cfl_subsamplers[i][j] = nullptr;
+    }
+  }
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 #undef INIT_CFL_INTRAPREDICTOR_WxH
 #undef INIT_CFL_INTRAPREDICTORS
 
@@ -649,6 +906,9 @@ void IntraPredCflInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/intrapred_directional.cc b/libgav1/src/dsp/intrapred_directional.cc
index 21a40b5..9146074 100644
--- a/libgav1/src/dsp/intrapred_directional.cc
+++ b/libgav1/src/dsp/intrapred_directional.cc
@@ -94,11 +94,19 @@ void DirectionalIntraPredictorZone1_C(
   } while (++y < height);
 }
 
+// clang 14.0.0 produces incorrect code with LIBGAV1_RESTRICT; omit it on 14.x.
+// https://github.com/llvm/llvm-project/issues/54427
+#if defined(__clang__) && __clang_major__ == 14
+#define LOCAL_RESTRICT
+#else
+#define LOCAL_RESTRICT LIBGAV1_RESTRICT
+#endif
+
 template <typename Pixel>
 void DirectionalIntraPredictorZone2_C(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    void* LOCAL_RESTRICT const dest, ptrdiff_t stride,
+    const void* LOCAL_RESTRICT const top_row,
+    const void* LOCAL_RESTRICT const left_column, const int width,
     const int height, const int xstep, const int ystep,
     const bool upsampled_top, const bool upsampled_left) {
   const auto* const top = static_cast<const Pixel*>(top_row);
@@ -143,6 +151,8 @@ void DirectionalIntraPredictorZone2_C(
   } while (++y < height);
 }
 
+#undef LOCAL_RESTRICT
+
 template <typename Pixel>
 void DirectionalIntraPredictorZone3_C(
     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
@@ -236,6 +246,34 @@ void Init10bpp() {
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 
 void IntraPredDirectionalInit_C() {
@@ -243,6 +281,9 @@ void IntraPredDirectionalInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/intrapred_filter.cc b/libgav1/src/dsp/intrapred_filter.cc
index 9a45eff..2d183cf 100644
--- a/libgav1/src/dsp/intrapred_filter.cc
+++ b/libgav1/src/dsp/intrapred_filter.cc
@@ -131,6 +131,21 @@ void Init10bpp() {
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilterIntraPredictor
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 
 void IntraPredFilterInit_C() {
@@ -138,6 +153,9 @@ void IntraPredFilterInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/intrapred_smooth.cc b/libgav1/src/dsp/intrapred_smooth.cc
index 0c7f272..16b8274 100644
--- a/libgav1/src/dsp/intrapred_smooth.cc
+++ b/libgav1/src/dsp/intrapred_smooth.cc
@@ -714,6 +714,266 @@ void Init10bpp() {
 }  // NOLINT(readability/fn_size)
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_SMOOTH(DefsHbd);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 #undef INIT_SMOOTH_WxH
 #undef INIT_SMOOTH
 }  // namespace
@@ -723,6 +983,9 @@ void IntraPredSmoothInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/intrapred_smooth.h b/libgav1/src/dsp/intrapred_smooth.h
index 6802003..06454af 100644
--- a/libgav1/src/dsp/intrapred_smooth.h
+++ b/libgav1/src/dsp/intrapred_smooth.h
@@ -38,6 +38,12 @@
 namespace libgav1 {
 namespace dsp {
 
+enum {
+  // Weights are quadratic from '1' to '1 / block_size', scaled by
+  // 2^kSmoothWeightScale.
+  kSmoothWeightScale = 8,
+};
+
 // Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
 // This function is not thread-safe.
 void IntraPredSmoothInit_C();
diff --git a/libgav1/src/dsp/inverse_transform.cc b/libgav1/src/dsp/inverse_transform.cc
index 1b0064f..0bbdffa 100644
--- a/libgav1/src/dsp/inverse_transform.cc
+++ b/libgav1/src/dsp/inverse_transform.cc
@@ -18,6 +18,7 @@
 #include <cassert>
 #include <cstdint>
 #include <cstring>
+#include <type_traits>
 
 #include "src/dsp/dsp.h"
 #include "src/utils/array_2d.h"
@@ -25,6 +26,15 @@
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/logging.h"
 
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#endif
+
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+    LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#include <cinttypes>
+#endif
+
 namespace libgav1 {
 namespace dsp {
 namespace {
@@ -34,24 +44,25 @@ namespace {
 
 constexpr uint8_t kTransformColumnShift = 4;
 
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
-#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
-#endif
-
-int32_t RangeCheckValue(int32_t value, int8_t range) {
+template <typename T>
+int32_t RangeCheckValue(T value, int8_t range) {
 #if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
     LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+  static_assert(
+      std::is_same<T, int32_t>::value || std::is_same<T, std::int64_t>::value,
+      "");
   assert(range <= 32);
   const auto min = static_cast<int32_t>(-(uint32_t{1} << (range - 1)));
   const auto max = static_cast<int32_t>((uint32_t{1} << (range - 1)) - 1);
   if (min > value || value > max) {
-    LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n",
-                 value, range);
+    LIBGAV1_DLOG(ERROR,
+                 "coeff out of bit range, value: %" PRId64 " bit range %d",
+                 static_cast<int64_t>(value), range);
     assert(min <= value && value <= max);
   }
 #endif  // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
   static_cast<void>(range);
-  return value;
+  return static_cast<int32_t>(value);
 }
 
 template <typename Residual>
@@ -433,7 +444,13 @@ void Adst4_C(void* dest, int8_t range) {
   // Section 7.13.2.6: It is a requirement of bitstream conformance that all
   // values stored in the s and x arrays by this process are representable by
   // a signed integer using range + 12 bits of precision.
-  int32_t s[7];
+  // Note the intermediate value can only exceed INT32_MAX with invalid 12-bit
+  // content. For simplicity in unoptimized code, int64_t is used for both 10 &
+  // 12-bit. SIMD implementations can allow these to roll over on platforms
+  // where this has defined behavior.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+  Intermediate s[7];
   s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
   s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
   s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
@@ -454,19 +471,23 @@ void Adst4_C(void* dest, int8_t range) {
   s[0] = RangeCheckValue(s[0] + s[3], range + 12);
   s[1] = RangeCheckValue(s[1] - s[4], range + 12);
   s[3] = s[2];
-  s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12);
+  // With range checking enabled b7 would be trapped above. Casting to
+  // Intermediate prevents an integer sanitizer warning. In SIMD implementations
+  // the multiply can roll over on platforms where this has defined behavior.
+  const auto adst2_b7 = static_cast<Intermediate>(kAdst4Multiplier[2]) * b7;
+  s[2] = RangeCheckValue(adst2_b7, range + 12);
   // stage 4.
   s[0] = RangeCheckValue(s[0] + s[5], range + 12);
   s[1] = RangeCheckValue(s[1] - s[6], range + 12);
   // stages 5 and 6.
-  const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12);
-  const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12);
-  int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12);
+  const Intermediate x0 = RangeCheckValue(s[0] + s[3], range + 12);
+  const Intermediate x1 = RangeCheckValue(s[1] + s[3], range + 12);
+  Intermediate x3 = RangeCheckValue(s[0] + s[1], range + 12);
   x3 = RangeCheckValue(x3 - s[3], range + 12);
-  int32_t dst_0 = RightShiftWithRounding(x0, 12);
-  int32_t dst_1 = RightShiftWithRounding(x1, 12);
-  int32_t dst_2 = RightShiftWithRounding(s[2], 12);
-  int32_t dst_3 = RightShiftWithRounding(x3, 12);
+  auto dst_0 = static_cast<int32_t>(RightShiftWithRounding(x0, 12));
+  auto dst_1 = static_cast<int32_t>(RightShiftWithRounding(x1, 12));
+  auto dst_2 = static_cast<int32_t>(RightShiftWithRounding(s[2], 12));
+  auto dst_3 = static_cast<int32_t>(RightShiftWithRounding(x3, 12));
   if (sizeof(Residual) == 2) {
     // If the first argument to RightShiftWithRounding(..., 12) is only
     // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
@@ -840,6 +861,10 @@ void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
 
 template <typename Residual>
 void Identity4Row_C(void* dest, int8_t shift) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
   assert(shift == 0 || shift == 1);
   auto* const dst = static_cast<Residual*>(dest);
   // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
@@ -847,10 +872,10 @@ void Identity4Row_C(void* dest, int8_t shift) {
   // values of |shift|.
   const int32_t rounding = (1 + (shift << 1)) << 11;
   for (int i = 0; i < 4; ++i) {
-    // The intermediate value here will have to fit into an int32_t for it to be
-    // bitstream conformant. The multiplication is promoted to int32_t by
-    // defining kIdentity4Multiplier as int32_t.
-    int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
+    const auto intermediate =
+        static_cast<Intermediate>(dst[i]) * kIdentity4Multiplier;
+    int32_t dst_i =
+        static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -874,16 +899,24 @@ void Identity4Column_C(void* dest, int8_t /*shift*/) {
 template <int bitdepth, typename Residual>
 void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
                        int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
   auto* const dst = static_cast<Residual*>(dest);
 
   if (is_row) {
     if (should_round) {
-      dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
     }
 
     const int32_t rounding = (1 + (row_shift << 1)) << 11;
+    const auto intermediate =
+        static_cast<Intermediate>(dst[0]) * kIdentity4Multiplier;
     int32_t dst_i =
-        (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift);
+        static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -923,11 +956,17 @@ void Identity8Column_C(void* dest, int8_t /*shift*/) {
 template <int bitdepth, typename Residual>
 void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
                        int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
   auto* const dst = static_cast<Residual*>(dest);
 
   if (is_row) {
     if (should_round) {
-      dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
     }
 
     int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift);
@@ -954,13 +993,19 @@ void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
 template <typename Residual>
 void Identity16Row_C(void* dest, int8_t shift) {
   assert(shift == 1 || shift == 2);
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
   auto* const dst = static_cast<Residual*>(dest);
   const int32_t rounding = (1 + (1 << shift)) << 11;
   for (int i = 0; i < 16; ++i) {
-    // The intermediate value here will have to fit into an int32_t for it to be
-    // bitstream conformant. The multiplication is promoted to int32_t by
-    // defining kIdentity16Multiplier as int32_t.
-    int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
+    // Widen dst[i] to Intermediate before multiplying. Intermediate is int32_t
+    // when sizeof(Residual) == 2 and int64_t otherwise.
+    const auto intermediate =
+        static_cast<Intermediate>(dst[i]) * kIdentity16Multiplier;
+    int32_t dst_i =
+        static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -985,16 +1030,24 @@ void Identity16Column_C(void* dest, int8_t /*shift*/) {
 template <int bitdepth, typename Residual>
 void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
                         int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
   auto* const dst = static_cast<Residual*>(dest);
 
   if (is_row) {
     if (should_round) {
-      dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
     }
 
     const int32_t rounding = (1 + (1 << row_shift)) << 11;
+    const auto intermediate =
+        static_cast<Intermediate>(dst[0]) * kIdentity16Multiplier;
     int32_t dst_i =
-        (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift);
+        static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -1034,11 +1087,17 @@ void Identity32Column_C(void* dest, int8_t /*shift*/) {
 template <int bitdepth, typename Residual>
 void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
                         int row_shift, bool is_row) {
+  // Note the intermediate value can only exceed 32 bits with 12-bit content.
+  // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+  using Intermediate =
+      typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
   auto* const dst = static_cast<Residual*>(dest);
 
   if (is_row) {
     if (should_round) {
-      dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+      const auto intermediate =
+          static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+      dst[0] = RightShiftWithRounding(intermediate, 12);
     }
 
     int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift);
@@ -1612,6 +1671,148 @@ void Init10bpp() {
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Installs the 12bpp C inverse transforms.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);  // |dsp| may be unused if every entry is skipped.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  InitAll<12, int32_t, uint16_t>(dsp);  // InitAll is defined outside this hunk.
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/true>;  // Last Dct arg tracks log2(size).
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize64_Transform1dDct
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+                      DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dAdst
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+                      Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<12, int32_t>, Identity4Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity4DcOnly_C<12, int32_t>,
+                      Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<12, int32_t>, Identity8Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity8DcOnly_C<12, int32_t>,
+                      Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<12, int32_t>, Identity16Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity16DcOnly_C<12, int32_t>,
+                      Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dIdentity
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<12, int32_t>, Identity32Row_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+                      Identity32DcOnly_C<12, int32_t>,
+                      Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dWht
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+                      Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+                      Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 
 void InverseTransformInit_C() {
@@ -1619,10 +1820,12 @@ void InverseTransformInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 
   // Local functions that may be unused depending on the optimizations
   // available.
-  static_cast<void>(RangeCheckValue);
   static_cast<void>(kBitReverseLookup);
 }
 
diff --git a/libgav1/src/dsp/libgav1_dsp.cmake b/libgav1/src/dsp/libgav1_dsp.cmake
index 4bd1443..fedb35b 100644
--- a/libgav1/src/dsp/libgav1_dsp.cmake
+++ b/libgav1/src/dsp/libgav1_dsp.cmake
@@ -113,6 +113,7 @@ list(APPEND libgav1_dsp_sources_neon
             "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc"
             "${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
             "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
+            "${libgav1_source}/dsp/arm/loop_filter_10bit_neon.cc"
             "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
             "${libgav1_source}/dsp/arm/loop_filter_neon.h"
             "${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc"
diff --git a/libgav1/src/dsp/loop_filter.cc b/libgav1/src/dsp/loop_filter.cc
index 14d47bf..bb0583f 100644
--- a/libgav1/src/dsp/loop_filter.cc
+++ b/libgav1/src/dsp/loop_filter.cc
@@ -603,6 +603,73 @@ void Init10bpp() {
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = LoopFilterFuncs_C<12, uint16_t>;
+
+void Init12bpp() {  // Installs the 12bpp C loop filters.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS  // Set every entry unconditionally.
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical14;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);  // |dsp| may be unused if every entry is skipped.
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs12bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs12bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 
 void LoopFilterInit_C() {
@@ -610,6 +677,9 @@ void LoopFilterInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
   // Local functions that may be unused depending on the optimizations
   // available.
   static_cast<void>(AdjustThresholds);
diff --git a/libgav1/src/dsp/loop_restoration.cc b/libgav1/src/dsp/loop_restoration.cc
index 2301a3e..eb8052c 100644
--- a/libgav1/src/dsp/loop_restoration.cc
+++ b/libgav1/src/dsp/loop_restoration.cc
@@ -922,7 +922,6 @@ void Init8bpp() {
 }
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
-
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
   assert(dsp != nullptr);
@@ -939,8 +938,27 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
-
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Installs the 12bpp C loop restoration filters.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS  // Set every entry unconditionally.
+  dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);  // |dsp| may be unused if every entry is skipped.
+#ifndef LIBGAV1_Dsp12bpp_WienerFilter
+  dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_SelfGuidedFilter
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 
 void LoopRestorationInit_C() {
@@ -948,6 +966,9 @@ void LoopRestorationInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/loop_restoration.h b/libgav1/src/dsp/loop_restoration.h
index de80926..8fefc40 100644
--- a/libgav1/src/dsp/loop_restoration.h
+++ b/libgav1/src/dsp/loop_restoration.h
@@ -39,16 +39,6 @@
 namespace libgav1 {
 namespace dsp {
 
-enum {
-  // Precision of a division table (mtable)
-  kSgrProjScaleBits = 20,
-  kSgrProjReciprocalBits = 12,
-  // Core self-guided restoration precision bits.
-  kSgrProjSgrBits = 8,
-  // Precision bits of generated values higher than source before projection.
-  kSgrProjRestoreBits = 4
-};  // anonymous enum
-
 extern const uint8_t kSgrMaLookup[256];
 
 // Initializes Dsp::loop_restorations. This function is not thread-safe.
diff --git a/libgav1/src/dsp/mask_blend.cc b/libgav1/src/dsp/mask_blend.cc
index 207fde0..34d7fe8 100644
--- a/libgav1/src/dsp/mask_blend.cc
+++ b/libgav1/src/dsp/mask_blend.cc
@@ -197,7 +197,50 @@ void Init10bpp() {
   dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Installs the 12bpp C mask blend functions.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS  // Set every entry unconditionally.
+  dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
+  dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+  dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+  dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+  // These are only used with 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);  // NOTE(review): redundant; |dsp| is always used below.
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend444
+  dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend422
+  dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend420
+  dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
 #endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra444
+  dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra422
+  dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra420
+  dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+#endif
+  // These are only used with 8-bit; cleared regardless of the macros above.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -206,6 +249,9 @@ void MaskBlendInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/obmc.cc b/libgav1/src/dsp/obmc.cc
index 6b5c6e3..479cb1d 100644
--- a/libgav1/src/dsp/obmc.cc
+++ b/libgav1/src/dsp/obmc.cc
@@ -116,7 +116,28 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Installs the 12bpp C OBMC blend functions.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS  // Set every entry unconditionally.
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);  // |dsp| may be unused if every entry is skipped.
+#ifndef LIBGAV1_Dsp12bpp_ObmcVertical
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ObmcHorizontal
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
 #endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -125,6 +146,9 @@ void ObmcInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/super_res.cc b/libgav1/src/dsp/super_res.cc
index 570ba73..7593729 100644
--- a/libgav1/src/dsp/super_res.cc
+++ b/libgav1/src/dsp/super_res.cc
@@ -95,7 +95,23 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Installs the 12bpp C super-resolution function.
+  Dsp* dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;  // Cleared in every configuration.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);  // NOTE(review): redundant; |dsp| is used above.
+#ifndef LIBGAV1_Dsp12bpp_SuperRes
+  dsp->super_res = SuperRes_C<12, uint16_t>;
 #endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -104,6 +120,9 @@ void SuperResInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/warp.cc b/libgav1/src/dsp/warp.cc
index dd467ea..f62f1ed 100644
--- a/libgav1/src/dsp/warp.cc
+++ b/libgav1/src/dsp/warp.cc
@@ -111,14 +111,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
          start_x += 8) {
       const int src_x = (start_x + 4) << subsampling_x;
       const int src_y = (start_y + 4) << subsampling_y;
-      const int dst_x =
-          src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
-      const int dst_y =
-          src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
-      const int x4 = dst_x >> subsampling_x;
-      const int y4 = dst_y >> subsampling_y;
-      const int ix4 = x4 >> kWarpedModelPrecisionBits;
-      const int iy4 = y4 >> kWarpedModelPrecisionBits;
+      const WarpFilterParams filter_params = GetWarpFilterParams(
+          src_x, src_y, subsampling_x, subsampling_y, warp_params);
 
       // A prediction block may fall outside the frame's boundaries. If a
       // prediction block is calculated using only samples outside the frame's
@@ -172,22 +166,24 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
       // border index (source_width - 1 or 0, respectively). Then for each x,
       // the inner for loop of the horizontal filter is reduced to multiplying
       // the border pixel by the sum of the filter coefficients.
-      if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+      if (filter_params.ix4 - 7 >= source_width - 1 ||
+          filter_params.ix4 + 7 <= 0) {
         // Regions 1 and 2.
         // Points to the left or right border of the first row of |src|.
         const Pixel* first_row_border =
-            (ix4 + 7 <= 0) ? src : src + source_width - 1;
+            (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
         // In general, for y in [-7, 8), the row number iy4 + y is clipped:
         //   const int row = Clip3(iy4 + y, 0, source_height - 1);
         // In two special cases, iy4 + y is clipped to either 0 or
         // source_height - 1 for all y. In the rest of the cases, iy4 + y is
         // bounded and we can avoid clipping iy4 + y by relying on a reference
         // frame's boundary extension on the top and bottom.
-        if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+        if (filter_params.iy4 - 7 >= source_height - 1 ||
+            filter_params.iy4 + 7 <= 0) {
           // Region 1.
           // Every sample used to calculate the prediction block has the same
           // value. So the whole prediction block has the same value.
-          const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
           const Pixel row_border_pixel = first_row_border[row * source_stride];
           DestType* dst_row = dst + start_x - block_start_x;
           if (is_compound) {
@@ -220,15 +216,15 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
         for (int y = -7; y < 8; ++y) {
           // We may over-read up to 13 pixels above the top source row, or up
           // to 13 pixels below the bottom source row. This is proved below.
-          const int row = iy4 + y;
+          const int row = filter_params.iy4 + y;
           int sum = first_row_border[row * source_stride];
           sum <<= kFilterBits - kRoundBitsHorizontal;
           intermediate_result_column[y + 7] = sum;
         }
         // Vertical filter.
         DestType* dst_row = dst + start_x - block_start_x;
-        int sy4 =
-            (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+        int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  MultiplyBy4(delta);
         for (int y = 0; y < 8; ++y) {
           int sy = sy4 - MultiplyBy4(gamma);
           for (int x = 0; x < 8; ++x) {
@@ -269,12 +265,14 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
       // source_height - 1 for all y. In the rest of the cases, iy4 + y is
       // bounded and we can avoid clipping iy4 + y by relying on a reference
       // frame's boundary extension on the top and bottom.
-      if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+      if (filter_params.iy4 - 7 >= source_height - 1 ||
+          filter_params.iy4 + 7 <= 0) {
         // Region 3.
         // Horizontal filter.
-        const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
         const Pixel* const src_row = src + row * source_stride;
-        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
         for (int y = -7; y < 8; ++y) {
           int sx = sx4 - MultiplyBy4(alpha);
           for (int x = -4; x < 4; ++x) {
@@ -300,7 +298,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
               //   -13 <= column <= (source_width - 1) + 13.
               // Therefore we may over-read up to 13 pixels before the source
               // row, or up to 13 pixels after the source row.
-              const int column = ix4 + x + k - 3;
+              const int column = filter_params.ix4 + x + k - 3;
               sum += kWarpedFilters[offset][k] * src_row[column];
             }
             intermediate_result[y + 7][x + 4] =
@@ -315,7 +313,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
         // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
         // It follows that -6 <= iy4 <= source_height + 5. This inequality is
         // used below.
-        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                  beta * 7;
         for (int y = -7; y < 8; ++y) {
           // We assume the source frame has top and bottom borders of at least
           // 13 pixels that extend the frame boundary pixels.
@@ -326,7 +325,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
           //   -13 <= row <= (source_height - 1) + 13.
           // Therefore we may over-read up to 13 pixels above the top source
           // row, or up to 13 pixels below the bottom source row.
-          const int row = iy4 + y;
+          const int row = filter_params.iy4 + y;
           const Pixel* const src_row = src + row * source_stride;
           int sx = sx4 - MultiplyBy4(alpha);
           for (int x = -4; x < 4; ++x) {
@@ -352,7 +351,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
               //   -13 <= column <= (source_width - 1) + 13.
               // Therefore we may over-read up to 13 pixels before the source
               // row, or up to 13 pixels after the source row.
-              const int column = ix4 + x + k - 3;
+              const int column = filter_params.ix4 + x + k - 3;
               sum += kWarpedFilters[offset][k] * src_row[column];
             }
             intermediate_result[y + 7][x + 4] =
@@ -367,8 +366,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
       // Regions 3 and 4.
       // Vertical filter.
       DestType* dst_row = dst + start_x - block_start_x;
-      int sy4 =
-          (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+      int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+                MultiplyBy4(delta);
       // The spec says we should use the following loop condition:
       //   y < std::min(4, block_start_y + block_height - start_y - 4);
       // We can prove that block_start_y + block_height - start_y >= 8, which
@@ -460,7 +459,26 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Installs the 12bpp C warp functions.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS  // Set every entry unconditionally.
+  dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);  // |dsp| may be unused if every entry is skipped.
+#ifndef LIBGAV1_Dsp12bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
 #endif
+#ifndef LIBGAV1_Dsp12bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -469,6 +487,9 @@ void WarpInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/warp.h b/libgav1/src/dsp/warp.h
index 7367a9b..9c20f12 100644
--- a/libgav1/src/dsp/warp.h
+++ b/libgav1/src/dsp/warp.h
@@ -38,9 +38,39 @@
 namespace libgav1 {
 namespace dsp {
 
+// Section 7.11.3.5. Values produced by GetWarpFilterParams().
+struct WarpFilterParams {
+  int64_t x4;  // dst_x >> subsampling_x.
+  int64_t y4;  // dst_y >> subsampling_y.
+  int ix4;     // x4 >> kWarpedModelPrecisionBits, narrowed to int.
+  int iy4;     // y4 >> kWarpedModelPrecisionBits, narrowed to int.
+};
+
 // Initializes Dsp::warp. This function is not thread-safe.
 void WarpInit_C();
 
+// Section 7.11.3.5. Computes x4, y4, ix4 and iy4 for (src_x, src_y).
+inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y,
+                                            int subsampling_x,
+                                            int subsampling_y,
+                                            const int* warp_params) {
+  WarpFilterParams filter_params;
+  // warp_params[2]/[5] require 17 bits (the others 14), so only their products
+  // are widened to int64_t; with large resolutions those require 33 bits.
+  const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
+                        src_y * warp_params[3] + warp_params[0];
+  const int64_t dst_y = src_x * warp_params[4] +
+                        static_cast<int64_t>(src_y) * warp_params[5] +
+                        warp_params[1];
+  filter_params.x4 = dst_x >> subsampling_x;
+  filter_params.y4 = dst_y >> subsampling_y;
+  filter_params.ix4 =
+      static_cast<int>(filter_params.x4 >> kWarpedModelPrecisionBits);
+  filter_params.iy4 =
+      static_cast<int>(filter_params.y4 >> kWarpedModelPrecisionBits);
+  return filter_params;
+}
+
 }  // namespace dsp
 }  // namespace libgav1
 
diff --git a/libgav1/src/dsp/weight_mask.cc b/libgav1/src/dsp/weight_mask.cc
index 41f4c70..ee3808b 100644
--- a/libgav1/src/dsp/weight_mask.cc
+++ b/libgav1/src/dsp/weight_mask.cc
@@ -213,7 +213,86 @@ void Init10bpp() {
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {  // Installs the 12bpp C weight mask functions.
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS  // Set every entry unconditionally.
+  INIT_WEIGHT_MASK(8, 8, 12, 0, 0);  // Last two args track log2(dim) - 3.
+  INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);  // |dsp| may be unused if every entry is skipped.
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
 #endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 
 }  // namespace
 
@@ -222,6 +301,9 @@ void WeightMaskInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/x86/average_blend_sse4.cc b/libgav1/src/dsp/x86/average_blend_sse4.cc
index 911c5a9..c08b3d6 100644
--- a/libgav1/src/dsp/x86/average_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/average_blend_sse4.cc
@@ -35,24 +35,46 @@ namespace {
 
 constexpr int kInterPostRoundBit = 4;
 
-inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
-                             const int16_t* LIBGAV1_RESTRICT prediction_1,
-                             uint8_t* LIBGAV1_RESTRICT dest) {
-  const __m128i pred_0 = LoadLo8(prediction_0);
-  const __m128i pred_1 = LoadLo8(prediction_1);
-  __m128i res = _mm_add_epi16(pred_0, pred_1);
-  res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
-  Store4(dest, _mm_packus_epi16(res, res));
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                               const int16_t* LIBGAV1_RESTRICT prediction_1,
+                               uint8_t* LIBGAV1_RESTRICT dest,
+                               const ptrdiff_t dest_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+  res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+  res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+  const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+  Store4(dest, result_pixels);
+  dest += dest_stride;
+  const int result_1 = _mm_extract_epi32(result_pixels, 1);
+  memcpy(dest, &result_1, sizeof(result_1));
+  dest += dest_stride;
+  const int result_2 = _mm_extract_epi32(result_pixels, 2);
+  memcpy(dest, &result_2, sizeof(result_2));
+  dest += dest_stride;
+  const int result_3 = _mm_extract_epi32(result_pixels, 3);
+  memcpy(dest, &result_3, sizeof(result_3));
 }
 
 inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
                              const int16_t* LIBGAV1_RESTRICT prediction_1,
-                             uint8_t* LIBGAV1_RESTRICT dest) {
-  const __m128i pred_0 = LoadAligned16(prediction_0);
-  const __m128i pred_1 = LoadAligned16(prediction_1);
-  __m128i res = _mm_add_epi16(pred_0, pred_1);
-  res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
-  StoreLo8(dest, _mm_packus_epi16(res, res));
+                             uint8_t* LIBGAV1_RESTRICT dest,
+                             const ptrdiff_t dest_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+  res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+  res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+  const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+  StoreLo8(dest, result_pixels);
+  StoreHi8(dest + dest_stride, result_pixels);
 }
 
 inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
@@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
   int y = height;
 
   if (width == 4) {
+    const ptrdiff_t dest_stride4 = dest_stride << 2;
+    constexpr ptrdiff_t width4 = 4 << 2;
     do {
-      // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
-      // to load 8 values at a time.
-      AverageBlend4Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
-
-      AverageBlend4Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
+      AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+      dst += dest_stride4;
+      pred_0 += width4;
+      pred_1 += width4;
 
-      y -= 2;
+      y -= 4;
     } while (y != 0);
     return;
   }
 
   if (width == 8) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    constexpr ptrdiff_t width2 = 8 << 1;
     do {
-      AverageBlend8Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
-
-      AverageBlend8Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
+      AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
 
       y -= 2;
     } while (y != 0);
diff --git a/libgav1/src/dsp/x86/convolve_avx2.cc b/libgav1/src/dsp/x86/convolve_avx2.cc
index 4126ca9..6e94347 100644
--- a/libgav1/src/dsp/x86/convolve_avx2.cc
+++ b/libgav1/src/dsp/x86/convolve_avx2.cc
@@ -39,17 +39,17 @@ namespace {
 // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
 // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
 // sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
 __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
   __m256i sum;
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]);  // k2k1
     const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]);  // k4k3
     const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]);  // k6k5
     sum = _mm256_add_epi16(v_madd_21, v_madd_43);
     sum = _mm256_add_epi16(sum, v_madd_65);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]);  // k1k0
     const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]);  // k3k2
@@ -58,7 +58,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
     const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
     const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
     sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     sum = _mm256_maddubs_epi16(src[0], taps[0]);  // k4k3
   } else {
@@ -70,7 +70,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
   return sum;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m256i SumHorizontalTaps(const __m256i* const src,
                           const __m256i* const v_tap) {
   __m256i v_src[4];
@@ -78,32 +78,32 @@ __m256i SumHorizontalTaps(const __m256i* const src,
   const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
   const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
 
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
     v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
     v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
     v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
     v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
     v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
-  } else if (filter_index > 3) {
+  } else {
     // 4 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
     v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
   }
-  return SumOnePassTaps<filter_index>(v_src, v_tap);
+  return SumOnePassTaps<num_taps>(v_src, v_tap);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m256i SimpleHorizontalTaps(const __m256i* const src,
                              const __m256i* const v_tap) {
-  __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   // Normally the Horizontal pass does the downshift in two passes:
   // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -116,17 +116,16 @@ __m256i SimpleHorizontalTaps(const __m256i* const src,
   return _mm256_packus_epi16(sum, sum);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m256i HorizontalTaps8To16(const __m256i* const src,
                             const __m256i* const v_tap) {
-  const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
 // Filter 2xh sizes.
-template <int num_taps, int filter_index, bool is_2d = false,
-          bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
 void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
@@ -145,14 +144,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       do {
         if (is_2d) {
           const __m128i sum =
-              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+              HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
           Store4(&dest16[0], sum);
           dest16 += pred_stride;
           Store4(&dest16[0], _mm_srli_si128(sum, 8));
           dest16 += pred_stride;
         } else {
           const __m128i sum =
-              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+              SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
           Store2(dest8, sum);
           dest8 += pred_stride;
           Store2(dest8, _mm_srli_si128(sum, 4));
@@ -169,7 +168,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         assert(height % 2 == 1);
         __m128i sum;
         const __m128i input = LoadLo8(&src[2]);
-        if (filter_index == 3) {
+        if (num_taps == 2) {
           // 03 04 04 05 05 06 06 07 ....
           const __m128i v_src_43 =
               _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -194,8 +193,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
 }
 
 // Filter widths >= 4.
-template <int num_taps, int filter_index, bool is_2d = false,
-          bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
 void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
@@ -214,11 +212,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
           const __m256i src_long =
               SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
           const __m256i result =
-              HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+              HorizontalTaps8To16<num_taps>(&src_long, v_tap);
           const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
                                               LoadUnaligned16(&src[x + 24]));
           const __m256i result2 =
-              HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+              HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
           if (is_2d) {
             StoreAligned32(&dest16[x], result);
             StoreAligned32(&dest16[x + 16], result2);
@@ -230,11 +228,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
           // Load src used to calculate dest8[7:0] and dest8[23:16].
           const __m256i src_long = LoadUnaligned32(&src[x]);
           const __m256i result =
-              SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+              SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
           // Load src used to calculate dest8[15:8] and dest8[31:24].
           const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
           const __m256i result2 =
-              SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+              SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
           // Combine results and store.
           StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
         }
@@ -252,13 +250,12 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         // Load into 2 128 bit lanes.
         const __m256i src_long =
             SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
-        const __m256i result =
-            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
         const __m256i src_long2 =
             SetrM128i(LoadUnaligned16(&src[src_stride]),
                       LoadUnaligned16(&src[8 + src_stride]));
         const __m256i result2 =
-            HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+            HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
         if (is_2d) {
           StoreAligned32(&dest16[0], result);
           StoreAligned32(&dest16[pred_stride], result2);
@@ -270,12 +267,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         // Load into 2 128 bit lanes.
         const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
                                            LoadUnaligned16(&src[src_stride]));
-        const __m256i result =
-            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
         const __m256i src_long2 = SetrM128i(
             LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
         const __m256i result2 =
-            SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+            SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
         const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
         StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
         StoreUnaligned16(&dest8[pred_stride],
@@ -292,8 +288,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     if (is_2d) {
       const __m256i src_long =
           SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
-      const __m256i result =
-          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
       StoreAligned32(&dest16[0], result);
     }
 
@@ -306,8 +301,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       const __m128i next_row = LoadUnaligned16(&src[src_stride]);
       const __m256i src_long = SetrM128i(this_row, next_row);
       if (is_2d || is_compound) {
-        const __m256i result =
-            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
         if (is_2d) {
           StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
           StoreAligned16(&dest16[pred_stride],
@@ -322,8 +316,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         const __m128i next_row = LoadUnaligned16(&src[src_stride]);
         // Load into 2 128 bit lanes.
         const __m256i src_long = SetrM128i(this_row, next_row);
-        const __m256i result =
-            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
         StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
         StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
       }
@@ -337,8 +330,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     // filter the remaining row.
     if (is_2d) {
       const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
-      const __m256i result =
-          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
       StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
     }
 
@@ -351,8 +343,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       const __m128i next_row = LoadUnaligned16(&src[src_stride]);
       const __m256i src_long = SetrM128i(this_row, next_row);
       if (is_2d || is_compound) {
-        const __m256i result =
-            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
         StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
         StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
       } else {
@@ -360,8 +351,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         const __m128i next_row = LoadUnaligned16(&src[src_stride]);
         // Load into 2 128 bit lanes.
         const __m256i src_long = SetrM128i(this_row, next_row);
-        const __m256i result =
-            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
         Store4(&dest8[0], _mm256_castsi256_si128(result));
         Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
       }
@@ -375,8 +365,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     // filter the remaining row.
     if (is_2d) {
       const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
-      const __m256i result =
-          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
       StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
     }
   }
@@ -554,18 +543,15 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
   const __m128i v_horizontal_filter =
       LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
 
-  if (filter_index == 4) {  // 4 tap.
-    SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 5) {  // 4 tap.
+  if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else {  // 2 tap.
     SetupTaps<2>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   }
 }
 
@@ -582,28 +568,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
 
   if (filter_index == 2) {  // 8 tap.
     SetupTaps<8>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 0) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 4) {  // 4 tap.
-    SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else {  // 2 tap.
     SetupTaps<2>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   }
 }
 
@@ -617,7 +600,8 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
                      const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
 
   // The output of the horizontal filter is guaranteed to fit in 16 bits.
   alignas(32) uint16_t
@@ -730,61 +714,60 @@ __m256i Compound1DShift(const __m256i sum) {
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
-template <int filter_index, bool unpack_high = false>
+template <int num_taps, bool unpack_high = false>
 __m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
   __m256i v_src[4];
 
   if (!unpack_high) {
-    if (filter_index < 2) {
+    if (num_taps == 6) {
       // 6 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
       v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
-    } else if (filter_index == 2) {
+    } else if (num_taps == 8) {
       // 8 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
       v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
       v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
-    } else if (filter_index == 3) {
+    } else if (num_taps == 2) {
       // 2 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
-    } else if (filter_index > 3) {
+    } else {
       // 4 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
     }
   } else {
-    if (filter_index < 2) {
+    if (num_taps == 6) {
       // 6 taps.
       v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
       v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
-    } else if (filter_index == 2) {
+    } else if (num_taps == 8) {
       // 8 taps.
       v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
       v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
       v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
-    } else if (filter_index == 3) {
+    } else if (num_taps == 2) {
       // 2 taps.
       v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
-    } else if (filter_index > 3) {
+    } else {
       // 4 taps.
       v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
     }
   }
-  return SumOnePassTaps<filter_index>(v_src, v_tap);
+  return SumOnePassTaps<num_taps>(v_src, v_tap);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
                         const ptrdiff_t src_stride,
                         void* LIBGAV1_RESTRICT const dst,
                         const ptrdiff_t dst_stride, const int width,
                         const int height, const __m256i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -821,9 +804,9 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
       srcs[next_row] = LoadUnaligned32(src_x);
       src_x += src_stride;
 
-      const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       const __m256i sums_hi =
-          SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+          SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
       if (is_compound) {
         const __m256i results =
             Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -861,13 +844,12 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
   } while (x < width);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
                         const ptrdiff_t src_stride,
                         void* LIBGAV1_RESTRICT const dst,
                         const ptrdiff_t dst_stride, const int /*width*/,
                         const int height, const __m256i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -922,9 +904,9 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
     srcs[next_row - 1] = _mm256_inserti128_si256(
         srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
 
-    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
     const __m256i sums_hi =
-        SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+        SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
     if (is_compound) {
       const __m256i results =
           Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -964,13 +946,12 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
   } while (y != 0);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int /*width*/,
                        const int height, const __m256i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1025,7 +1006,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
     srcs[next_row - 1] = _mm256_inserti128_si256(
         srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
 
-    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
     if (is_compound) {
       const __m256i results = Compound1DShift(sums);
       const __m128i this_dst = _mm256_castsi256_si128(results);
@@ -1062,13 +1043,12 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
   } while (y != 0);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int /*width*/,
                        const int height, const __m128i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1101,7 +1081,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
     srcs[next_row] = LoadLo8(src_x);
     src_x += src_stride;
 
-    const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
     if (is_compound) {
       const __m128i results = Compound1DShift(sums);
       StoreUnaligned16(dst16, results);
@@ -1137,7 +1117,8 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
                            const int height, void* LIBGAV1_RESTRICT prediction,
                            const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
@@ -1151,43 +1132,43 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
   // Use 256 bits for width > 4.
   if (width > 4) {
     __m256i taps_256[4];
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
                              taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       } else {
-        FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else if (filter_index == 2) {  // 8 tap.
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
                              taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       } else {
-        FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else if (filter_index == 3) {  // 2 tap.
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
                              taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       } else {
-        FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else if (filter_index == 4) {  // 4 tap.
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps_256);
       if (width == 8) {
         FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
@@ -1199,67 +1180,38 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
         FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else {
-      SetupTaps<4>(&v_filter, taps_256);
-      if (width == 8) {
-        FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
-                             taps_256);
-      } else if (width == 16) {
-        FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
-                              taps_256);
-      } else {
-        FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
-                              taps_256);
-      }
     }
   } else {  // width <= 8
     // Use 128 bit code.
     __m128i taps[4];
 
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
       }
-    } else if (filter_index == 2) {  // 8 tap.
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
       }
-    } else if (filter_index == 3) {  // 2 tap.
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
-                                taps);
-      } else {
-        FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
-                                taps);
-      }
-    } else if (filter_index == 4) {  // 4 tap.
-      SetupTaps<4>(&v_filter, taps);
-      if (width == 2) {
-        FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
       }
-    } else {
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
       }
     }
   }
@@ -1272,7 +1224,8 @@ void ConvolveCompoundVertical_AVX2(
     const int vertical_filter_id, const int width, const int height,
     void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
@@ -1286,43 +1239,43 @@ void ConvolveCompoundVertical_AVX2(
   // Use 256 bits for width > 4.
   if (width > 4) {
     __m256i taps_256[4];
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<0, /*is_compound=*/true>(
+        FilterVertical8xH<6, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<0, /*is_compound=*/true>(
+        FilterVertical16xH<6, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else {
-        FilterVertical32xH<0, /*is_compound=*/true>(
+        FilterVertical32xH<6, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else if (filter_index == 2) {  // 8 tap.
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<2, /*is_compound=*/true>(
+        FilterVertical8xH<8, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<2, /*is_compound=*/true>(
+        FilterVertical16xH<8, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else {
-        FilterVertical32xH<2, /*is_compound=*/true>(
+        FilterVertical32xH<8, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else if (filter_index == 3) {  // 2 tap.
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<3, /*is_compound=*/true>(
+        FilterVertical8xH<2, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<3, /*is_compound=*/true>(
+        FilterVertical16xH<2, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else {
-        FilterVertical32xH<3, /*is_compound=*/true>(
+        FilterVertical32xH<2, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else if (filter_index == 4) {  // 4 tap.
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps_256);
       if (width == 8) {
         FilterVertical8xH<4, /*is_compound=*/true>(
@@ -1334,43 +1287,27 @@ void ConvolveCompoundVertical_AVX2(
         FilterVertical32xH<4, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else {
-      SetupTaps<4>(&v_filter, taps_256);
-      if (width == 8) {
-        FilterVertical8xH<5, /*is_compound=*/true>(
-            src, src_stride, dest, dest_stride, width, height, taps_256);
-      } else if (width == 16) {
-        FilterVertical16xH<5, /*is_compound=*/true>(
-            src, src_stride, dest, dest_stride, width, height, taps_256);
-      } else {
-        FilterVertical32xH<5, /*is_compound=*/true>(
-            src, src_stride, dest, dest_stride, width, height, taps_256);
-      }
     }
   } else {  // width <= 4
     // Use 128 bit code.
     __m128i taps[4];
 
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps);
-      FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else if (filter_index == 2) {  // 8 tap.
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps);
-      FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else if (filter_index == 3) {  // 2 tap.
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps);
-      FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else if (filter_index == 4) {  // 4 tap.
-      SetupTaps<4>(&v_filter, taps);
-      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps);
-      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
     }
   }
 }
@@ -1430,7 +1367,8 @@ void ConvolveCompound2D_AVX2(
     void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
 
   // The output of the horizontal filter is guaranteed to fit in 16 bits.
   alignas(32) uint16_t
diff --git a/libgav1/src/dsp/x86/convolve_sse4.cc b/libgav1/src/dsp/x86/convolve_sse4.cc
index f7e5a71..f427c4c 100644
--- a/libgav1/src/dsp/x86/convolve_sse4.cc
+++ b/libgav1/src/dsp/x86/convolve_sse4.cc
@@ -36,7 +36,7 @@ namespace {
 
 #include "src/dsp/x86/convolve_sse4.inc"
 
-template <int filter_index>
+template <int num_taps>
 __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                           const __m128i* const v_tap) {
   __m128i v_src[4];
@@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
   const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
   const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
 
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
     v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
     v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
     v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
     v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
     v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
-  } else if (filter_index > 3) {
+  } else {
     // 4 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
     v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
   }
-  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
   return sum;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                              const __m128i* const v_tap) {
-  __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   // Normally the Horizontal pass does the downshift in two passes:
   // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
   return _mm_packus_epi16(sum, sum);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
                             const __m128i* const v_tap) {
-  const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
-template <int num_taps, int filter_index, bool is_2d = false,
-          bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
 void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
@@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       int x = 0;
       do {
         if (is_2d || is_compound) {
-          const __m128i v_sum =
-              HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+          const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
           if (is_2d) {
             StoreAligned16(&dest16[x], v_sum);
           } else {
             StoreUnaligned16(&dest16[x], v_sum);
           }
         } else {
-          const __m128i result =
-              SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+          const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
           StoreLo8(&dest8[x], result);
         }
         x += 8;
@@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       int y = height;
       do {
         if (is_2d || is_compound) {
-          const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+          const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
           StoreLo8(dest16, v_sum);
         } else {
-          const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+          const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
           Store4(&dest8[0], result);
         }
         src += src_stride;
@@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       do {
         if (is_2d) {
           const __m128i sum =
-              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+              HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
           Store4(&dest16[0], sum);
           dest16 += pred_stride;
           Store4(&dest16[0], _mm_srli_si128(sum, 8));
           dest16 += pred_stride;
         } else {
           const __m128i sum =
-              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+              SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
           Store2(dest8, sum);
           dest8 += pred_stride;
           Store2(dest8, _mm_srli_si128(sum, 4));
@@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         assert(height % 2 == 1);
         __m128i sum;
         const __m128i input = LoadLo8(&src[2]);
-        if (filter_index == 3) {
+        if (num_taps == 2) {
           // 03 04 04 05 05 06 06 07 ....
           const __m128i v_src_43 =
               _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
 
   if (filter_index == 2) {  // 8 tap.
     SetupTaps<8>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 0) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 4) {  // 4 tap.
-    SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else {  // 2 tap.
     SetupTaps<2>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   }
 }
 
@@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
                        const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
 
   // The output of the horizontal filter is guaranteed to fit in 16 bits.
   alignas(16) uint16_t
@@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
   }
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
                     const ptrdiff_t src_stride,
                     void* LIBGAV1_RESTRICT const dst,
                     const ptrdiff_t dst_stride, const int width,
                     const int height, const __m128i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
       srcs[next_row] = LoadLo8(src_x);
       src_x += src_stride;
 
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       if (is_compound) {
         const __m128i results = Compound1DShift(sums);
         StoreUnaligned16(dst16_x, results);
@@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1(
     const int vertical_filter_id, const int width, const int height,
     void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
@@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1(
   const __m128i v_filter =
       LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
 
-  if (filter_index < 2) {  // 6 tap.
+  if (vertical_taps == 6) {  // 6 tap.
     SetupTaps<6>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
     } else {
-      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else if (filter_index == 2) {  // 8 tap.
+  } else if (vertical_taps == 8) {  // 8 tap.
     SetupTaps<8>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
     } else {
-      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else if (filter_index == 3) {  // 2 tap.
+  } else if (vertical_taps == 2) {  // 2 tap.
     SetupTaps<2>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
     } else {
-      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else if (filter_index == 4) {  // 4 tap.
+  } else {  // 4 tap
     SetupTaps<4>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else {
-    // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
-    // See convolve_neon.cc
-    SetupTaps<4>(&v_filter, taps);
-
-    if (width == 2) {
-      FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
-    } else if (width == 4) {
-      FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
-    } else {
-      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
-                        taps);
-    }
   }
 }
 
-void ConvolveCompoundCopy_SSE4(
+void ConvolveCompoundCopy_SSE4_1(
     const void* LIBGAV1_RESTRICT const reference,
     const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
     const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
@@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4(
             _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
         const __m128i v_dest_hi =
             _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
-        // TODO(slavarnway): Investigate using aligned stores.
         StoreUnaligned16(&dest[x], v_dest_lo);
         StoreUnaligned16(&dest[x + 8], v_dest_hi);
         x += 16;
@@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1(
     const int vertical_filter_id, const int width, const int height,
     void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
@@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1(
   const __m128i v_filter =
       LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
 
-  if (filter_index < 2) {  // 6 tap.
+  if (vertical_taps == 6) {  // 6 tap.
     SetupTaps<6>(&v_filter, taps);
     if (width == 4) {
-      FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
     } else {
-      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
     }
-  } else if (filter_index == 2) {  // 8 tap.
+  } else if (vertical_taps == 8) {  // 8 tap.
     SetupTaps<8>(&v_filter, taps);
-
     if (width == 4) {
-      FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
     } else {
-      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
     }
-  } else if (filter_index == 3) {  // 2 tap.
+  } else if (vertical_taps == 2) {  // 2 tap.
     SetupTaps<2>(&v_filter, taps);
-
     if (width == 4) {
-      FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
     } else {
-      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
     }
-  } else if (filter_index == 4) {  // 4 tap.
+  } else {  // 4 tap
     SetupTaps<4>(&v_filter, taps);
-
     if (width == 4) {
-      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
     } else {
       FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
     }
-  } else {
-    SetupTaps<4>(&v_filter, taps);
-
-    if (width == 4) {
-      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
-    } else {
-      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
-                                              width, height, taps);
-    }
   }
 }
 
@@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1(
   // Similarly for height.
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
   const int intermediate_height = height + vertical_taps - 1;
   const ptrdiff_t src_stride = reference_stride;
   const auto* const src = static_cast<const uint8_t*>(reference) -
@@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
                                                       source);
 
         StoreLo8(intermediate, RightShiftWithRounding_S16(
-                                   SumOnePassTaps<filter_index>(source, taps),
+                                   SumOnePassTaps<num_taps>(source, taps),
                                    kInterRoundBitsHorizontal - 1));
         src_x += src_stride;
         intermediate += kIntermediateStride;
@@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
       PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
 
       // Shift by one less because the taps are halved.
-      StoreAligned16(
-          intermediate_x,
-          RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
-                                     kInterRoundBitsHorizontal - 1));
+      StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+                                         SumOnePassTaps<num_taps>(source, taps),
+                                         kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
     } while (--y != 0);
@@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
   alignas(16) int16_t
       intermediate_result[kIntermediateAllocWidth *
                           (2 * kIntermediateAllocWidth + kSubPixelTaps)];
-  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
   const int intermediate_height =
       (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
        kScaleSubPixelBits) +
@@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
   // inputs in each iteration on large blocks. When step_x is large, we need a
   // second register and alignr in order to gather all filter inputs.
   // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
-  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
   const int kernel_start_ceiling = 16 - num_horiz_taps;
   // This truncated quotient |grade_x_threshold| selects |step_x| such that:
   // (step_x * 7) >> kScaleSubPixelBits < single load limit
@@ -1891,7 +1860,7 @@ void Init8bpp() {
   dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
   dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
 
-  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
   dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
   dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
   dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
diff --git a/libgav1/src/dsp/x86/convolve_sse4.inc b/libgav1/src/dsp/x86/convolve_sse4.inc
index 550d6a4..5548c5b 100644
--- a/libgav1/src/dsp/x86/convolve_sse4.inc
+++ b/libgav1/src/dsp/x86/convolve_sse4.inc
@@ -18,20 +18,63 @@
 
 #include "src/dsp/convolve.inc"
 
+// This version checks for the special cases when filter_index == 1.
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+  if (filter_index == 0) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 1) {
+    // Unlike |filter_index| == 0, only some |filter_id| values use 6 taps
+    // here: 1, 7, 8, 9 and 15. This is the special case that requires this
+    // version to take |filter_id| in addition to |filter_index|.
+    if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+         (filter_id == 8) | (filter_id == 9)) != 0) {
+      return 6;
+    }
+    // When |filter_index| == 1, the |filter_id| values not listed above map to
+    // 4 tap filters.
+    return 4;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4) the large filters are replaced with 4
+  // tap options.
+  // If the original filters were |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp| then it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+  // tap filter.
+  return 4;
+}
+
 // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
 // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
 // sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
 __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
   __m128i sum;
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
     const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
     const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
     sum = _mm_add_epi16(v_madd_21, v_madd_43);
     sum = _mm_add_epi16(sum, v_madd_65);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
     const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
@@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
     const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
     const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
     sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
   } else {
@@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
   return sum;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                              const __m128i* const v_tap) {
   // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
   const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
 
-  if (filter_index == 3) {
+  if (num_taps == 2) {
     // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
     const __m128i v_src_43 = _mm_shuffle_epi8(
         v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
@@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
   return v_sum_5432;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                 const __m128i* const v_tap) {
-  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+  __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
 
   // Normally the Horizontal pass does the downshift in two passes:
   // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
   return _mm_packus_epi16(sum, sum);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                 const __m128i* const v_tap) {
-  const __m128i sum =
-      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+  const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
 
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
@@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) {
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
   __m128i v_src[4];
 
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
     v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
     v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
     v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
     v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
     v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
-  } else if (filter_index > 3) {
+  } else {
     // 4 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
     v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
   }
-  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
   return sum;
 }
 
-// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
-// 2D version.
-template <int num_taps, int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
                        void* const dst, const ptrdiff_t dst_stride,
                        const int height, const __m128i* const v_tap) {
@@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       // 10 11 12 13 20 21 22 23
       srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
 
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       if (is_compound) {
         const __m128i results = Compound1DShift(sums);
         StoreUnaligned16(dst16, results);
@@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       // 30 31 32 33 40 41 42 43
       srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
 
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       if (is_compound) {
         const __m128i results = Compound1DShift(sums);
         StoreUnaligned16(dst16, results);
@@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       // 50 51 52 53 60 61 62 63
       srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
 
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       if (is_compound) {
         const __m128i results = Compound1DShift(sums);
         StoreUnaligned16(dst16, results);
@@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       // 70 71 72 73 80 81 82 83
       srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
 
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       if (is_compound) {
         const __m128i results = Compound1DShift(sums);
         StoreUnaligned16(dst16, results);
@@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
   }
 }
 
-template <int num_taps, int filter_index, bool negative_outside_taps = false>
+template <int num_taps, bool negative_outside_taps = false>
 void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
                        void* const dst, const ptrdiff_t dst_stride,
                        const int height, const __m128i* const v_tap) {
@@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
       // 10 11 20 21 30 31 40 41
       srcs[1] = _mm_srli_si128(srcs_0_2, 2);
       // This uses srcs[0]..srcs[1].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       const __m128i results_16 =
           RightShiftWithRounding_S16(sums, kFilterBits - 1);
       const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
       srcs[3] = _mm_srli_si128(srcs_0_4, 6);
 
       // This uses srcs[0]..srcs[3].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       const __m128i results_16 =
           RightShiftWithRounding_S16(sums, kFilterBits - 1);
       const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
       srcs[5] = _mm_srli_si128(srcs_4_8, 2);
 
       // This uses srcs[0]..srcs[5].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       const __m128i results_16 =
           RightShiftWithRounding_S16(sums, kFilterBits - 1);
       const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
       srcs[7] = _mm_srli_si128(srcs_4_8, 6);
 
       // This uses srcs[0]..srcs[7].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       const __m128i results_16 =
           RightShiftWithRounding_S16(sums, kFilterBits - 1);
       const __m128i results = _mm_packus_epi16(results_16, results_16);
diff --git a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
index c813df4..8c32117 100644
--- a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -34,54 +34,50 @@ namespace low_bitdepth {
 namespace {
 
 constexpr int kInterPostRoundBit = 4;
+constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1);
 
 inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
                                        const __m128i& pred1,
-                                       const __m128i& weights) {
-  // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
-  const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
-  const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
-  const __m128i result_lo =
-      RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
-
-  const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
-  const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
-  const __m128i result_hi =
-      RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
-
-  return _mm_packs_epi32(result_lo, result_hi);
+                                       const __m128i& weight) {
+  // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0
+  // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >>
+  //    8(=kInterPostRoundBit + 4)
+  // The formula is manipulated to avoid lengthening to 32 bits.
+  // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+  // = (p0 - p1) * w0 + 16 * p1
+  // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+  const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1);
+  // (((p0 - p1) * (w0 << 12)) >> 16) + ((16 * p1) >> 4)
+  const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight);
+  // ((p0 - p1) * w0 >> 4) + p1
+  const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1);
+  // (x << 11) >> 15 == x >> 4
+  const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust);
+  // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+  return _mm_mulhrs_epi16(upscaled_average, right_shift_prep);
 }
 
 template <int height>
 inline void DistanceWeightedBlend4xH_SSE4_1(
     const int16_t* LIBGAV1_RESTRICT pred_0,
-    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
-    const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
-  const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+  // Upscale the weight for mulhi.
+  const __m128i weights = _mm_set1_epi16(weight << 11);
 
   for (int y = 0; y < height; y += 4) {
-    // TODO(b/150326556): Use larger loads.
-    const __m128i src_00 = LoadLo8(pred_0);
-    const __m128i src_10 = LoadLo8(pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
-    __m128i src_0 = LoadHi8(src_00, pred_0);
-    __m128i src_1 = LoadHi8(src_10, pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
-    const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights);
-
-    const __m128i src_01 = LoadLo8(pred_0);
-    const __m128i src_11 = LoadLo8(pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
-    src_0 = LoadHi8(src_01, pred_0);
-    src_1 = LoadHi8(src_11, pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
-    const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights);
+    const __m128i src_00 = LoadAligned16(pred_0);
+    const __m128i src_10 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+    const __m128i src_01 = LoadAligned16(pred_0);
+    const __m128i src_11 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
 
     const __m128i result_pixels = _mm_packus_epi16(res0, res1);
     Store4(dst, result_pixels);
@@ -101,11 +97,11 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
 template <int height>
 inline void DistanceWeightedBlend8xH_SSE4_1(
     const int16_t* LIBGAV1_RESTRICT pred_0,
-    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
-    const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
-  const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+  // Upscale the weight for mulhi.
+  const __m128i weights = _mm_set1_epi16(weight << 11);
 
   for (int y = 0; y < height; y += 2) {
     const __m128i src_00 = LoadAligned16(pred_0);
@@ -130,11 +126,12 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
 
 inline void DistanceWeightedBlendLarge_SSE4_1(
     const int16_t* LIBGAV1_RESTRICT pred_0,
-    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, const int width, const int height,
-    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+    const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
-  const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+  // Upscale the weight for mulhi.
+  const __m128i weights = _mm_set1_epi16(weight << 11);
 
   int y = height;
   do {
@@ -162,23 +159,24 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
 void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
                                   const void* LIBGAV1_RESTRICT prediction_1,
                                   const uint8_t weight_0,
-                                  const uint8_t weight_1, const int width,
+                                  const uint8_t /*weight_1*/, const int width,
                                   const int height,
                                   void* LIBGAV1_RESTRICT const dest,
                                   const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  const uint8_t weight = weight_0;
   if (width == 4) {
     if (height == 4) {
-      DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
-                                         dest, dest_stride);
+      DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+                                         dest_stride);
     } else if (height == 8) {
-      DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
-                                         dest, dest_stride);
+      DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+                                         dest_stride);
     } else {
       assert(height == 16);
-      DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
-                                          dest, dest_stride);
+      DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+                                          dest_stride);
     }
     return;
   }
@@ -186,28 +184,28 @@ void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
   if (width == 8) {
     switch (height) {
       case 4:
-        DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
-                                           dest, dest_stride);
+        DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+                                           dest_stride);
         return;
       case 8:
-        DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
-                                           dest, dest_stride);
+        DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+                                           dest_stride);
         return;
       case 16:
-        DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
-                                            dest, dest_stride);
+        DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+                                            dest_stride);
         return;
       default:
         assert(height == 32);
-        DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
-                                            dest, dest_stride);
+        DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest,
+                                            dest_stride);
 
         return;
     }
   }
 
-  DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
-                                    height, dest, dest_stride);
+  DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest,
+                                    dest_stride);
 }
 
 void Init8bpp() {
@@ -273,27 +271,19 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
 
   int y = height;
   do {
-    const __m128i src_00 = LoadLo8(pred_0);
-    const __m128i src_10 = LoadLo8(pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
-    __m128i src_0 = LoadHi8(src_00, pred_0);
-    __m128i src_1 = LoadHi8(src_10, pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
+    const __m128i src_00 = LoadAligned16(pred_0);
+    const __m128i src_10 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
     const __m128i res0 =
-        ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
-
-    const __m128i src_01 = LoadLo8(pred_0);
-    const __m128i src_11 = LoadLo8(pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
-    src_0 = LoadHi8(src_01, pred_0);
-    src_1 = LoadHi8(src_11, pred_1);
-    pred_0 += 4;
-    pred_1 += 4;
+        ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+    const __m128i src_01 = LoadAligned16(pred_0);
+    const __m128i src_11 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
     const __m128i res1 =
-        ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+        ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
 
     StoreLo8(dst, res0);
     dst += dest_stride;
diff --git a/libgav1/src/dsp/x86/film_grain_sse4.cc b/libgav1/src/dsp/x86/film_grain_sse4.cc
index 9ece947..59d18a6 100644
--- a/libgav1/src/dsp/x86/film_grain_sse4.cc
+++ b/libgav1/src/dsp/x86/film_grain_sse4.cc
@@ -23,14 +23,15 @@
 #include <cstdint>
 #include <cstring>
 
-#include "src/dsp/common.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/film_grain_common.h"
 #include "src/dsp/x86/common_sse4.h"
+#include "src/utils/array_2d.h"
 #include "src/utils/common.h"
 #include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -165,7 +166,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
   int y = 0;
   do {
     int x = 0;
-    for (; x < safe_width; x += 8) {
+    for (; x + 8 <= safe_width; x += 8) {
       const __m128i orig = LoadSource(&in_y_row[x]);
       const __m128i scaling =
           GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
@@ -181,6 +182,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
       // Prevent arbitrary indices from entering GetScalingFactors.
       memset(luma_buffer, 0, sizeof(luma_buffer));
       const int valid_range = width - x;
+      assert(valid_range < 8);
       memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
       luma_buffer[valid_range] = in_y_row[width - 1];
       const __m128i orig = LoadSource(&in_y_row[x]);
@@ -239,7 +241,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
   int y = 0;
   do {
     int x = 0;
-    for (; x < safe_chroma_width; x += 8) {
+    for (; x + 8 <= safe_chroma_width; x += 8) {
       const int luma_x = x << subsampling_x;
       const __m128i average_luma =
           GetAverageLuma(&in_y_row[luma_x], subsampling_x);
@@ -252,8 +254,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
     }
 
-    // This section only runs if width % (8 << sub_x) != 0. It should never run
-    // on 720p and above.
     if (x < chroma_width) {
       // Prevent huge indices from entering GetScalingFactors due to
       // uninitialized values. This is not a problem in 8bpp because the table
@@ -365,7 +365,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
   int y = 0;
   do {
     int x = 0;
-    for (; x < safe_chroma_width; x += 8) {
+    for (; x + 8 <= safe_chroma_width; x += 8) {
       const int luma_x = x << subsampling_x;
       const __m128i average_luma =
           GetAverageLuma(&in_y_row[luma_x], subsampling_x);
diff --git a/libgav1/src/dsp/x86/intrapred_directional_sse4.cc b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc
index e642aee..bc61745 100644
--- a/libgav1/src/dsp/x86/intrapred_directional_sse4.cc
+++ b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc
@@ -624,14 +624,6 @@ inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
   }
 }
 
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
-    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
 template <bool upsampled>
 inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
     uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
@@ -729,6 +721,103 @@ inline void DirectionalZone1Blend_8xH(
   }
 }
 
+template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_8xH(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const int x, const int left_offset,
+    const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
+    const __m128i& left_y) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Loop incrementers for moving by block (8x8). This function handles blocks
+  // with height 4 as well. They are calculated in one pass so these variables
+  // do not get used.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+  const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+
+  // Cover 8x4 case.
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  uint8_t* dst_x = dst + x;
+
+  // Round down to the nearest multiple of 8 (or 4, if height is 4).
+  const int max_top_only_y =
+      std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
+  DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                       max_top_only_y, -xstep, upsampled_top);
+  DirectionalZone1_4xH(dst_x + 4, stride,
+                       top_row + ((x + 4) << upsample_top_shift),
+                       max_top_only_y, -xstep, upsampled_top);
+  if (max_top_only_y == height) return;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i dest_index_x =
+      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  int y = max_top_only_y;
+  dst_x += stride * y;
+  const int xstep_y = xstep * y;
+  const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+  // All rows from |min_left_only_y| down for this set of columns, only need
+  // |left_column| to compute.
+  const int min_left_only_y =
+      Align(std::min(((x + 8) << 6) / xstep, height), 8);
+
+  __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+  __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+  int top_x = -xstep_y;
+
+  const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+  for (; y < min_left_only_y;
+       y += 8, dst_x += stride8,
+       xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+       xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+       top_x -= xstep8) {
+    // Pick up from the last y-value, using the 10% slower but secure method for
+    // left prediction.
+    if (shuffle_left_column) {
+      DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+    } else {
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep);
+    }
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                      shift_mask),
+        1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+    __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+    DirectionalZone1Blend_8xH<upsampled_top, 8>(
+        dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+        xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+  }
+  // Loop over y for left_only rows.
+  for (; y < height; y += 8, dst_x += stride8) {
+    DirectionalZone3_8xH<upsampled_left, 8>(
+        dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
+        base_left_y, -ystep);
+  }
+}
+
 // 7.11.2.4 (8) 90 < angle > 180
 // The strategy for this function is to know how many blocks can be processed
 // with just pixels from |top_ptr|, then handle mixed blocks, then handle only
@@ -742,29 +831,11 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
                                     const int width, const int height,
                                     const int xstep, const int ystep) {
   auto* dst = static_cast<uint8_t*>(dest);
-  const int upsample_left_shift = static_cast<int>(upsampled_left);
   const int upsample_top_shift = static_cast<int>(upsampled_top);
-  const __m128i max_shift = _mm_set1_epi8(32);
-  const ptrdiff_t stride8 = stride << 3;
-  const __m128i dest_index_x =
-      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
-  const __m128i sampler_top =
-      upsampled_top
-          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
-          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
-  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-  // All columns from |min_top_only_x| to the right will only need |top_row| to
-  // compute. This assumes minimum |xstep| is 3.
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute. This assumes minimum |xstep| is 3.
   const int min_top_only_x = std::min((height * xstep) >> 6, width);
 
-  // For steep angles, the source pixels from left_column may not fit in a
-  // 16-byte load for shuffling.
-  // TODO(petersonab): Find a more precise formula for this subject to x.
-  const int max_shuffle_height =
-      std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
-  const int xstep8 = xstep << 3;
-  const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
   // Accumulate xstep across 8 rows.
   const __m128i xstep_dup = _mm_set1_epi16(-xstep);
   const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
@@ -787,105 +858,39 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
   // offset. Following values need the full ystep as a relative offset.
   const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
   const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  const __m128i dest_index_x =
+      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
   __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
   left_y = _mm_add_epi16(ystep_init, left_y);
 
+  // Analysis finds that, for most angles (ystep < 132), all segments that use
+  // both top_row and left_column can compute from left_column using byte
+  // shuffles from a single vector. For steeper angles, the shuffle is also
+  // fully reliable when x >= 32.
+  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
   const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
   int x = 0;
 
-  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
-  // The first stage, before the first y-loop, covers blocks that are only
-  // computed from the top row. The second stage, comprising two y-loops, covers
-  // blocks that have a mixture of values computed from top or left. The final
-  // stage covers blocks that are only computed from the left.
+  for (int left_offset = -left_base_increment; x < min_shuffle_x;
+       x += 8,
+           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+           // Watch left_y because it can still get big.
+       left_y = _mm_add_epi16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_for_shift, xstep_bounds_base, left_y);
+  }
   for (int left_offset = -left_base_increment; x < min_top_only_x;
        x += 8,
            xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
            // Watch left_y because it can still get big.
        left_y = _mm_add_epi16(left_y, increment_left8),
            left_offset -= left_base_increment8) {
-    uint8_t* dst_x = dst + x;
-
-    // Round down to the nearest multiple of 8.
-    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
-    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
-                         max_top_only_y, -xstep, upsampled_top);
-    DirectionalZone1_4xH(dst_x + 4, stride,
-                         top_row + ((x + 4) << upsample_top_shift),
-                         max_top_only_y, -xstep, upsampled_top);
-
-    int y = max_top_only_y;
-    dst_x += stride * y;
-    const int xstep_y = xstep * y;
-    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
-    // All rows from |min_left_only_y| down for this set of columns, only need
-    // |left_column| to compute.
-    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
-    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
-    // high. This means that max_shuffle_height is unbounded and xstep_bounds
-    // will overflow in 16 bits. This is prevented by stopping the first
-    // blending loop at min_left_only_y for such cases, which means we skip over
-    // the second blending loop as well.
-    const int left_shuffle_stop_y =
-        std::min(max_shuffle_height, min_left_only_y);
-    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
-    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
-    int top_x = -xstep_y;
-
-    for (; y < left_shuffle_stop_y;
-         y += 8, dst_x += stride8,
-         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
-         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
-         top_x -= xstep8) {
-      DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
-          dst_x, stride,
-          left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
-      __m128i shifts = _mm_srli_epi16(
-          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
-                        shift_mask),
-          1);
-      shifts = _mm_packus_epi16(shifts, shifts);
-      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
-      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-      __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-      DirectionalZone1Blend_8xH<upsampled_top, 8>(
-          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
-          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
-    }
-    // Pick up from the last y-value, using the 10% slower but secure method for
-    // left prediction.
-    const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
-    for (; y < min_left_only_y;
-         y += 8, dst_x += stride8,
-         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
-         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
-         top_x -= xstep8) {
-      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
-      DirectionalZone3_8xH<upsampled_left, 8>(
-          dst_x, stride,
-          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
-          -ystep);
-
-      __m128i shifts = _mm_srli_epi16(
-          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
-                        shift_mask),
-          1);
-      shifts = _mm_packus_epi16(shifts, shifts);
-      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
-      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-      DirectionalZone1Blend_8xH<upsampled_top, 8>(
-          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
-          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
-    }
-    // Loop over y for left_only rows.
-    for (; y < height; y += 8, dst_x += stride8) {
-      DirectionalZone3_8xH<upsampled_left, 8>(
-          dst_x, stride,
-          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
-          -ystep);
-    }
+    DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
+        dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+        xstep_for_shift, xstep_bounds_base, left_y);
   }
   for (; x < width; x += 4) {
     DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
@@ -952,8 +957,8 @@ inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
            left_offset -= left_base_increment4) {
     uint8_t* dst_x = dst + x;
 
-    // Round down to the nearest multiple of 8.
-    const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+    // Round down to the nearest multiple of 4.
+    const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
     DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
                          max_top_only_y, -xstep, upsampled_top);
     int y = max_top_only_y;
diff --git a/libgav1/src/dsp/x86/loop_restoration_sse4.cc b/libgav1/src/dsp/x86/loop_restoration_sse4.cc
index 3363f0e..b4df072 100644
--- a/libgav1/src/dsp/x86/loop_restoration_sse4.cc
+++ b/libgav1/src/dsp/x86/loop_restoration_sse4.cc
@@ -2088,6 +2088,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
     uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
     uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
   __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  ma5[1] = _mm_setzero_si128();  // Quiets -Wmaybe-uninitialized with gcc.
   s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
   s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
   sq[0][0] = SquareLo8(s[0][0]);
diff --git a/libgav1/src/dsp/x86/mask_blend_sse4.cc b/libgav1/src/dsp/x86/mask_blend_sse4.cc
index a18444b..833814c 100644
--- a/libgav1/src/dsp/x86/mask_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/mask_blend_sse4.cc
@@ -30,35 +30,81 @@
 
 namespace libgav1 {
 namespace dsp {
-namespace low_bitdepth {
 namespace {
 
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 = LoadUnaligned16(mask);
+    const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+    const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+    return RightShiftWithRounding_U16(mask_0, 2);
+  }
+  if (subsampling_x == 1) {
+    const __m128i row_vals = LoadUnaligned16(mask);
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+    return RightShiftWithRounding_U16(subsampled_mask, 1);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
+
+// Imitate behavior of ARM vtrn1q_u64.
+inline __m128i Transpose1_U64(const __m128i a, const __m128i b) {
+  return _mm_castps_si128(
+      _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Imitate behavior of ARM vtrn2q_u64, but with |a| and |b| swapped.
+inline __m128i Transpose2_U64(const __m128i a, const __m128i b) {
+  return _mm_castps_si128(
+      _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
 // Width can only be 4 when it is subsampled from a block of width 8, hence
 // subsampling_x is always 1 when this function is called.
 template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+inline __m128i GetMask4x2(const uint8_t* mask) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const __m128i mask_val_01 = LoadUnaligned16(mask);
+    // Stride is fixed because this is the smallest block size.
+    const __m128i mask_val_23 = LoadUnaligned16(mask + 16);
+    // Transpose rows to add row 0 to row 1, and row 2 to row 3.
+    const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23);
+    const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01);
+    const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13);
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+    return RightShiftWithRounding_U16(mask_0, 2);
+  }
+  return GetMask8<subsampling_x, 0>(mask, 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask4x2(const uint8_t* mask,
+                                    ptrdiff_t mask_stride) {
   if (subsampling_x == 1) {
-    const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
-    const __m128i mask_val_1 =
-        _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
-    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
-    if (subsampling_y == 1) {
-      const __m128i next_mask_val_0 =
-          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
-      const __m128i next_mask_val_1 =
-          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
-      subsampled_mask = _mm_add_epi16(
-          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
-    }
-    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+    return GetMask4x2<subsampling_x, subsampling_y>(mask);
   }
+  // When using intra or difference weighted masks, the function doesn't use
+  // subsampling, so |mask_stride| may be 4 or 8.
+  assert(subsampling_y == 0 && subsampling_x == 0);
   const __m128i mask_val_0 = Load4(mask);
   const __m128i mask_val_1 = Load4(mask + mask_stride);
   return _mm_cvtepu8_epi16(
       _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
 }
 
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
 // This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
 // 16-bit is also the lowest packing for hadd, but without subsampling there is
 // an unfortunate conversion required.
@@ -87,38 +133,6 @@ inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
   return _mm_cvtepu8_epi16(mask_val);
 }
 
-// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
-// when is_inter_intra is true, the prediction values are brought to 8-bit
-// packing as well.
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask,
-                                  ptrdiff_t stride) {
-  if (subsampling_x == 1) {
-    const __m128i row_vals = LoadUnaligned16(mask);
-
-    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
-    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
-    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
-
-    if (subsampling_y == 1) {
-      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
-      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
-      const __m128i next_mask_val_1 =
-          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
-      subsampled_mask = _mm_add_epi16(
-          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
-    }
-    const __m128i ret =
-        RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
-    return _mm_packus_epi16(ret, ret);
-  }
-  assert(subsampling_y == 0 && subsampling_x == 0);
-  // Unfortunately there is no shift operation for 8-bit packing, or else we
-  // could return everything with 8-bit packing.
-  const __m128i mask_val = LoadLo8(mask);
-  return mask_val;
-}
-
 inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
                                   const int16_t* LIBGAV1_RESTRICT const pred_1,
                                   const __m128i pred_mask_0,
@@ -149,15 +163,14 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
-                                 const int16_t* LIBGAV1_RESTRICT pred_1,
-                                 const uint8_t* LIBGAV1_RESTRICT mask,
-                                 const ptrdiff_t mask_stride,
-                                 uint8_t* LIBGAV1_RESTRICT dst,
-                                 const ptrdiff_t dst_stride) {
+inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                   const int16_t* LIBGAV1_RESTRICT pred_1,
+                                   const uint8_t* LIBGAV1_RESTRICT mask,
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
   const __m128i mask_inverter = _mm_set1_epi16(64);
-  __m128i pred_mask_0 =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
   __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
   WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                         dst_stride);
@@ -166,30 +179,30 @@ inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
   mask += mask_stride << (1 + subsampling_y);
   dst += dst_stride << 1;
 
-  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
   pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
   WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                         dst_stride);
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
-                                 const int16_t* LIBGAV1_RESTRICT pred_1,
-                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
-                                 const ptrdiff_t mask_stride, const int height,
-                                 uint8_t* LIBGAV1_RESTRICT dst,
-                                 const ptrdiff_t dst_stride) {
+inline void MaskBlending4xH_SSE4_1(
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1,
+    const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height,
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+  assert(subsampling_x == 1);
   const uint8_t* mask = mask_ptr;
+  constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
   if (height == 4) {
-    MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
-        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+    MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask,
+                                                         dst, dst_stride);
     return;
   }
   const __m128i mask_inverter = _mm_set1_epi16(64);
   int y = 0;
   do {
-    __m128i pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
 
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
@@ -199,7 +212,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
     mask += mask_stride << (1 + subsampling_y);
     dst += dst_stride << 1;
 
-    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                           dst_stride);
@@ -208,7 +221,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
     mask += mask_stride << (1 + subsampling_y);
     dst += dst_stride << 1;
 
-    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                           dst_stride);
@@ -217,7 +230,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
     mask += mask_stride << (1 + subsampling_y);
     dst += dst_stride << 1;
 
-    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
                           dst_stride);
@@ -230,21 +243,21 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                           const void* LIBGAV1_RESTRICT prediction_1,
-                           const ptrdiff_t /*prediction_stride_1*/,
-                           const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
-                           const ptrdiff_t mask_stride, const int width,
-                           const int height, void* LIBGAV1_RESTRICT dest,
-                           const ptrdiff_t dst_stride) {
+inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             const ptrdiff_t /*prediction_stride_1*/,
+                             const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                             const ptrdiff_t mask_stride, const int width,
+                             const int height, void* LIBGAV1_RESTRICT dest,
+                             const ptrdiff_t dst_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   const ptrdiff_t pred_stride_0 = width;
   const ptrdiff_t pred_stride_1 = width;
   if (width == 4) {
-    MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
-        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+    MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask_ptr, height, dst, dst_stride);
     return;
   }
   const uint8_t* mask = mask_ptr;
@@ -293,7 +306,6 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
   const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
 
   const __m128i pred_val_0 = LoadLo8(pred_0);
-  // TODO(b/150326556): One load.
   __m128i pred_val_1 = Load4(pred_1);
   pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
                             pred_val_1);
@@ -309,16 +321,16 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4x4_SSE4(
+inline void InterIntraMaskBlending8bpp4x4_SSE4_1(
     const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
     const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
     const ptrdiff_t mask_stride) {
   const __m128i mask_inverter = _mm_set1_epi8(64);
   const __m128i pred_mask_u16_first =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
   mask += mask_stride << (1 + subsampling_y);
   const __m128i pred_mask_u16_second =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
   mask += mask_stride << (1 + subsampling_y);
   __m128i pred_mask_1 =
       _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
@@ -335,26 +347,26 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4(
 }
 
 template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4xH_SSE4(
+inline void InterIntraMaskBlending8bpp4xH_SSE4_1(
     const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
     const ptrdiff_t pred_stride_1,
     const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
     const int height) {
   const uint8_t* mask = mask_ptr;
   if (height == 4) {
-    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+    InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
         pred_0, pred_1, pred_stride_1, mask, mask_stride);
     return;
   }
   int y = 0;
   do {
-    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+    InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
         pred_0, pred_1, pred_stride_1, mask, mask_stride);
     pred_0 += 4 << 2;
     pred_1 += pred_stride_1 << 2;
     mask += mask_stride << (2 + subsampling_y);
 
-    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+    InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
         pred_0, pred_1, pred_stride_1, mask, mask_stride);
     pred_0 += 4 << 2;
     pred_1 += pred_stride_1 << 2;
@@ -363,14 +375,31 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4(
   } while (y < height);
 }
 
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask,
+                                      ptrdiff_t stride) {
+  if (subsampling_x == 1) {
+    const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride);
+    return _mm_packus_epi16(ret, ret);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  // Unfortunately there is no shift operation for 8-bit packing, or else we
+  // could return everything with 8-bit packing.
+  const __m128i mask_val = LoadLo8(mask);
+  return mask_val;
+}
+
 template <int subsampling_x, int subsampling_y>
-void InterIntraMaskBlend8bpp_SSE4(
+void InterIntraMaskBlend8bpp_SSE4_1(
     const uint8_t* LIBGAV1_RESTRICT prediction_0,
     uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
     const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
     const int width, const int height) {
   if (width == 4) {
-    InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+    InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
         prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
         height);
     return;
@@ -382,7 +411,7 @@ void InterIntraMaskBlend8bpp_SSE4(
     int x = 0;
     do {
       const __m128i pred_mask_1 =
-          GetInterIntraMask8<subsampling_x, subsampling_y>(
+          GetInterIntraMask8bpp8<subsampling_x, subsampling_y>(
               mask + (x << subsampling_x), mask_stride);
       // 64 - mask
       const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
@@ -411,24 +440,24 @@ void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
 #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
-  dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
+  dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
-  dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
-  dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
+  dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>;
 #endif
   // The is_inter_intra index of mask_blend[][] is replaced by
   // inter_intra_mask_blend_8bpp[] in 8-bit.
 #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
-  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
-  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
-  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>;
 #endif
 }
 
@@ -443,14 +472,6 @@ constexpr int kMax10bppSample = (1 << 10) - 1;
 constexpr int kMaskInverse = 64;
 constexpr int kRoundBitsMaskBlend = 4;
 
-inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
-                                              const __m128i zero) {
-  // Shift out all but the last bit.
-  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
-  // Avg with zero will shift by 1 and round.
-  return _mm_avg_epu16(v_tmp_d, zero);
-}
-
 inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
                                                const __m128i shift) {
   const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
@@ -458,53 +479,31 @@ inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
 }
 
 template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
-                          const __m128i zero) {
-  if (subsampling_x == 1) {
-    if (subsampling_y == 0) {
-      const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
-      const __m128i mask_val_1 =
-          _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
-      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
-      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
-    }
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i mask_val_0 =
-        LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
-    const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
-                                       mask + (mask_stride << 1) + mask_stride);
-    const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
-    const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
-    return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+inline __m128i GetMask4x2(const uint8_t* mask) {
+  if (subsampling_x == 1 && subsampling_y == 1) {
+    const __m128i mask_row_01 = LoadUnaligned16(mask);
+    const __m128i mask_row_23 = LoadUnaligned16(mask + 16);
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+    const __m128i mask_val_1 =
+        _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+    const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23);
+    const __m128i mask_val_3 =
+        _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8));
+    const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2);
+    const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3);
+    const __m128i subsampled_mask =
+        _mm_add_epi16(subsampled_mask_02, subsampled_mask_13);
+    return RightShiftWithRounding_U16(subsampled_mask, 2);
   }
-  assert(subsampling_y == 0 && subsampling_x == 0);
-  const __m128i mask_val_0 = Load4(mask);
-  const __m128i mask_val_1 = Load4(mask + mask_stride);
-  return _mm_cvtepu8_epi16(
-      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
-}
-
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
-                        const __m128i zero) {
   if (subsampling_x == 1) {
-    if (subsampling_y == 0) {
-      const __m128i row_vals = LoadUnaligned16(mask);
-      const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
-      const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
-      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
-      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
-    }
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i mask_val_0 = LoadUnaligned16(mask);
-    const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
-    const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
-    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
-    return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+    const __m128i mask_row_01 = LoadUnaligned16(mask);
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+    const __m128i mask_val_1 =
+        _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+    const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+    return RightShiftWithRounding_U16(subsampled_mask, 1);
   }
-  assert(subsampling_y == 0 && subsampling_x == 0);
-  const __m128i mask_val = LoadLo8(mask);
-  return _mm_cvtepu8_epi16(mask_val);
+  return _mm_cvtepu8_epi16(LoadLo8(mask));
 }
 
 inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
@@ -558,12 +557,10 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
                                      uint16_t* LIBGAV1_RESTRICT dst,
                                      const ptrdiff_t dst_stride) {
   const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
-  const __m128i zero = _mm_setzero_si128();
   const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
   const __m128i offset = _mm_set1_epi32(kCompoundOffset);
   const __m128i max = _mm_set1_epi16(kMax10bppSample);
-  __m128i pred_mask_0 =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
   __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
   WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
                                     pred_mask_1, offset, max, shift4, dst,
@@ -573,8 +570,7 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
   mask += mask_stride << (1 + subsampling_y);
   dst += dst_stride << 1;
 
-  pred_mask_0 =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
   pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
   WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
                                     pred_mask_1, offset, max, shift4, dst,
@@ -595,7 +591,6 @@ inline void MaskBlend10bpp4xH_SSE4_1(
     return;
   }
   const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
-  const __m128i zero = _mm_setzero_si128();
   const uint8_t pred0_stride2 = 4 << 1;
   const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
   const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
@@ -605,8 +600,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
   const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
   int y = height;
   do {
-    __m128i pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
 
     WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
@@ -617,8 +611,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
     mask += mask_stride2;
     dst += dst_stride2;
 
-    pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                       pred_mask_0, pred_mask_1, offset, max,
@@ -628,8 +621,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
     mask += mask_stride2;
     dst += dst_stride2;
 
-    pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                       pred_mask_0, pred_mask_1, offset, max,
@@ -639,8 +631,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
     mask += mask_stride2;
     dst += dst_stride2;
 
-    pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                       pred_mask_0, pred_mask_1, offset, max,
@@ -675,7 +666,6 @@ inline void MaskBlend10bpp_SSE4_1(
   }
   const uint8_t* mask = mask_ptr;
   const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
-  const __m128i zero = _mm_setzero_si128();
   const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
   const __m128i offset = _mm_set1_epi32(kCompoundOffset);
   const __m128i max = _mm_set1_epi16(kMax10bppSample);
@@ -685,7 +675,7 @@ inline void MaskBlend10bpp_SSE4_1(
     int x = 0;
     do {
       const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
-          mask + (x << subsampling_x), mask_stride, zero);
+          mask + (x << subsampling_x), mask_stride);
       const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
       const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
       // 64 - mask
@@ -729,7 +719,6 @@ inline void MaskBlend10bpp_SSE4_1(
     mask += mask_stride_ss;
   } while (--y != 0);
 }
-
 inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
     const uint16_t* LIBGAV1_RESTRICT prediction_0,
     const uint16_t* LIBGAV1_RESTRICT prediction_1,
@@ -764,9 +753,8 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
     uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
   const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
   const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
-  const __m128i zero = _mm_setzero_si128();
   __m128i pred_mask_0 =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
   __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
   InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                               pred_mask_0, pred_mask_1, shift6,
@@ -777,7 +765,7 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
   dst += dst_stride << 1;
 
   pred_mask_0 =
-      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
   pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
   InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                               pred_mask_0, pred_mask_1, shift6,
@@ -798,7 +786,6 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
     return;
   }
   const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
-  const __m128i zero = _mm_setzero_si128();
   const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
   const uint8_t pred0_stride2 = 4 << 1;
   const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
@@ -807,7 +794,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
   int y = height;
   do {
     __m128i pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
     __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                                 pred_mask_0, pred_mask_1,
@@ -818,7 +805,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
     dst += dst_stride2;
 
     pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                                 pred_mask_0, pred_mask_1,
@@ -829,7 +816,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
     dst += dst_stride2;
 
     pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                                 pred_mask_0, pred_mask_1,
@@ -840,7 +827,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
     dst += dst_stride2;
 
     pred_mask_0 =
-        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+        GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
     pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
     InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
                                                 pred_mask_0, pred_mask_1,
@@ -876,14 +863,13 @@ inline void InterIntraMaskBlend10bpp_SSE4_1(
   const uint8_t* mask = mask_ptr;
   const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
   const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
-  const __m128i zero = _mm_setzero_si128();
   const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
   int y = height;
   do {
     int x = 0;
     do {
       const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
-          mask + (x << subsampling_x), mask_stride, zero);
+          mask + (x << subsampling_x), mask_stride);
       const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
       const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
       // 64 - mask
diff --git a/libgav1/src/dsp/x86/obmc_sse4.cc b/libgav1/src/dsp/x86/obmc_sse4.cc
index 8ce23b4..f068ff3 100644
--- a/libgav1/src/dsp/x86/obmc_sse4.cc
+++ b/libgav1/src/dsp/x86/obmc_sse4.cc
@@ -39,8 +39,8 @@ namespace {
 inline void OverlapBlendFromLeft2xH_SSE4_1(
     uint8_t* LIBGAV1_RESTRICT const prediction,
     const ptrdiff_t prediction_stride, const int height,
-    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_prediction_stride) {
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 2;
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
   int y = height;
   do {
     const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
-    const __m128i obmc_pred_val =
-        Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+    const __m128i obmc_pred_val = Load4(obmc_pred);
 
     const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
     const __m128i result =
@@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
 inline void OverlapBlendFromLeft4xH_SSE4_1(
     uint8_t* LIBGAV1_RESTRICT const prediction,
     const ptrdiff_t prediction_stride, const int height,
-    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_prediction_stride) {
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 4;
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
   int y = height;
   do {
     const __m128i pred_val0 = Load4(pred);
-    const __m128i obmc_pred_val0 = Load4(obmc_pred);
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
 
     // Place the second row of each source in the second four bytes.
     const __m128i pred_val =
         _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
-    const __m128i obmc_pred_val = _mm_alignr_epi8(
-        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
     const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
     const __m128i result =
         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
     const int second_row_result = _mm_extract_epi32(packed_result, 1);
     memcpy(pred, &second_row_result, sizeof(second_row_result));
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
     y -= 2;
   } while (y != 0);
 }
@@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
 inline void OverlapBlendFromLeft8xH_SSE4_1(
     uint8_t* LIBGAV1_RESTRICT const prediction,
     const ptrdiff_t prediction_stride, const int height,
-    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_prediction_stride) {
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 8;
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1(
   const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
   int y = height;
   do {
-    const __m128i pred_val = LoadLo8(pred);
-    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
-    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
-    const __m128i result =
-        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+    const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result_lo =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+
+    const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+    const __m128i result_hi =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
 
-    StoreLo8(pred, _mm_packus_epi16(result, result));
+    const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+    StoreLo8(pred, result);
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-  } while (--y != 0);
+    StoreHi8(pred, result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+    y -= 2;
+  } while (y != 0);
 }
 
 void OverlapBlendFromLeft_SSE4_1(
@@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1(
   assert(height >= 4);
 
   if (width == 2) {
-    OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
-                                   obmc_prediction_stride);
+    OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
     return;
   }
   if (width == 4) {
-    OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
-                                   obmc_prediction_stride);
+    OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
     return;
   }
   if (width == 8) {
-    OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
-                                   obmc_prediction_stride);
+    OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
     return;
   }
   const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1(
 inline void OverlapBlendFromTop4xH_SSE4_1(
     uint8_t* LIBGAV1_RESTRICT const prediction,
     const ptrdiff_t prediction_stride, const int height,
-    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_prediction_stride) {
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 4;
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
         _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
     const __m128i pred_val0 = Load4(pred);
 
-    const __m128i obmc_pred_val0 = Load4(obmc_pred);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
     const __m128i pred_val =
         _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
-    const __m128i obmc_pred_val = _mm_alignr_epi8(
-        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
     const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
     const __m128i result =
         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
     Store4(pred - prediction_stride, packed_result);
     Store4(pred, _mm_srli_si128(packed_result, 4));
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
     y += 2;
   } while (y < compute_height);
 }
@@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
 inline void OverlapBlendFromTop8xH_SSE4_1(
     uint8_t* LIBGAV1_RESTRICT const prediction,
     const ptrdiff_t prediction_stride, const int height,
-    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_prediction_stride) {
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_prediction_stride = 8;
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   const uint8_t* mask = kObmcMask + height - 2;
@@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1(
   const int compute_height = height - (height >> 2);
   int y = compute_height;
   do {
-    const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+    const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]);
     // 64 - mask
-    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
-    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
-    const __m128i pred_val = LoadLo8(pred);
-    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
-    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
-    const __m128i result =
-        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0);
+    const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0);
 
-    StoreLo8(pred, _mm_packus_epi16(result, result));
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+    const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result_lo =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6);
+
+    --y;
+    const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]);
+    // 64 - mask
+    const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1);
+    const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1);
+
+    const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+    const __m128i result_hi =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6);
+
+    const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+    StoreLo8(pred, result);
     pred += prediction_stride;
-    obmc_pred += obmc_prediction_stride;
-  } while (--y != 0);
+    StoreHi8(pred, result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+  } while (--y > 0);
 }
 
 void OverlapBlendFromTop_SSE4_1(
@@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1(
   assert(height >= 2);
 
   if (width == 4) {
-    OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
-                                  obmc_prediction_stride);
+    OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
     return;
   }
   if (width == 8) {
-    OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
-                                  obmc_prediction_stride);
+    OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
     return;
   }
 
@@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6;
 
 inline void OverlapBlendFromLeft2xH_SSE4_1(
     uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
-    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_pred_stride) {
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_pred_stride = 2;
   uint16_t* pred = prediction;
   const uint16_t* obmc_pred = obmc_prediction;
   const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
   int y = height;
   do {
     const __m128i pred_val = Load4x2(pred, pred + pred_stride);
-    const __m128i obmc_pred_val =
-        Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
     const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
     const __m128i result = RightShiftWithRounding_U32(
         _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
@@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
 
 inline void OverlapBlendFromLeft4xH_SSE4_1(
     uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
-    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_pred_stride) {
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_pred_stride = 4;
   uint16_t* pred = prediction;
   const uint16_t* obmc_pred = obmc_prediction;
   const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
   int y = height;
   do {
     const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
-    const __m128i obmc_pred_val =
-        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
     const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
     const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
     const __m128i result_lo = RightShiftWithRounding_U32(
@@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
   assert(height >= 4);
 
   if (width == 2) {
-    OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
-                                   obmc_pred_stride);
+    OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred);
     return;
   }
   if (width == 4) {
-    OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
-                                   obmc_pred_stride);
+    OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
     return;
   }
   const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
 
 inline void OverlapBlendFromTop4xH_SSE4_1(
     uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
-    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
-    const ptrdiff_t obmc_pred_stride) {
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_pred_stride = 4;
   uint16_t* pred = prediction;
   const uint16_t* obmc_pred = obmc_prediction;
   const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
     const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
 
     const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
-    const __m128i obmc_pred_val =
-        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
     const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
     const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
     const __m128i result_lo = RightShiftWithRounding_U32(
@@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1(
   assert(height >= 2);
 
   if (width == 4) {
-    OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
-                                  obmc_pred_stride);
+    OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
     return;
   }
 
diff --git a/libgav1/src/dsp/x86/warp_sse4.cc b/libgav1/src/dsp/x86/warp_sse4.cc
index 5830894..5498052 100644
--- a/libgav1/src/dsp/x86/warp_sse4.cc
+++ b/libgav1/src/dsp/x86/warp_sse4.cc
@@ -167,7 +167,7 @@ inline void WriteVerticalFilter(const __m128i filter[8],
 }
 
 template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
                            int delta, DestType* LIBGAV1_RESTRICT dest_row,
                            ptrdiff_t dest_stride) {
   int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -188,8 +188,8 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
 }
 
 template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4,
-                           int gamma, int delta,
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
+                           int64_t y4, int gamma, int delta,
                            DestType* LIBGAV1_RESTRICT dest_row,
                            ptrdiff_t dest_stride) {
   int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -249,7 +249,7 @@ inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
 
 template <bool is_compound, typename DestType>
 inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
-                        ptrdiff_t source_stride, int source_width, int y4,
+                        ptrdiff_t source_stride, int source_width, int64_t y4,
                         int ix4, int iy4, int gamma, int delta,
                         int16_t intermediate_result_column[15],
                         DestType* LIBGAV1_RESTRICT dst_row,
@@ -291,7 +291,7 @@ inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
 template <bool is_compound, typename DestType>
 inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
                         ptrdiff_t source_stride, int source_height, int alpha,
-                        int beta, int x4, int ix4, int iy4,
+                        int beta, int64_t x4, int ix4, int iy4,
                         int16_t intermediate_result[15][8]) {
   // Region 3
   // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -323,8 +323,9 @@ inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
 
 template <bool is_compound, typename DestType>
 inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
-                        ptrdiff_t source_stride, int alpha, int beta, int x4,
-                        int ix4, int iy4, int16_t intermediate_result[15][8]) {
+                        ptrdiff_t source_stride, int alpha, int beta,
+                        int64_t x4, int ix4, int iy4,
+                        int16_t intermediate_result[15][8]) {
   // Region 4.
   // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
 
@@ -379,14 +380,8 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
     int16_t intermediate_result_column[15];
   };
 
-  const int dst_x =
-      src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
-  const int dst_y =
-      src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
-  const int x4 = dst_x >> subsampling_x;
-  const int y4 = dst_y >> subsampling_y;
-  const int ix4 = x4 >> kWarpedModelPrecisionBits;
-  const int iy4 = y4 >> kWarpedModelPrecisionBits;
+  const WarpFilterParams filter_params = GetWarpFilterParams(
+      src_x, src_y, subsampling_x, subsampling_y, warp_params);
   // A prediction block may fall outside the frame's boundaries. If a
   // prediction block is calculated using only samples outside the frame's
   // boundary, the filtering can be simplified. We can divide the plane
@@ -439,33 +434,38 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
   // border index (source_width - 1 or 0, respectively). Then for each x,
   // the inner for loop of the horizontal filter is reduced to multiplying
   // the border pixel by the sum of the filter coefficients.
-  if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
-    if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+  if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
+    if ((filter_params.iy4 - 7 >= source_height - 1 ||
+         filter_params.iy4 + 7 <= 0)) {
       // Outside the frame in both directions. One repeated value.
-      WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
-                                         source_height, ix4, iy4, dst_row,
-                                         dest_stride);
+      WarpRegion1<is_compound, DestType>(
+          src, source_stride, source_width, source_height, filter_params.ix4,
+          filter_params.iy4, dst_row, dest_stride);
       return;
     }
     // Outside the frame horizontally. Rows repeated.
     WarpRegion2<is_compound, DestType>(
-        src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
-        intermediate_result_column, dst_row, dest_stride);
+        src, source_stride, source_width, filter_params.y4, filter_params.ix4,
+        filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
+        dest_stride);
     return;
   }
 
-  if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+  if ((filter_params.iy4 - 7 >= source_height - 1 ||
+       filter_params.iy4 + 7 <= 0)) {
     // Outside the frame vertically.
-    WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
-                                       beta, x4, ix4, iy4, intermediate_result);
+    WarpRegion3<is_compound, DestType>(
+        src, source_stride, source_height, alpha, beta, filter_params.x4,
+        filter_params.ix4, filter_params.iy4, intermediate_result);
   } else {
     // Inside the frame.
-    WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
-                                       iy4, intermediate_result);
+    WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
+                                       filter_params.x4, filter_params.ix4,
+                                       filter_params.iy4, intermediate_result);
   }
   // Region 3 and 4 vertical filter.
-  VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
-                                        dst_row, dest_stride);
+  VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
+                                        gamma, delta, dst_row, dest_stride);
 }
 
 template <bool is_compound>
diff --git a/libgav1/src/dsp/x86/weight_mask_sse4.cc b/libgav1/src/dsp/x86/weight_mask_sse4.cc
index 69cb784..53a374d 100644
--- a/libgav1/src/dsp/x86/weight_mask_sse4.cc
+++ b/libgav1/src/dsp/x86/weight_mask_sse4.cc
@@ -37,10 +37,10 @@ namespace {
 constexpr int kRoundingBits8bpp = 4;
 
 template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
-                              const int16_t* LIBGAV1_RESTRICT prediction_1,
-                              uint8_t* LIBGAV1_RESTRICT mask,
-                              ptrdiff_t mask_stride) {
+inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                uint8_t* LIBGAV1_RESTRICT mask,
+                                ptrdiff_t mask_stride) {
   const __m128i pred_00 = LoadAligned16(prediction_0);
   const __m128i pred_10 = LoadAligned16(prediction_1);
   const __m128i difference_0 = RightShiftWithRounding_U16(
@@ -78,7 +78,7 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
 }
 
 #define WEIGHT8_PAIR_WITHOUT_STRIDE \
-  WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
+  WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
 
 #define WEIGHT8_PAIR_AND_STRIDE \
   WEIGHT8_PAIR_WITHOUT_STRIDE;  \
@@ -87,9 +87,10 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride << 1
 
 template <bool mask_is_inverse>
-void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                        const void* LIBGAV1_RESTRICT prediction_1,
-                        uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
 
@@ -100,10 +101,10 @@ void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 3;
@@ -116,10 +117,10 @@ void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 5;
@@ -132,7 +133,7 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 #define WEIGHT16_WITHOUT_STRIDE \
-  WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
 
 #define WEIGHT16_AND_STRIDE \
   WEIGHT16_WITHOUT_STRIDE;  \
@@ -141,10 +142,10 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y = 7;
@@ -155,10 +156,10 @@ void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 5;
@@ -171,10 +172,10 @@ void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 6;
@@ -190,10 +191,10 @@ void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 21;
@@ -205,10 +206,11 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT16_WITHOUT_STRIDE;
 }
 
-#define WEIGHT32_WITHOUT_STRIDE                                                \
-  WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
-  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16,           \
-                                           mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE                                        \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                             mask_stride);             \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                             mask + 16, mask_stride)
 
 #define WEIGHT32_AND_STRIDE \
   WEIGHT32_WITHOUT_STRIDE;  \
@@ -217,10 +219,10 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   WEIGHT32_AND_STRIDE;
@@ -234,10 +236,10 @@ void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 5;
@@ -250,10 +252,10 @@ void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 6;
@@ -269,10 +271,10 @@ void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 21;
@@ -284,14 +286,15 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT32_WITHOUT_STRIDE;
 }
 
-#define WEIGHT64_WITHOUT_STRIDE                                                \
-  WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
-  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16,           \
-                                           mask + 16, mask_stride);            \
-  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32,           \
-                                           mask + 32, mask_stride);            \
-  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48,           \
-                                           mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE                                        \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                             mask_stride);             \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                             mask + 16, mask_stride);  \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+                                             mask + 32, mask_stride);  \
+  WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+                                             mask + 48, mask_stride)
 
 #define WEIGHT64_AND_STRIDE \
   WEIGHT64_WITHOUT_STRIDE;  \
@@ -300,10 +303,10 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -316,10 +319,10 @@ void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 0;
@@ -335,10 +338,10 @@ void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -351,10 +354,10 @@ void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                           const void* LIBGAV1_RESTRICT prediction_1,
-                           uint8_t* LIBGAV1_RESTRICT mask,
-                           ptrdiff_t mask_stride) {
+void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask,
+                             ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -368,10 +371,10 @@ void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                           const void* LIBGAV1_RESTRICT prediction_1,
-                           uint8_t* LIBGAV1_RESTRICT mask,
-                           ptrdiff_t mask_stride) {
+void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask,
+                             ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -412,10 +415,10 @@ void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                            const void* LIBGAV1_RESTRICT prediction_1,
-                            uint8_t* LIBGAV1_RESTRICT mask,
-                            ptrdiff_t mask_stride) {
+void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                              const void* LIBGAV1_RESTRICT prediction_1,
+                              uint8_t* LIBGAV1_RESTRICT mask,
+                              ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -466,8 +469,9 @@ void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 
 #define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
   dsp->weight_mask[w_index][h_index][0] =                      \
-      WeightMask##width##x##height##_SSE4<0>;                  \
-  dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+      WeightMask##width##x##height##_SSE4_1<0>;                \
+  dsp->weight_mask[w_index][h_index][1] =                      \
+      WeightMask##width##x##height##_SSE4_1<1>
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
@@ -501,7 +505,7 @@ constexpr int kRoundingBits10bpp = 6;
 constexpr int kScaledDiffShift = 4;
 
 template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_10bpp_SSE4(
+inline void WeightMask16_10bpp_SSE4_1(
     const uint16_t* LIBGAV1_RESTRICT prediction_0,
     const uint16_t* LIBGAV1_RESTRICT prediction_1,
     uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
@@ -562,9 +566,9 @@ inline void WeightMask16_10bpp_SSE4(
   }
 }
 
-#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP                               \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \
-                                                  mask_stride)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP                                 \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \
+                                                    mask_stride)
 
 #define WEIGHT8_PAIR_AND_STRIDE_10BPP \
   WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;  \
@@ -573,10 +577,10 @@ inline void WeightMask16_10bpp_SSE4(
   mask += mask_stride << 1
 
 template <bool mask_is_inverse>
-void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                              const void* LIBGAV1_RESTRICT prediction_1,
-                              uint8_t* LIBGAV1_RESTRICT mask,
-                              ptrdiff_t mask_stride) {
+void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                const void* LIBGAV1_RESTRICT prediction_1,
+                                uint8_t* LIBGAV1_RESTRICT mask,
+                                ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
 
@@ -587,10 +591,10 @@ void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 3;
@@ -603,10 +607,10 @@ void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 5;
@@ -618,9 +622,9 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
 }
 
-#define WEIGHT16_WITHOUT_STRIDE_10BPP                                  \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
-                                                 mask_stride)
+#define WEIGHT16_WITHOUT_STRIDE_10BPP                                    \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+                                                   mask_stride)
 
 #define WEIGHT16_AND_STRIDE_10BPP \
   WEIGHT16_WITHOUT_STRIDE_10BPP;  \
@@ -629,10 +633,10 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y = 7;
@@ -643,10 +647,10 @@ void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 5;
@@ -659,10 +663,10 @@ void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 6;
@@ -678,10 +682,10 @@ void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -693,11 +697,11 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT16_WITHOUT_STRIDE_10BPP;
 }
 
-#define WEIGHT32_WITHOUT_STRIDE_10BPP                                      \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask,     \
-                                                 mask_stride);             \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
-                                                 mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE_10BPP                                        \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                                   mask_stride);             \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                                   mask + 16, mask_stride)
 
 #define WEIGHT32_AND_STRIDE_10BPP \
   WEIGHT32_WITHOUT_STRIDE_10BPP;  \
@@ -706,10 +710,10 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   WEIGHT32_AND_STRIDE_10BPP;
@@ -723,10 +727,10 @@ void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 5;
@@ -739,10 +743,10 @@ void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 6;
@@ -758,10 +762,10 @@ void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -773,15 +777,15 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT32_WITHOUT_STRIDE_10BPP;
 }
 
-#define WEIGHT64_WITHOUT_STRIDE_10BPP                                      \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask,     \
-                                                 mask_stride);             \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
-                                                 mask + 16, mask_stride);  \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
-                                                 mask + 32, mask_stride);  \
-  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
-                                                 mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE_10BPP                                        \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                                   mask_stride);             \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                                   mask + 16, mask_stride);  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+                                                   mask + 32, mask_stride);  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+                                                   mask + 48, mask_stride)
 
 #define WEIGHT64_AND_STRIDE_10BPP \
   WEIGHT64_WITHOUT_STRIDE_10BPP;  \
@@ -790,10 +794,10 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride
 
 template <bool mask_is_inverse>
-void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 5;
@@ -806,10 +810,10 @@ void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 6;
@@ -825,10 +829,10 @@ void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -841,10 +845,10 @@ void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                 const void* LIBGAV1_RESTRICT prediction_1,
-                                 uint8_t* LIBGAV1_RESTRICT mask,
-                                 ptrdiff_t mask_stride) {
+void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                   const void* LIBGAV1_RESTRICT prediction_1,
+                                   uint8_t* LIBGAV1_RESTRICT mask,
+                                   ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 42;
@@ -858,10 +862,10 @@ void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                 const void* LIBGAV1_RESTRICT prediction_1,
-                                 uint8_t* LIBGAV1_RESTRICT mask,
-                                 ptrdiff_t mask_stride) {
+void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                   const void* LIBGAV1_RESTRICT prediction_1,
+                                   uint8_t* LIBGAV1_RESTRICT mask,
+                                   ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -902,10 +906,10 @@ void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }
 
 template <bool mask_is_inverse>
-void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                  const void* LIBGAV1_RESTRICT prediction_1,
-                                  uint8_t* LIBGAV1_RESTRICT mask,
-                                  ptrdiff_t mask_stride) {
+void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                    const void* LIBGAV1_RESTRICT prediction_1,
+                                    uint8_t* LIBGAV1_RESTRICT mask,
+                                    ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 42;
@@ -956,9 +960,9 @@ void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 
 #define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
   dsp->weight_mask[w_index][h_index][0] =                       \
-      WeightMask##width##x##height##_10bpp_SSE4<0>;             \
+      WeightMask##width##x##height##_10bpp_SSE4_1<0>;           \
   dsp->weight_mask[w_index][h_index][1] =                       \
-      WeightMask##width##x##height##_10bpp_SSE4<1>
+      WeightMask##width##x##height##_10bpp_SSE4_1<1>
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   assert(dsp != nullptr);
diff --git a/libgav1/src/film_grain.cc b/libgav1/src/film_grain.cc
index 5c64ff2..44a2543 100644
--- a/libgav1/src/film_grain.cc
+++ b/libgav1/src/film_grain.cc
@@ -824,5 +824,8 @@ template class FilmGrain<kBitdepth8>;
 #if LIBGAV1_MAX_BITDEPTH >= 10
 template class FilmGrain<kBitdepth10>;
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+template class FilmGrain<kBitdepth12>;
+#endif
 
 }  // namespace libgav1
diff --git a/libgav1/src/film_grain.h b/libgav1/src/film_grain.h
index f2c1e93..bda8458 100644
--- a/libgav1/src/film_grain.h
+++ b/libgav1/src/film_grain.h
@@ -104,7 +104,9 @@ class FilmGrain {
   using Pixel =
       typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type;
   static constexpr int kScalingLutLength =
-      (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8);
+      (bitdepth == 10)
+          ? (kScalingLookupTableSize + kScalingLookupTablePadding) << 2
+          : kScalingLookupTableSize + kScalingLookupTablePadding;
 
   bool Init();
 
diff --git a/libgav1/src/gav1/decoder_buffer.h b/libgav1/src/gav1/decoder_buffer.h
index 880c320..0a5586e 100644
--- a/libgav1/src/gav1/decoder_buffer.h
+++ b/libgav1/src/gav1/decoder_buffer.h
@@ -115,6 +115,27 @@ typedef enum Libgav1ColorRange {
   kLibgav1ColorRangeFull     // YUV/RGB [0..255]
 } Libgav1ColorRange;
 
+typedef struct Libgav1ObuMetadataHdrCll {  // NOLINT
+  uint16_t max_cll;                        // Maximum content light level.
+  uint16_t max_fall;                       // Maximum frame-average light level.
+} Libgav1ObuMetadataHdrCll;
+
+typedef struct Libgav1ObuMetadataHdrMdcv {  // NOLINT
+  uint16_t primary_chromaticity_x[3];   // 16 bits each; read as x[i], y[i].
+  uint16_t primary_chromaticity_y[3];   // 16 bits each.
+  uint16_t white_point_chromaticity_x;  // 16 bits; read after the primaries.
+  uint16_t white_point_chromaticity_y;  // 16 bits.
+  uint32_t luminance_max;               // 32 bits. TODO confirm units/scale.
+  uint32_t luminance_min;               // 32 bits. TODO confirm units/scale.
+} Libgav1ObuMetadataHdrMdcv;
+
+typedef struct Libgav1ObuMetadataItutT35 {  // NOLINT
+  uint8_t country_code;                 // First 8 bits of the T.35 metadata.
+  uint8_t country_code_extension_byte;  // Valid if country_code is 0xFF.
+  uint8_t* payload_bytes;               // TODO confirm who owns this buffer.
+  int payload_size;                     // Bytes before the trailing 0x80 byte.
+} Libgav1ObuMetadataItutT35;
+
 typedef struct Libgav1DecoderBuffer {
 #if defined(__cplusplus)
   LIBGAV1_PUBLIC int NumPlanes() const {
@@ -146,6 +167,18 @@ typedef struct Libgav1DecoderBuffer {
   // Temporal id of this frame.
   int temporal_id;
 
+  Libgav1ObuMetadataHdrCll hdr_cll;
+  int has_hdr_cll;  // 1 if the values in hdr_cll are valid for this frame. 0
+                    // otherwise.
+
+  Libgav1ObuMetadataHdrMdcv hdr_mdcv;
+  int has_hdr_mdcv;  // 1 if the values in hdr_mdcv are valid for this frame. 0
+                     // otherwise.
+
+  Libgav1ObuMetadataItutT35 itut_t35;
+  int has_itut_t35;  // 1 if the values in itut_t35 are valid for this frame. 0
+                     // otherwise.
+
   // The |user_private_data| argument passed to Decoder::EnqueueFrame().
   int64_t user_private_data;
   // The |private_data| field of FrameBuffer. Set by the get frame buffer
@@ -264,6 +297,10 @@ using ColorRange = Libgav1ColorRange;
 constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio;
 constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull;
 
+using ObuMetadataHdrCll = Libgav1ObuMetadataHdrCll;
+using ObuMetadataHdrMdcv = Libgav1ObuMetadataHdrMdcv;
+using ObuMetadataItutT35 = Libgav1ObuMetadataItutT35;
+
 using DecoderBuffer = Libgav1DecoderBuffer;
 
 }  // namespace libgav1
diff --git a/libgav1/src/gav1/version.h b/libgav1/src/gav1/version.h
index 9bdc630..b386acc 100644
--- a/libgav1/src/gav1/version.h
+++ b/libgav1/src/gav1/version.h
@@ -23,7 +23,7 @@
 // (https://semver.org).
 
 #define LIBGAV1_MAJOR_VERSION 0
-#define LIBGAV1_MINOR_VERSION 17
+#define LIBGAV1_MINOR_VERSION 18
 #define LIBGAV1_PATCH_VERSION 0
 
 #define LIBGAV1_VERSION                                           \
diff --git a/libgav1/src/libgav1_decoder.cmake b/libgav1/src/libgav1_decoder.cmake
index b97d09d..1314d0b 100644
--- a/libgav1/src/libgav1_decoder.cmake
+++ b/libgav1/src/libgav1_decoder.cmake
@@ -107,7 +107,7 @@ macro(libgav1_add_decoder_targets)
     list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
   endif()
 
-  if(NOT ANDROID)
+  if(use_absl_threading)
     list(APPEND libgav1_absl_deps absl::base absl::synchronization)
   endif()
 
diff --git a/libgav1/src/obu_parser.cc b/libgav1/src/obu_parser.cc
index 445450b..9e9166a 100644
--- a/libgav1/src/obu_parser.cc
+++ b/libgav1/src/obu_parser.cc
@@ -1767,11 +1767,7 @@ bool ObuParser::ParseFrameParameters() {
   int64_t scratch;
   if (sequence_header_.reduced_still_picture_header) {
     frame_header_.show_frame = true;
-    current_frame_ = buffer_pool_->GetFreeBuffer();
-    if (current_frame_ == nullptr) {
-      LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
-      return false;
-    }
+    if (!EnsureCurrentFrameIsNotNull()) return false;
   } else {
     OBU_READ_BIT_OR_FAIL;
     frame_header_.show_existing_frame = scratch != 0;
@@ -1840,11 +1836,7 @@ bool ObuParser::ParseFrameParameters() {
       }
       return true;
     }
-    current_frame_ = buffer_pool_->GetFreeBuffer();
-    if (current_frame_ == nullptr) {
-      LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
-      return false;
-    }
+    if (!EnsureCurrentFrameIsNotNull()) return false;
     OBU_READ_LITERAL_OR_FAIL(2);
     frame_header_.frame_type = static_cast<FrameType>(scratch);
     current_frame_->set_frame_type(frame_header_.frame_type);
@@ -2395,50 +2387,58 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
   size -= metadata_type_size;
   int64_t scratch;
   switch (metadata_type) {
-    case kMetadataTypeHdrContentLightLevel:
+    case kMetadataTypeHdrContentLightLevel: {
+      ObuMetadataHdrCll hdr_cll;
       OBU_READ_LITERAL_OR_FAIL(16);
-      metadata_.max_cll = scratch;
+      hdr_cll.max_cll = scratch;
       OBU_READ_LITERAL_OR_FAIL(16);
-      metadata_.max_fall = scratch;
+      hdr_cll.max_fall = scratch;
+      if (!EnsureCurrentFrameIsNotNull()) return false;
+      current_frame_->set_hdr_cll(hdr_cll);
       break;
-    case kMetadataTypeHdrMasteringDisplayColorVolume:
+    }
+    case kMetadataTypeHdrMasteringDisplayColorVolume: {
+      ObuMetadataHdrMdcv hdr_mdcv;
       for (int i = 0; i < 3; ++i) {
         OBU_READ_LITERAL_OR_FAIL(16);
-        metadata_.primary_chromaticity_x[i] = scratch;
+        hdr_mdcv.primary_chromaticity_x[i] = scratch;
         OBU_READ_LITERAL_OR_FAIL(16);
-        metadata_.primary_chromaticity_y[i] = scratch;
+        hdr_mdcv.primary_chromaticity_y[i] = scratch;
       }
       OBU_READ_LITERAL_OR_FAIL(16);
-      metadata_.white_point_chromaticity_x = scratch;
+      hdr_mdcv.white_point_chromaticity_x = scratch;
       OBU_READ_LITERAL_OR_FAIL(16);
-      metadata_.white_point_chromaticity_y = scratch;
+      hdr_mdcv.white_point_chromaticity_y = scratch;
       OBU_READ_LITERAL_OR_FAIL(32);
-      metadata_.luminance_max = static_cast<uint32_t>(scratch);
+      hdr_mdcv.luminance_max = static_cast<uint32_t>(scratch);
       OBU_READ_LITERAL_OR_FAIL(32);
-      metadata_.luminance_min = static_cast<uint32_t>(scratch);
+      hdr_mdcv.luminance_min = static_cast<uint32_t>(scratch);
+      if (!EnsureCurrentFrameIsNotNull()) return false;
+      current_frame_->set_hdr_mdcv(hdr_mdcv);
       break;
+    }
     case kMetadataTypeScalability:
       if (!ParseMetadataScalability()) return false;
       break;
     case kMetadataTypeItutT35: {
+      ObuMetadataItutT35 itut_t35;
       OBU_READ_LITERAL_OR_FAIL(8);
-      metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch);
+      itut_t35.country_code = static_cast<uint8_t>(scratch);
       ++data;
       --size;
-      if (metadata_.itu_t_t35_country_code == 0xFF) {
+      if (itut_t35.country_code == 0xFF) {
         OBU_READ_LITERAL_OR_FAIL(8);
-        metadata_.itu_t_t35_country_code_extension_byte =
-            static_cast<uint8_t>(scratch);
+        itut_t35.country_code_extension_byte = static_cast<uint8_t>(scratch);
         ++data;
         --size;
       }
-      // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says:
-      //   itu_t_t35_payload_bytes shall be bytes containing data registered as
+      // Read itut_t35.payload_bytes. Section 6.7.2 of the spec says:
+      //   itu_t_t35_payload_bytes shall be bytes containing data registered as
       //   specified in Recommendation ITU-T T.35.
-      // Therefore itu_t_t35_payload_bytes is byte aligned and the first
-      // trailing byte should be 0x80. Since the exact syntax of
-      // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the
-      // end of itu_t_t35_payload_bytes by searching for the trailing bit.
+      // Therefore itut_t35.payload_bytes is byte aligned and the first trailing
+      // byte should be 0x80. Since the exact syntax of itut_t35.payload_bytes
+      // is not defined in the AV1 spec, identify the end of
+      // itut_t35.payload_bytes by searching for the trailing bit.
       const int i = GetLastNonzeroByteIndex(data, size);
       if (i < 0) {
         LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
@@ -2447,20 +2447,15 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
       if (data[i] != 0x80) {
         LIBGAV1_DLOG(
             ERROR,
-            "itu_t_t35_payload_bytes is not byte aligned. The last nonzero "
-            "byte of the payload data is 0x%x, should be 0x80.",
+            "itut_t35.payload_bytes is not byte aligned. The last nonzero byte "
+            "of the payload data is 0x%x, should be 0x80.",
             data[i]);
         return false;
       }
-      if (i != 0) {
-        // data[0]..data[i - 1] are itu_t_t35_payload_bytes.
-        metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]);
-        if (metadata_.itu_t_t35_payload_bytes == nullptr) {
-          LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed.");
-          return false;
-        }
-        memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i);
-        metadata_.itu_t_t35_payload_size = i;
+      itut_t35.payload_size = i;
+      if (!EnsureCurrentFrameIsNotNull() ||
+          !current_frame_->set_itut_t35(itut_t35, data)) {
+        return false;
       }
       // Skip all bits before the trailing bit.
       bit_reader_->SkipBytes(i);
@@ -2637,6 +2632,16 @@ bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) {
   return bit_reader_ != nullptr;
 }
 
+bool ObuParser::EnsureCurrentFrameIsNotNull() {  // Acquires a buffer if unset.
+  if (current_frame_ != nullptr) return true;  // Already populated; no-op.
+  current_frame_ = buffer_pool_->GetFreeBuffer();
+  if (current_frame_ == nullptr) {  // The pool did not hand out a buffer.
+    LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+    return false;
+  }
+  return true;
+}
+
 bool ObuParser::HasData() const { return size_ > 0; }
 
 StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
@@ -2652,7 +2657,6 @@ StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
   // Clear everything except the sequence header.
   obu_headers_.clear();
   frame_header_ = {};
-  metadata_ = {};
   tile_buffers_.clear();
   next_tile_group_start_ = 0;
   sequence_header_changed_ = false;
diff --git a/libgav1/src/obu_parser.h b/libgav1/src/obu_parser.h
index 3f452ef..eba3370 100644
--- a/libgav1/src/obu_parser.h
+++ b/libgav1/src/obu_parser.h
@@ -221,26 +221,6 @@ enum MetadataType : uint8_t {
   // 32 and greater are reserved for AOM use.
 };
 
-struct ObuMetadata {
-  // Maximum content light level.
-  uint16_t max_cll;
-  // Maximum frame-average light level.
-  uint16_t max_fall;
-  uint16_t primary_chromaticity_x[3];
-  uint16_t primary_chromaticity_y[3];
-  uint16_t white_point_chromaticity_x;
-  uint16_t white_point_chromaticity_y;
-  uint32_t luminance_max;
-  uint32_t luminance_min;
-  // ITU-T T.35.
-  uint8_t itu_t_t35_country_code;
-  uint8_t itu_t_t35_country_code_extension_byte;  // Valid if
-                                                  // itu_t_t35_country_code is
-                                                  // 0xFF.
-  std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes;
-  size_t itu_t_t35_payload_size;
-};
-
 class ObuParser : public Allocable {
  public:
   ObuParser(const uint8_t* const data, size_t size, int operating_point,
@@ -276,7 +256,6 @@ class ObuParser : public Allocable {
   const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
   const ObuFrameHeader& frame_header() const { return frame_header_; }
   const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
-  const ObuMetadata& metadata() const { return metadata_; }
   // Returns true if the last call to ParseOneFrame() encountered a sequence
   // header change.
   bool sequence_header_changed() const { return sequence_header_changed_; }
@@ -372,6 +351,11 @@ class ObuParser : public Allocable {
                       size_t tg_header_size, size_t bytes_consumed_so_far);
   bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far);  // 5.11.1.
 
+  // Populates |current_frame_| from the |buffer_pool_| if |current_frame_| is
+  // nullptr. Does not do anything otherwise. Returns true on success, false
+  // otherwise.
+  bool EnsureCurrentFrameIsNotNull();
+
   // Parser elements.
   std::unique_ptr<RawBitReader> bit_reader_;
   const uint8_t* data_;
@@ -383,7 +367,6 @@ class ObuParser : public Allocable {
   ObuSequenceHeader sequence_header_ = {};
   ObuFrameHeader frame_header_ = {};
   Vector<TileBuffer> tile_buffers_;
-  ObuMetadata metadata_ = {};
   // The expected starting tile number of the next Tile Group.
   int next_tile_group_start_ = 0;
   // If true, the sequence_header_ field is valid.
diff --git a/libgav1/src/post_filter/deblock.cc b/libgav1/src/post_filter/deblock.cc
index 48ad823..daee01c 100644
--- a/libgav1/src/post_filter/deblock.cc
+++ b/libgav1/src/post_filter/deblock.cc
@@ -329,7 +329,6 @@ void PostFilter::HorizontalDeblockFilter(int row4x4_start, int row4x4_end,
             src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
             HevThresh(level));
       }
-      // TODO(chengchen): use shifts instead of multiplication.
       src_row += row_step * src_stride;
       row_step = DivideBy4(row_step);
     }
diff --git a/libgav1/src/quantizer.cc b/libgav1/src/quantizer.cc
index cd720d6..eb13314 100644
--- a/libgav1/src/quantizer.cc
+++ b/libgav1/src/quantizer.cc
@@ -20,8 +20,9 @@
 #include "src/utils/common.h"
 #include "src/utils/constants.h"
 
-#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10
-#error LIBGAV1_MAX_BITDEPTH must be 8 or 10
+#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 && \
+    LIBGAV1_MAX_BITDEPTH != 12
+#error LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12
 #endif
 
 namespace libgav1 {
@@ -87,6 +88,43 @@ constexpr int16_t kDcLookup[][256] = {
     4737, 4929, 5130, 5347
   },
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+  // Lookup table for 12 bit.
+  {
+    4, 12, 18, 25, 33, 41, 50, 60,
+    70, 80, 91, 103, 115, 127, 140, 153,
+    166, 180, 194, 208, 222, 237, 251, 266,
+    281, 296, 312, 327, 343, 358, 374, 390,
+    405, 421, 437, 453, 469, 484, 500, 516,
+    532, 548, 564, 580, 596, 611, 627, 643,
+    659, 674, 690, 706, 721, 737, 752, 768,
+    783, 798, 814, 829, 844, 859, 874, 889,
+    904, 919, 934, 949, 964, 978, 993, 1008,
+    1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122,
+    1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+    1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342,
+    1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544,
+    1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741,
+    1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933,
+    1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199,
+    2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467,
+    2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788,
+    2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127,
+    3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517,
+    3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951,
+    4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+    4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942,
+    5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517,
+    5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149,
+    6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867,
+    6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715,
+    7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788,
+    8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245,
+    10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+    12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+    16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 };
 
 constexpr int16_t kAcLookup[][256] = {
@@ -142,6 +180,43 @@ constexpr int16_t kAcLookup[][256] = {
     6900, 7036, 7172, 7312
   },
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+  // Lookup table for 12 bit.
+  {
+    4, 13, 19, 27, 35, 44, 54, 64,
+    75, 87, 99, 112, 126, 139, 154, 168,
+    183, 199, 214, 230, 247, 263, 280, 297,
+    314, 331, 349, 366, 384, 402, 420, 438,
+    456, 475, 493, 511, 530, 548, 567, 586,
+    604, 623, 642, 660, 679, 698, 716, 735,
+    753, 772, 791, 809, 828, 846, 865, 884,
+    902, 920, 939, 957, 976, 994, 1012, 1030,
+    1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175,
+    1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317,
+    1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+    1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595,
+    1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856,
+    1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118,
+    2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378,
+    2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750,
+    2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137,
+    3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619,
+    3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149,
+    4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791,
+    4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544,
+    5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+    6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435,
+    7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635,
+    8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028,
+    10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+    11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+    13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+    16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+    18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+    21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+    25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
 };
 // clang-format on
 
diff --git a/libgav1/src/tile.h b/libgav1/src/tile.h
index 83c3423..fcab963 100644
--- a/libgav1/src/tile.h
+++ b/libgav1/src/tile.h
@@ -464,13 +464,14 @@ class Tile : public MaxAlignedAllocable {
                          int* start_y, int* step_x, int* step_y);  // 7.11.3.3.
   // If the method returns false, the caller only uses the output parameters
   // *ref_block_start_x and *ref_block_start_y. If the method returns true, the
-  // caller uses all three output parameters.
+  // caller uses all four output parameters.
   static bool GetReferenceBlockPosition(
       int reference_frame_index, bool is_scaled, int width, int height,
       int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
       int start_x, int start_y, int step_x, int step_y, int left_border,
       int right_border, int top_border, int bottom_border,
-      int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x);
+      int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x,
+      int* ref_block_end_y);
 
   template <typename Pixel>
   void BuildConvolveBlock(Plane plane, int reference_frame_index,
diff --git a/libgav1/src/tile/prediction.cc b/libgav1/src/tile/prediction.cc
index bba5a69..4348548 100644
--- a/libgav1/src/tile/prediction.cc
+++ b/libgav1/src/tile/prediction.cc
@@ -771,11 +771,10 @@ bool Tile::InterPrediction(const Block& block, const Plane plane, const int x,
                       [static_cast<int>(prediction_parameters.mask_is_inverse)](
                           block.scratch_buffer->prediction_buffer[0],
                           block.scratch_buffer->prediction_buffer[1],
-                          block.scratch_buffer->weight_mask,
-                          kMaxSuperBlockSizeInPixels);
+                          block.scratch_buffer->weight_mask, block.width);
     }
     prediction_mask = block.scratch_buffer->weight_mask;
-    prediction_mask_stride = kMaxSuperBlockSizeInPixels;
+    prediction_mask_stride = block.width;
   }
 
   if (is_compound) {
@@ -996,7 +995,7 @@ bool Tile::GetReferenceBlockPosition(
     const int start_y, const int step_x, const int step_y,
     const int left_border, const int right_border, const int top_border,
     const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
-    int* ref_block_end_x) {
+    int* ref_block_end_x, int* ref_block_end_y) {
   *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
   *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
   if (reference_frame_index == -1) {
@@ -1006,7 +1005,7 @@ bool Tile::GetReferenceBlockPosition(
   *ref_block_start_y -= kConvolveBorderLeftTop;
   *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
                      kConvolveBorderRight;
-  int ref_block_end_y =
+  *ref_block_end_y =
       GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
       kConvolveBorderBottom;
   if (is_scaled) {
@@ -1015,13 +1014,13 @@ bool Tile::GetReferenceBlockPosition(
          kScaleSubPixelBits) +
         kSubPixelTaps;
     *ref_block_end_x += kConvolveScaleBorderRight - kConvolveBorderRight;
-    ref_block_end_y = *ref_block_start_y + block_height - 1;
+    *ref_block_end_y = *ref_block_start_y + block_height - 1;
   }
   // Determines if we need to extend beyond the left/right/top/bottom border.
   return *ref_block_start_x < (ref_start_x - left_border) ||
          *ref_block_end_x > (ref_last_x + right_border) ||
          *ref_block_start_y < (ref_start_y - top_border) ||
-         ref_block_end_y > (ref_last_y + bottom_border);
+         *ref_block_end_y > (ref_last_y + bottom_border);
 }
 
 // Builds a block as the input for convolve, by copying the content of
@@ -1140,6 +1139,7 @@ bool Tile::BlockInterPrediction(
   int ref_block_start_x;
   int ref_block_start_y;
   int ref_block_end_x;
+  int ref_block_end_y;
   const bool extend_block = GetReferenceBlockPosition(
       reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
       ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
@@ -1147,24 +1147,15 @@ bool Tile::BlockInterPrediction(
       reference_buffer->right_border(plane),
       reference_buffer->top_border(plane),
       reference_buffer->bottom_border(plane), &ref_block_start_x,
-      &ref_block_start_y, &ref_block_end_x);
+      &ref_block_start_y, &ref_block_end_x, &ref_block_end_y);
 
   // In frame parallel mode, ensure that the reference block has been decoded
   // and available for referencing.
   if (reference_frame_index != -1 && frame_parallel_) {
-    int reference_y_max;
-    if (is_scaled) {
-      // TODO(vigneshv): For now, we wait for the entire reference frame to be
-      // decoded if we are using scaled references. This will eventually be
-      // fixed.
-      reference_y_max = reference_height;
-    } else {
-      reference_y_max =
-          std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y);
-      // For U and V planes with subsampling, we need to multiply
-      // reference_y_max by 2 since we only track the progress of Y planes.
-      reference_y_max = LeftShift(reference_y_max, subsampling_y);
-    }
+    // For U and V planes with subsampling, we need to multiply the value of
+    // ref_block_end_y by 2 since we only track the progress of the Y planes.
+    const int reference_y_max = LeftShift(
+        std::min(ref_block_end_y + kSubPixelTaps, ref_last_y), subsampling_y);
     if (reference_frame_progress_cache_[reference_frame_index] <
             reference_y_max &&
         !reference_frames_[reference_frame_index]->WaitUntil(
@@ -1297,11 +1288,12 @@ bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
            start_x += 8) {
         const int src_x = (start_x + 4) << subsampling_x_[plane];
         const int src_y = (start_y + 4) << subsampling_y_[plane];
-        const int dst_y = src_x * warp_params->params[4] +
-                          src_y * warp_params->params[5] +
-                          warp_params->params[1];
-        const int y4 = dst_y >> subsampling_y_[plane];
-        const int iy4 = y4 >> kWarpedModelPrecisionBits;
+        const int64_t dst_y =
+            src_x * warp_params->params[4] +
+            static_cast<int64_t>(src_y) * warp_params->params[5] +
+            warp_params->params[1];
+        const int64_t y4 = dst_y >> subsampling_y_[plane];
+        const int iy4 = static_cast<int>(y4 >> kWarpedModelPrecisionBits);
         reference_y_max = std::max(iy4 + 8, reference_y_max);
       }
     }
diff --git a/libgav1/src/utils/constants.h b/libgav1/src/utils/constants.h
index 1126ad6..8281aad 100644
--- a/libgav1/src/utils/constants.h
+++ b/libgav1/src/utils/constants.h
@@ -37,6 +37,10 @@ enum {
 };  // anonymous enum
 
 enum {
+  // Documentation variables.
+  kBitdepth8 = 8,
+  kBitdepth10 = 10,
+  kBitdepth12 = 12,
   kInvalidMvValue = -32768,
   kCdfMaxProbability = 32768,
   kBlockWidthCount = 5,
@@ -59,6 +63,13 @@ enum {
   kRestorationTypeSymbolCount = 3,
   kSgrProjParamsBits = 4,
   kSgrProjPrecisionBits = 7,
+  // Precision of a division table (mtable)
+  kSgrProjScaleBits = 20,
+  kSgrProjReciprocalBits = 12,
+  // Core self-guided restoration precision bits.
+  kSgrProjSgrBits = 8,
+  // Precision bits of generated values higher than source before projection.
+  kSgrProjRestoreBits = 4,
   // Padding on left and right side of a restoration block.
   // 3 is enough, but padding to 4 is more efficient, and makes the temporary
   // source buffer 8-pixel aligned.
@@ -177,6 +188,15 @@ enum {
   // On Linux, the cache line size can be looked up with the command:
   //   getconf LEVEL1_DCACHE_LINESIZE
   kCacheLineSize = 64,
+  // InterRound0, Section 7.11.3.2.
+  kInterRoundBitsHorizontal = 3,  // 8 & 10-bit.
+  kInterRoundBitsHorizontal12bpp = 5,
+  kInterRoundBitsCompoundVertical = 7,  // 8, 10 & 12-bit compound prediction.
+  kInterRoundBitsVertical = 11,         // 8 & 10-bit, single prediction.
+  kInterRoundBitsVertical12bpp = 9,
+  // Offset applied to 10bpp and 12bpp predictors to allow storing them in
+  // uint16_t. Removed before blending.
+  kCompoundOffset = (1 << 14) + (1 << 13),
 };  // anonymous enum
 
 enum FrameType : uint8_t {
diff --git a/libgav1/src/utils/segmentation_map.cc b/libgav1/src/utils/segmentation_map.cc
index 4284ca2..bbf40c3 100644
--- a/libgav1/src/utils/segmentation_map.cc
+++ b/libgav1/src/utils/segmentation_map.cc
@@ -21,9 +21,12 @@
 namespace libgav1 {
 
 bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) {
+  if (rows4x4 * columns4x4 > rows4x4_ * columns4x4_) {
+    segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4 * columns4x4]);
+  }
+
   rows4x4_ = rows4x4;
   columns4x4_ = columns4x4;
-  segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]);
   if (segment_id_buffer_ == nullptr) return false;
   segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get());
   return true;
diff --git a/libgav1/src/warp_prediction.cc b/libgav1/src/warp_prediction.cc
index 69b40e8..0da8a1f 100644
--- a/libgav1/src/warp_prediction.cc
+++ b/libgav1/src/warp_prediction.cc
@@ -231,9 +231,6 @@ bool WarpEstimation(const int num_samples, const int block_width4x4,
       Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
   params[1] =
       Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
-
-  params[6] = 0;
-  params[7] = 0;
   return true;
 }
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2022-07-12 16:58:33 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2022-07-12 16:58:33 +0000
commit	3ffef2f45795a505468f8bfe0f7000c8276033ea (patch)
tree	c9445be70dbfa7de3e9a3d43358d10e5582e5097
parent	a9d9cdc6db5e02f60d4676b391b4599204d38278 (diff)
parent	eee6aacd0da152622c837c25d87908eebece5f28 (diff)
download	libgav1-android13-mainline-go-uwb-release.tar.gz