about | summary | refs | log | tree | commit | diff
path: root/libgav1/src/dsp/arm/intrapred_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libgav1/src/dsp/arm/intrapred_neon.cc')
-rw-r--r-- libgav1/src/dsp/arm/intrapred_neon.cc 47
1 files changed, 18 insertions, 29 deletions
diff --git a/libgav1/src/dsp/arm/intrapred_neon.cc b/libgav1/src/dsp/arm/intrapred_neon.cc
index 14ca346..c967d82 100644
--- a/libgav1/src/dsp/arm/intrapred_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_neon.cc
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/dsp.h"
#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -25,6 +25,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
namespace libgav1 {
namespace dsp {
@@ -158,11 +159,10 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
if (ref_0_size_log2 == 2) {
- uint8x8_t val = vdup_n_u8(0);
- val = LoadLo4(ref_0_u8, val);
+ uint8x8_t val = Load4(ref_0_u8);
if (use_ref_1) {
if (ref_1_size_log2 == 2) { // 4x4
- val = LoadHi4(ref_1_u8, val);
+ val = Load4<1>(ref_1_u8, val);
return Sum(vpaddl_u8(val));
} else if (ref_1_size_log2 == 3) { // 4x8
const uint8x8_t val_1 = vld1_u8(ref_1_u8);
@@ -171,9 +171,7 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
return Sum(vadd_u16(sum_0, sum_1));
} else if (ref_1_size_log2 == 4) { // 4x16
const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
- const uint16x8_t sum_0 = vmovl_u8(val);
- const uint16x8_t sum_1 = vpaddlq_u8(val_1);
- return Sum(vaddq_u16(sum_0, sum_1));
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
}
}
// 4x1
@@ -183,8 +181,7 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
const uint8x8_t val_0 = vld1_u8(ref_0_u8);
if (use_ref_1) {
if (ref_1_size_log2 == 2) { // 8x4
- uint8x8_t val_1 = vdup_n_u8(0);
- val_1 = LoadLo4(ref_1_u8, val_1);
+ const uint8x8_t val_1 = Load4(ref_1_u8);
const uint16x4_t sum_0 = vpaddl_u8(val_0);
const uint16x4_t sum_1 = vpaddl_u8(val_1);
return Sum(vadd_u16(sum_0, sum_1));
@@ -195,12 +192,9 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
return Sum(vadd_u16(sum_0, sum_1));
} else if (ref_1_size_log2 == 4) { // 8x16
const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
- const uint16x8_t sum_0 = vmovl_u8(val_0);
- const uint16x8_t sum_1 = vpaddlq_u8(val_1);
- return Sum(vaddq_u16(sum_0, sum_1));
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
} else if (ref_1_size_log2 == 5) { // 8x32
- const uint16x8_t sum_0 = vmovl_u8(val_0);
- return Sum(vaddq_u16(sum_0, LoadAndAdd32(ref_1_u8)));
+ return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
}
}
// 8x1
@@ -209,16 +203,11 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
if (use_ref_1) {
if (ref_1_size_log2 == 2) { // 16x4
- uint8x8_t val_1 = vdup_n_u8(0);
- val_1 = LoadLo4(ref_1_u8, val_1);
- const uint16x8_t sum_0 = vmovl_u8(val_1);
- const uint16x8_t sum_u16 = vpaddlq_u8(val_0);
- return Sum(vaddq_u16(sum_0, sum_u16));
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
} else if (ref_1_size_log2 == 3) { // 16x8
const uint8x8_t val_1 = vld1_u8(ref_1_u8);
- const uint16x8_t sum_0 = vpaddlq_u8(val_0);
- const uint16x8_t sum_1 = vmovl_u8(val_1);
- return Sum(vaddq_u16(sum_0, sum_1));
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
} else if (ref_1_size_log2 == 4) { // 16x16
const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
return Sum(Add(val_0, val_1));
@@ -239,8 +228,7 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
if (use_ref_1) {
if (ref_1_size_log2 == 3) { // 32x8
const uint8x8_t val_1 = vld1_u8(ref_1_u8);
- const uint16x8_t sum_1 = vmovl_u8(val_1);
- return Sum(vaddq_u16(sum_0, sum_1));
+ return Sum(vaddw_u8(sum_0, val_1));
} else if (ref_1_size_log2 == 4) { // 32x16
const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
const uint16x8_t sum_1 = vpaddlq_u8(val_1);
@@ -340,8 +328,7 @@ inline void Paeth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
uint8x8_t top;
if (width == 4) {
- top = vdup_n_u8(0);
- top = LoadLo4(top_row_u8, top);
+ top = Load4(top_row_u8);
} else { // width == 8
top = vld1_u8(top_row_u8);
}
@@ -388,6 +375,8 @@ inline void Paeth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
const uint16x8_t top_left_dist_low,
const uint16x8_t top_left_dist_high) {
+ // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of
+ // using movl(x_dist).
const uint8x8_t x_le_top_left_low =
vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low));
const uint8x8_t x_le_top_left_high =
@@ -536,7 +525,7 @@ struct DcDefs {
};
void Init8bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
// 4x4
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
@@ -976,7 +965,7 @@ struct DcDefs {
};
void Init10bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
DcDefs::_4x4::DcTop;
@@ -1144,7 +1133,7 @@ void IntraPredInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {