aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/lossless_enc_sse41.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/lossless_enc_sse41.c')
-rw-r--r--src/dsp/lossless_enc_sse41.c54
1 files changed, 52 insertions, 2 deletions
diff --git a/src/dsp/lossless_enc_sse41.c b/src/dsp/lossless_enc_sse41.c
index ad358a6f..7ab83c26 100644
--- a/src/dsp/lossless_enc_sse41.c
+++ b/src/dsp/lossless_enc_sse41.c
@@ -18,8 +18,53 @@
#include <smmintrin.h>
#include "src/dsp/lossless.h"
-// For sign-extended multiplying constants, pre-shifted by 5:
-#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+//------------------------------------------------------------------------------
+// Cost operations.
+// Returns the sum (modulo 2^32) of the four 32-bit lanes of 'cost'.
+static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) {
+ cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8));  // add lanes 2,3 onto lanes 0,1
+ cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4));  // add lane 1 onto lane 0
+ return _mm_cvtsi128_si32(cost);  // lane 0 now holds the total
+}
+// Returns the sum over k >= 1 of k * (a[2k+2] + a[2k+3]) for a[4..length-1], modulo 2^32.
+static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
+ int i;
+ __m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]);  // pairs k=1,2; a[4..7] is read before the assert
+ assert(length % 8 == 0);
+ // Each pass consumes 8 entries, i.e. four adjacent pairs weighted j .. j+3.
+ for (i = 8; i + 8 <= length; i += 8) {
+ const int j = (i - 2) >> 1;  // weight applied to the pair (a[i], a[i+1])
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+ const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);  // lane 0 = j, lane 3 = j + 3
+ const __m128i a2 = _mm_hadd_epi32(a0, a1);  // sums of the four adjacent pairs, in index order
+ const __m128i mul = _mm_mullo_epi32(a2, w);  // keeps the low 32 bits of each product
+ cost = _mm_add_epi32(mul, cost);
+ }
+ return HorizontalSum_SSE41(cost);
+}
+// Returns the sum over k >= 1 of k * (a[2k+2] + a[2k+3] + b[2k+2] + b[2k+3]) for indices 4..length-1, modulo 2^32.
+static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
+ const uint32_t* const b, int length) {
+ int i;
+ __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
+ _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));  // pairs k=1,2 of both inputs; [4..7] is read before the assert
+ assert(length % 8 == 0);
+ // Each pass consumes 8 entries of a[] and b[]: four adjacent pairs weighted j .. j+3.
+ for (i = 8; i + 8 <= length; i += 8) {
+ const int j = (i - 2) >> 1;  // weight applied to the pair at indices (i, i+1)
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
+ const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);  // lane 0 = j, lane 3 = j + 3
+ const __m128i a2 = _mm_hadd_epi32(a0, a1);  // adjacent-pair sums of a[], in index order
+ const __m128i b2 = _mm_hadd_epi32(b0, b1);  // adjacent-pair sums of b[], in index order
+ const __m128i mul = _mm_mullo_epi32(_mm_add_epi32(a2, b2), w);  // keeps the low 32 bits of each product
+ cost = _mm_add_epi32(mul, cost);
+ }
+ return HorizontalSum_SSE41(cost);
+}
//------------------------------------------------------------------------------
// Subtract-Green Transform
@@ -44,6 +89,9 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
//------------------------------------------------------------------------------
// Color Transform
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+// Replicates the 32-bit value (HI << 16) | (LO & 0xffff) in all four lanes.
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
@@ -143,6 +191,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+ VP8LExtraCost = ExtraCost_SSE41;
+ VP8LExtraCostCombined = ExtraCostCombined_SSE41;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;