diff options
Diffstat (limited to 'src/dsp/lossless_enc_sse41.c')
-rw-r--r-- | src/dsp/lossless_enc_sse41.c | 54 |
1 files changed, 52 insertions, 2 deletions
diff --git a/src/dsp/lossless_enc_sse41.c b/src/dsp/lossless_enc_sse41.c index ad358a6f..7ab83c26 100644 --- a/src/dsp/lossless_enc_sse41.c +++ b/src/dsp/lossless_enc_sse41.c @@ -18,8 +18,53 @@ #include <smmintrin.h> #include "src/dsp/lossless.h" -// For sign-extended multiplying constants, pre-shifted by 5: -#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) +//------------------------------------------------------------------------------ +// Cost operations. + +static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) { + cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8)); + cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4)); + return _mm_cvtsi128_si32(cost); +} + +static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) { + int i; + __m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]); + assert(length % 8 == 0); + + for (i = 8; i + 8 <= length; i += 8) { + const int j = (i - 2) >> 1; + const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); + const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j); + const __m128i a2 = _mm_hadd_epi32(a0, a1); + const __m128i mul = _mm_mullo_epi32(a2, w); + cost = _mm_add_epi32(mul, cost); + } + return HorizontalSum_SSE41(cost); +} + +static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a, + const uint32_t* const b, int length) { + int i; + __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]), + _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4])); + assert(length % 8 == 0); + + for (i = 8; i + 8 <= length; i += 8) { + const int j = (i - 2) >> 1; + const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); + const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]); + const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]); + const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j); + const __m128i a2 = _mm_hadd_epi32(a0, a1); + const __m128i b2 = _mm_hadd_epi32(b0, b1); + const __m128i mul = _mm_mullo_epi32(_mm_add_epi32(a2, b2), w); + cost = _mm_add_epi32(mul, cost); + } + return HorizontalSum_SSE41(cost); +} //------------------------------------------------------------------------------ // Subtract-Green Transform @@ -44,6 +89,9 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data, //------------------------------------------------------------------------------ // Color Transform +// For sign-extended multiplying constants, pre-shifted by 5: +#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) + #define MK_CST_16(HI, LO) \ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) @@ -143,6 +191,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, extern void VP8LEncDspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) { + VP8LExtraCost = ExtraCost_SSE41; + VP8LExtraCostCombined = ExtraCostCombined_SSE41; VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41; VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41; VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41; |