about | summary | refs | log | tree | commit | diff
path: root/intel/filter_sse2_intrinsics.c
diff options
context:
space:
mode:
Diffstat (limited to 'intel/filter_sse2_intrinsics.c')
-rw-r--r-- intel/filter_sse2_intrinsics.c | 51
1 file changed, 18 insertions, 33 deletions
diff --git a/intel/filter_sse2_intrinsics.c b/intel/filter_sse2_intrinsics.c
index 5e8553fbb..d3c0fe9e2 100644
--- a/intel/filter_sse2_intrinsics.c
+++ b/intel/filter_sse2_intrinsics.c
@@ -1,12 +1,11 @@
/* filter_sse2_intrinsics.c - SSE2 optimized filter functions
*
+ * Copyright (c) 2018 Cosmin Truta
* Copyright (c) 2016-2017 Glenn Randers-Pehrson
* Written by Mike Klein and Matt Sarett
* Derived from arm/filter_neon_intrinsics.c
*
- * Last changed in libpng 1.6.31 [July 27, 2017]
- *
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
@@ -29,39 +28,25 @@
*/
static __m128i load4(const void* p) {
- return _mm_cvtsi32_si128(*(const int*)p);
+ int tmp;
+ memcpy(&tmp, p, sizeof(tmp));
+ return _mm_cvtsi32_si128(tmp);
}
static void store4(void* p, __m128i v) {
- *(int*)p = _mm_cvtsi128_si32(v);
+ int tmp = _mm_cvtsi128_si32(v);
+ memcpy(p, &tmp, sizeof(int));
}
static __m128i load3(const void* p) {
- /* We'll load 2 bytes, then 1 byte,
- * then mask them together, and finally load into SSE.
- */
- const png_uint_16* p01 = (png_const_uint_16p)p;
- const png_byte* p2 = (const png_byte*)(p01+1);
-
- png_uint_32 v012 = (png_uint_32)(*p01)
- | (png_uint_32)(*p2) << 16;
- return load4(&v012);
+ png_uint_32 tmp = 0;
+ memcpy(&tmp, p, 3);
+ return _mm_cvtsi32_si128(tmp);
}
static void store3(void* p, __m128i v) {
- /* We'll pull from SSE as a 32-bit int, then write
- * its bottom two bytes, then its third byte.
- */
- png_uint_32 v012;
- png_uint_16* p01;
- png_byte* p2;
-
- store4(&v012, v);
-
- p01 = (png_uint_16p)p;
- p2 = (png_byte*)(p01+1);
- *p01 = (png_uint_16)v012;
- *p2 = (png_byte)(v012 >> 16);
+ int tmp = _mm_cvtsi128_si32(v);
+ memcpy(p, &tmp, 3);
}
void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
@@ -71,7 +56,7 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
* There is no pixel to the left of the first pixel. It's encoded directly.
* That works with our main loop if we just say that left pixel was zero.
*/
- png_size_t rb;
+ size_t rb;
__m128i a, d = _mm_setzero_si128();
@@ -104,7 +89,7 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
* There is no pixel to the left of the first pixel. It's encoded directly.
* That works with our main loop if we just say that left pixel was zero.
*/
- png_size_t rb;
+ size_t rb;
__m128i a, d = _mm_setzero_si128();
@@ -131,7 +116,7 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
* perfectly with our loop if we make sure a starts at zero.
*/
- png_size_t rb;
+ size_t rb;
const __m128i zero = _mm_setzero_si128();
@@ -185,7 +170,7 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
* predicted to be half of the pixel above it. So again, this works
* perfectly with our loop if we make sure a starts at zero.
*/
- png_size_t rb;
+ size_t rb;
const __m128i zero = _mm_setzero_si128();
__m128i b;
__m128i a, d = zero;
@@ -257,7 +242,7 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
* Here we zero b and d, which become c and a respectively at the start of
* the loop.
*/
- png_size_t rb;
+ size_t rb;
const __m128i zero = _mm_setzero_si128();
__m128i c, b = zero,
a, d = zero;
@@ -274,7 +259,7 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
/* (p-a) == (a+b-c - a) == (b-c) */
-
+
pa = _mm_sub_epi16(b,c);
/* (p-b) == (a+b-c - b) == (a-c) */
@@ -356,7 +341,7 @@ void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
* Here we zero b and d, which become c and a respectively at the start of
* the loop.
*/
- png_size_t rb;
+ size_t rb;
const __m128i zero = _mm_setzero_si128();
__m128i pa,pb,pc,smallest,nearest;
__m128i c, b = zero,