1 files changed, 18 insertions, 33 deletions
diff --git a/intel/filter_sse2_intrinsics.c b/intel/filter_sse2_intrinsics.c
index 5e8553fbb..d3c0fe9e2 100644
--- a/intel/filter_sse2_intrinsics.c
+++ b/intel/filter_sse2_intrinsics.c
@@ -1,12 +1,11 @@
 
 /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
  *
+ * Copyright (c) 2018 Cosmin Truta
  * Copyright (c) 2016-2017 Glenn Randers-Pehrson
  * Written by Mike Klein and Matt Sarett
  * Derived from arm/filter_neon_intrinsics.c
  *
- * Last changed in libpng 1.6.31 [July 27, 2017]
- *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
  * and license in png.h
@@ -29,39 +28,25 @@
  */
 
 static __m128i load4(const void* p) {
-   return _mm_cvtsi32_si128(*(const int*)p);
+   int tmp;
+   memcpy(&tmp, p, sizeof(tmp));
+   return _mm_cvtsi32_si128(tmp);
 }
 
 static void store4(void* p, __m128i v) {
-   *(int*)p = _mm_cvtsi128_si32(v);
+   int tmp = _mm_cvtsi128_si32(v);
+   memcpy(p, &tmp, sizeof(int));
 }
 
 static __m128i load3(const void* p) {
-   /* We'll load 2 bytes, then 1 byte,
-    * then mask them together, and finally load into SSE.
-    */
-   const png_uint_16* p01 = (png_const_uint_16p)p;
-   const png_byte*    p2  = (const png_byte*)(p01+1);
-
-   png_uint_32 v012 = (png_uint_32)(*p01)
-                    | (png_uint_32)(*p2) << 16;
-   return load4(&v012);
+   png_uint_32 tmp = 0;
+   memcpy(&tmp, p, 3);
+   return _mm_cvtsi32_si128(tmp);
 }
 
 static void store3(void* p, __m128i v) {
-   /* We'll pull from SSE as a 32-bit int, then write
-    * its bottom two bytes, then its third byte.
-    */
-   png_uint_32 v012;
-   png_uint_16* p01;
-   png_byte*    p2;
-
-   store4(&v012, v);
-
-   p01 = (png_uint_16p)p;
-   p2  = (png_byte*)(p01+1);
-   *p01 = (png_uint_16)v012;
-   *p2  = (png_byte)(v012 >> 16);
+   int tmp = _mm_cvtsi128_si32(v);
+   memcpy(p, &tmp, 3);
 }
 
 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
@@ -71,7 +56,7 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
     * There is no pixel to the left of the first pixel.  It's encoded directly.
     * That works with our main loop if we just say that left pixel was zero.
     */
-   png_size_t rb;
+   size_t rb;
 
    __m128i a, d = _mm_setzero_si128();
 
@@ -104,7 +89,7 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
     * There is no pixel to the left of the first pixel.  It's encoded directly.
     * That works with our main loop if we just say that left pixel was zero.
     */
-   png_size_t rb;
+   size_t rb;
 
    __m128i a, d = _mm_setzero_si128();
 
@@ -131,7 +116,7 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
     * perfectly with our loop if we make sure a starts at zero.
     */
 
-   png_size_t rb;
+   size_t rb;
 
    const __m128i zero = _mm_setzero_si128();
 
@@ -185,7 +170,7 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
     * predicted to be half of the pixel above it.  So again, this works
     * perfectly with our loop if we make sure a starts at zero.
     */
-   png_size_t rb;
+   size_t rb;
    const __m128i zero = _mm_setzero_si128();
    __m128i    b;
    __m128i a, d = zero;
@@ -257,7 +242,7 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
     * Here we zero b and d, which become c and a respectively at the start of
     * the loop.
     */
-   png_size_t rb;
+   size_t rb;
    const __m128i zero = _mm_setzero_si128();
    __m128i c, b = zero,
            a, d = zero;
@@ -274,7 +259,7 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
 
       /* (p-a) == (a+b-c - a) == (b-c) */
-   
+
       pa = _mm_sub_epi16(b,c);
 
       /* (p-b) == (a+b-c - b) == (a-c) */
@@ -356,7 +341,7 @@ void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
     * Here we zero b and d, which become c and a respectively at the start of
     * the loop.
     */
-   png_size_t rb;
+   size_t rb;
    const __m128i zero = _mm_setzero_si128();
    __m128i pa,pb,pc,smallest,nearest;
    __m128i c, b = zero,