author     easyaspi314 (Devin) <easyaspi314@users.noreply.github.com>  2019-02-06 06:37:16 -0500
committer  Victoria <victoria.zhislina@intel.com>  2019-02-06 14:37:16 +0300
commit     6516a619fa09f6d70712328e4099b4b456577ad4 (patch)
tree       c03283b3425c6330bfdf8f5ffdb5a0a522f2f3e9
parent     1a99e5a4870db6fcdb12bd1f642fdab07894a402 (diff)
download   neon_2_sse-6516a619fa09f6d70712328e4099b4b456577ad4.tar.gz
Add optimized routines for pairwise long adds and _mm_mullo_epi32 (#25)
* Add optimized routines for pairwise long adds and _mm_mullo_epi32

vpaddlq_uN can be implemented like so:

    {
        const __m128i ff = _mm_set1_epi2N((1 << N) - 1);
        __m128i low = _mm_and_si128(a, ff);
        __m128i high = _mm_srli_epi2N(a, N);
        return _mm_add_epi2N(low, high);
    }

and the other unsigned pairwise adds are the same.

vpaddlq_s32 can be implemented like so:

    {
        __m128i top, bot;
        bot = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
        bot = _MM_CVTEPI32_EPI64(bot);
        top = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 1));
        top = _MM_CVTEPI32_EPI64(top);
        return _mm_add_epi64(top, bot);
    }

And _mm_mullo_epi32 uses the same routine that GCC uses with vector extensions
(Clang uses a similar method, but it uses pshufd, which is slow on pre-Penryn chips):

    {
        __m128i a_high = _mm_srli_epi64(a, 32);
        __m128i low = _mm_mul_epu32(a, b);
        __m128i b_high = _mm_srli_epi64(b, 32);
        __m128i high = _mm_mul_epu32(a_high, b_high);
        low = _mm_shuffle_epi32(low, _MM_SHUFFLE(0, 0, 2, 0));
        high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
        return _mm_unpacklo_epi32(low, high);
    }
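To see the mask-and-shift pattern in isolation, here is a minimal, self-contained sketch (not part of the patch; the helper name pairwise_add_u16_sketch and the test values are made up) that mirrors the non-SSE4 vpaddlq_u16 path below and checks it against a scalar reference, assuming an SSE2-capable compiler:

    /* Sketch only: mask/shift pairwise long add for 16-bit lanes (SSE2),
       compared against a scalar reference. Not part of NEON_2_SSE.h. */
    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    static __m128i pairwise_add_u16_sketch(__m128i a)
    {
        const __m128i ff = _mm_set1_epi32(0xFFFF);  /* keep the low 16 bits of each 32-bit pair */
        __m128i low  = _mm_and_si128(a, ff);
        __m128i high = _mm_srli_epi32(a, 16);       /* bring the high 16 bits down */
        return _mm_add_epi32(low, high);            /* 4 sums, each fits easily in 32 bits */
    }

    int main(void)
    {
        uint16_t in[8] = { 1, 2, 30000, 40000, 5, 65535, 7, 8 };
        uint32_t out[4];
        _mm_storeu_si128((__m128i *)out,
                         pairwise_add_u16_sketch(_mm_loadu_si128((const __m128i *)in)));
        for (int i = 0; i < 4; i++)
            printf("%u (expected %u)\n", (unsigned)out[i],
                   (unsigned)in[2 * i] + in[2 * i + 1]);
        return 0;
    }

The same shape generalizes to 8- and 32-bit lanes by changing the mask and the shift width, which is what the patch does.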
-rw-r--r--   NEON_2_SSE.h   100
1 file changed, 53 insertions(+), 47 deletions(-)
diff --git a/NEON_2_SSE.h b/NEON_2_SSE.h
index 2fc9a2a..eff6dc9 100644
--- a/NEON_2_SSE.h
+++ b/NEON_2_SSE.h
@@ -2671,19 +2671,16 @@ _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
    return _mm_or_si128(res, cmp); //if cmp positive we are out of 16 bits, need to saturate to 0xffff
}
-
- _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
+ // method used by GCC with generic vector extensions
+ _NEON2SSE_INLINE __m128i _MM_MULLO_EPI32(__m128i a, __m128i b)
{
- _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
- int64_t res64;
- int i;
- _mm_store_si128((__m128i*)atmp, a);
- _mm_store_si128((__m128i*)btmp, b);
- for (i = 0; i<4; i++) {
- res64 = atmp[i] * btmp[i];
- res[i] = (int)(res64 & 0xffffffff);
- }
- return _mm_load_si128((__m128i*)res);
+ __m128i a_high = _mm_srli_epi64(a, 32);
+ __m128i low = _mm_mul_epu32(a, b);
+ __m128i b_high = _mm_srli_epi64(b, 32);
+ __m128i high = _mm_mul_epu32(a_high, b_high);
+ low = _mm_shuffle_epi32(low, _MM_SHUFFLE(0, 0, 2, 0));
+ high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi32(low, high);
}
_NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
@@ -6439,52 +6436,62 @@ _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
}
_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
+_NEON2SSE_INLINE int64x2_t vpaddlq_s32(int32x4_t a)
{
- _NEON2SSE_ALIGN_16 int32_t atmp[4];
- _NEON2SSE_ALIGN_16 int64_t res[2];
- _mm_store_si128((__m128i*)atmp, a);
- res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
- res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
- return _mm_load_si128((__m128i*)res);
+ __m128i top, bot;
+ bot = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ bot = _MM_CVTEPI32_EPI64(bot);
+ top = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 1));
+ top = _MM_CVTEPI32_EPI64(top);
+ return _mm_add_epi64(top, bot);
}
_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
{
- //no 8 bit hadd in IA32, need to go to 16 bit
- __m128i r16_1, r16_2;
- r16_1 = _MM_CVTEPU8_EPI16(a);
- //swap hi and low part of r to process the remaining data
- r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
- r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
- return _mm_hadd_epi16 (r16_1, r16_2);
+ const __m128i ff = _mm_set1_epi16(0xFF);
+ __m128i low = _mm_and_si128(a, ff);
+ __m128i high = _mm_srli_epi16(a, 8);
+ return _mm_add_epi16(low, high);
}
+#ifdef USE_SSE4
_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
{
- //serial solution looks faster than a SIMD one
- _NEON2SSE_ALIGN_16 uint16_t atmp[8];
- _NEON2SSE_ALIGN_16 uint32_t res[4];
- _mm_store_si128((__m128i*)atmp, a);
- res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
- res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
- res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
- res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
- return _mm_load_si128((__m128i*)res);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i low = _mm_blend_epi16(zero, a, 0x55); // 0b1010101
+ __m128i high = _mm_srli_epi32(a, 16);
+ return _mm_add_epi32(low, high);
}
_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
{
- _NEON2SSE_ALIGN_16 uint32_t atmp[4];
- _NEON2SSE_ALIGN_16 uint64_t res[2];
- _mm_store_si128((__m128i*)atmp, a);
- res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
- res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
- return _mm_load_si128((__m128i*)res);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i low = _mm_blend_epi16(zero, a, 0x33); // 0b00110011
+ __m128i high = _mm_srli_epi64(a, 32);
+ return _mm_add_epi64(low, high);
+}
+#else
+_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
+_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
+{
+ const __m128i ff = _mm_set1_epi32(0xFFFF);
+ __m128i low = _mm_and_si128(a, ff);
+ __m128i high = _mm_srli_epi32(a, 16);
+ return _mm_add_epi32(low, high);
+}
+
+_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
+{
+ const __m128i ff = _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
+ __m128i low = _mm_and_si128(a, ff);
+ __m128i high = _mm_srli_epi64(a, 32);
+ return _mm_add_epi64(low, high);
}
+#endif
//************************ Long pairwise add and accumulate **************************
//****************************************************************************************
@@ -6569,7 +6576,7 @@ _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8
}
_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+_NEON2SSE_INLINE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b)
{
uint32x4_t pad;
pad = vpaddlq_u16(b);
@@ -6577,13 +6584,12 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t
} //no optimal SIMD solution, serial is faster
_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+_NEON2SSE_INLINE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b)
{
- //no optimal SIMD solution, serial is faster
uint64x2_t pad;
pad = vpaddlq_u32(b);
return _mm_add_epi64(a, pad);
-} //no optimal SIMD solution, serial is faster
+}
//********** Folding maximum *************************************
//*******************************************************************
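For the multiply path, the even/odd _mm_mul_epu32 trick described in the commit message can likewise be exercised on its own. Below is a minimal sketch, assuming SSE2 and using a hypothetical helper name mullo_epi32_sketch (an illustration, not the header's _MM_MULLO_EPI32 itself), compared against plain 32-bit multiplication:

    /* Sketch only: 32-bit lane-wise multiply built from _mm_mul_epu32 (SSE2),
       following the even/odd-lane approach from the commit message. */
    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    static __m128i mullo_epi32_sketch(__m128i a, __m128i b)
    {
        __m128i a_high = _mm_srli_epi64(a, 32);                   /* move odd lanes into even positions */
        __m128i low    = _mm_mul_epu32(a, b);                     /* 64-bit products of even lanes */
        __m128i b_high = _mm_srli_epi64(b, 32);
        __m128i high   = _mm_mul_epu32(a_high, b_high);           /* 64-bit products of odd lanes */
        low  = _mm_shuffle_epi32(low,  _MM_SHUFFLE(0, 0, 2, 0));  /* keep the low 32 bits of each product */
        high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
        return _mm_unpacklo_epi32(low, high);                     /* interleave back into lane order */
    }

    int main(void)
    {
        int32_t a[4] = { 3, -7, 123456, -250 };
        int32_t b[4] = { 5, 11, 789, 40 };
        int32_t r[4];
        _mm_storeu_si128((__m128i *)r,
                         mullo_epi32_sketch(_mm_loadu_si128((const __m128i *)a),
                                            _mm_loadu_si128((const __m128i *)b)));
        for (int i = 0; i < 4; i++)
            printf("%d (expected %d)\n", (int)r[i], (int)(a[i] * b[i]));
        return 0;
    }

The low 32 bits of the unsigned products equal the low 32 bits of the signed products in two's complement, which is why the negative lanes come out correct even though _mm_mul_epu32 is an unsigned multiply.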