diff options
author | Charles Schlosser <cs.schlosser@gmail.com> | 2024-04-19 02:02:27 +0000 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2024-04-19 02:02:27 +0000 |
commit | 5635d37f46acc2985aa66d9385665a76c3b9d5c7 (patch) | |
tree | 8fd6fd7eeb643a646b2a0f7dd38b026994647db3 | |
parent | f0795d35e3bc03fdceb119d6e5c6b9d724e3959a (diff) | |
download | eigen-5635d37f46acc2985aa66d9385665a76c3b9d5c7.tar.gz |
more pblend optimizations
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 33 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 20 |
2 files changed, 25 insertions, 28 deletions
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 6df332470..dac43fcd8 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -2134,35 +2134,28 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
   kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
 }
 
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) {
+  return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1],
+                           0 - ifPacket.select[0]);
+}
+
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) {
+  return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5],
+                          0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2],
+                          0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
                                     const Packet8f& elsePacket) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  const __m256i select =
-      _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
-                       ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  const __m256 true_mask = _mm256_castsi256_ps(_mm256_sub_epi32(_mm256_setzero_si256(), select));
-#else
-  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
-                                      ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  const __m256 true_mask = _mm256_cmp_ps(select, _mm256_setzero_ps(), _CMP_NEQ_UQ);
-#endif
-
+  const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
   return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
                                     const Packet4d& elsePacket) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  const __m256i select =
-      _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  const __m256d true_mask = _mm256_castsi256_pd(_mm256_sub_epi64(_mm256_setzero_si256(), select));
-#else
-  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  __m256d true_mask = _mm256_cmp_pd(select, _mm256_setzero_pd(), _CMP_NEQ_UQ);
-#endif
-
+  const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
   return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
 }
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 7e6a1b9a2..7bac3f9c7 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -2232,18 +2232,24 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
   kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
 }
 
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
+  return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
+  return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
                                     const Packet2l& elsePacket) {
-  const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]);
-  const __m128i true_mask = _mm_sub_epi64(_mm_setzero_si128(), select);
+  const __m128i true_mask = sse_blend_mask(ifPacket);
   return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
                                     const Packet4i& elsePacket) {
-  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  const __m128i true_mask = _mm_sub_epi32(_mm_setzero_si128(), select);
+  const __m128i true_mask = sse_blend_mask(ifPacket);
   return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
 }
 template <>
@@ -2254,15 +2260,13 @@ EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4u
 template <>
 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
                                     const Packet4f& elsePacket) {
-  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
-  const __m128i true_mask = _mm_sub_epi32(_mm_setzero_si128(), select);
+  const __m128i true_mask = sse_blend_mask(ifPacket);
   return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
 }
 template <>
 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
                                     const Packet2d& elsePacket) {
-  const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]);
-  const __m128i true_mask = _mm_sub_epi64(_mm_setzero_si128(), select);
+  const __m128i true_mask = sse_blend_mask(ifPacket);
   return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
 }
 