about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Charles Schlosser <cs.schlosser@gmail.com> 2024-04-19 02:02:27 +0000
committer Rasmus Munk Larsen <rmlarsen@google.com> 2024-04-19 02:02:27 +0000
commit 5635d37f46acc2985aa66d9385665a76c3b9d5c7 (patch)
tree 8fd6fd7eeb643a646b2a0f7dd38b026994647db3
parent f0795d35e3bc03fdceb119d6e5c6b9d724e3959a (diff)
download eigen-5635d37f46acc2985aa66d9385665a76c3b9d5c7.tar.gz
more pblend optimizations
-rw-r--r-- Eigen/src/Core/arch/AVX/PacketMath.h 33
-rw-r--r-- Eigen/src/Core/arch/SSE/PacketMath.h 20
2 files changed, 25 insertions, 28 deletions
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 6df332470..dac43fcd8 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -2134,35 +2134,28 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) {
+ return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1],
+ 0 - ifPacket.select[0]);
+}
+
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) {
+ return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5],
+ 0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2],
+ 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
template <>
EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
const Packet8f& elsePacket) {
-#ifdef EIGEN_VECTORIZE_AVX2
- const __m256i select =
- _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
- ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- const __m256 true_mask = _mm256_castsi256_ps(_mm256_sub_epi32(_mm256_setzero_si256(), select));
-#else
- const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
- ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- const __m256 true_mask = _mm256_cmp_ps(select, _mm256_setzero_ps(), _CMP_NEQ_UQ);
-#endif
-
+ const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
}
template <>
EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
const Packet4d& elsePacket) {
-#ifdef EIGEN_VECTORIZE_AVX2
- const __m256i select =
- _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- const __m256d true_mask = _mm256_castsi256_pd(_mm256_sub_epi64(_mm256_setzero_si256(), select));
-#else
- const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- __m256d true_mask = _mm256_cmp_pd(select, _mm256_setzero_pd(), _CMP_NEQ_UQ);
-#endif
-
+ const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
}
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 7e6a1b9a2..7bac3f9c7 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -2232,18 +2232,24 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
}
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
+ return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
+ return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
template <>
EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
const Packet2l& elsePacket) {
- const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]);
- const __m128i true_mask = _mm_sub_epi64(_mm_setzero_si128(), select);
+ const __m128i true_mask = sse_blend_mask(ifPacket);
return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
}
template <>
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
const Packet4i& elsePacket) {
- const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- const __m128i true_mask = _mm_sub_epi32(_mm_setzero_si128(), select);
+ const __m128i true_mask = sse_blend_mask(ifPacket);
return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
}
template <>
@@ -2254,15 +2260,13 @@ EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4u
template <>
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
const Packet4f& elsePacket) {
- const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- const __m128i true_mask = _mm_sub_epi32(_mm_setzero_si128(), select);
+ const __m128i true_mask = sse_blend_mask(ifPacket);
return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
}
template <>
EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
const Packet2d& elsePacket) {
- const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]);
- const __m128i true_mask = _mm_sub_epi64(_mm_setzero_si128(), select);
+ const __m128i true_mask = sse_blend_mask(ifPacket);
return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
}