author     dario mambro <dario.mambro@gmail.com>    2020-12-19 15:25:21 +0100
committer  dario mambro <dario.mambro@gmail.com>    2020-12-24 14:32:36 +0100
commit     8dc269e569cfe6c796bf37a30c5cc4798be8a750 (patch)
tree       e4389a26f6d726ce347dcf747de4a37b161d760e
parent     5fb07d7117dab7e1511895e8106224876abadd00 (diff)
download   pffft-8dc269e569cfe6c796bf37a30c5cc4798be8a750.tar.gz
simplified some neon code, changed some tabs into spaces
(cherry picked from commit c92f08c8226e4c069436751b09554ada362ae7c8)
-rw-r--r--  simd/pf_neon_double.h            31
-rw-r--r--  simd/pf_neon_double_from_avx.h   83
2 files changed, 52 insertions(+), 62 deletions(-)
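
For context before the diff: the simplification removes the generic _mm256_insertf128_pd(a, b, imm8) emulation, whose run-time lane select needs a vceqq_s64/vbslq_f64 mask, and replaces its only uses (all with imm8 == 1) by a helper that always writes the upper 128-bit lane. Below is a minimal before/after sketch mirroring the vect_f64 layout from simd/pf_neon_double_from_avx.h; the names with the insertf128_pd_* prefix here are illustrative stand-ins, not part of the commit.

#include <arm_neon.h>
#include <assert.h>

typedef struct { float64x2_t vect_f64[2]; } __m256d;  /* as in pf_neon_double_from_avx.h */
typedef float64x2_t __m128d;

/* Before: the lane is chosen at run time through a compare/select mask. */
static inline __m256d insertf128_pd_generic(__m256d a, __m128d b, int imm8)
{
    assert(imm8 == 0 || imm8 == 1);
    __m256d res;
    uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0));
    res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]);
    res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b);
    return res;
}

/* After: every caller passes imm8 == 1, so the helper just keeps lane 0
   of a and stores b into lane 1, with no mask at all. */
static inline __m256d insertf128_pd_1(__m256d a, __m128d b)
{
    __m256d res;
    res.vect_f64[0] = a.vect_f64[0];
    res.vect_f64[1] = b;
    return res;
}
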
diff --git a/simd/pf_neon_double.h b/simd/pf_neon_double.h
index 1c8b852..140f465 100644
--- a/simd/pf_neon_double.h
+++ b/simd/pf_neon_double.h
@@ -65,6 +65,13 @@ typedef union v4sf_union {
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)
+FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
+{
+ __m256d res;
+ res.vect_f64[0] = a.vect_f64[0];
+ res.vect_f64[1] = b;
+ return res;
+}
FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
@@ -135,14 +142,12 @@ out2 = [ in1[2], in2[2], in1[3], in2[3] ]
__m128d low2__ = _mm256_castpd256_pd128(in2); \
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
- __m256d tmp__ = _mm256_insertf128_pd( \
+ __m256d tmp__ = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
- _mm_shuffle_pd_11(low1__, low2__), \
- 1); \
- out2 = _mm256_insertf128_pd( \
+ _mm_shuffle_pd_11(low1__, low2__)); \
+ out2 = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
- _mm_shuffle_pd_11(high1__, high2__), \
- 1); \
+ _mm_shuffle_pd_11(high1__, high2__)); \
out1 = tmp__; \
}
@@ -155,14 +160,12 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
__m128d low2__ = _mm256_castpd256_pd128(in2); \
__m128d high1__ = _mm256_extractf128_pd(in1, 1); \
__m128d high2__ = _mm256_extractf128_pd(in2, 1); \
- __m256d tmp__ = _mm256_insertf128_pd( \
+ __m256d tmp__ = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
- _mm_shuffle_pd_00(low2__, high2__), \
- 1); \
- out2 = _mm256_insertf128_pd( \
+ _mm_shuffle_pd_00(low2__, high2__)); \
+ out2 = _mm256_insertf128_pd_1( \
_mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
- _mm_shuffle_pd_11(low2__, high2__), \
- 1); \
+ _mm_shuffle_pd_11(low2__, high2__)); \
out1 = tmp__; \
}
@@ -184,13 +187,13 @@ out2 = [ in1[1], in1[3], in2[1], in2[3] ]
return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
- _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)
+ _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))
/* reverse/flip all floats */
# define VREV_S(a) _mm256_reverse(a)
/* reverse/flip complex floats */
-# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)
+# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
diff --git a/simd/pf_neon_double_from_avx.h b/simd/pf_neon_double_from_avx.h
index c8cd74e..5cce17e 100644
--- a/simd/pf_neon_double_from_avx.h
+++ b/simd/pf_neon_double_from_avx.h
@@ -25,9 +25,7 @@
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#else
@@ -37,99 +35,88 @@
#endif
#define FORCE_INLINE static inline
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
#endif
typedef struct {
- float32x4_t vect_f32[2];
+ float32x4_t vect_f32[2];
} __m256;
typedef struct {
- float64x2_t vect_f64[2];
+ float64x2_t vect_f64[2];
} __m256d;
typedef float64x2_t __m128d;
FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
- __m256d ret;
- ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
- return ret;
+ __m256d ret;
+ ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
+ return ret;
}
FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
- __m256d res_m256d;
- res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
- res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
- return res_m256d;
+ __m256d res_m256d;
+ res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
+ res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
+ return res_m256d;
}
FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
- __m256d res_m256d;
- res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
- res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
- return res_m256d;
+ __m256d res_m256d;
+ res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
+ res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
+ return res_m256d;
}
FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
- __m256d res_m256d;
- res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
- res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
- return res_m256d;
+ __m256d res_m256d;
+ res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
+ res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
+ return res_m256d;
}
FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
- __m256d ret;
- ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
- return ret;
+ __m256d ret;
+ ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
+ return ret;
}
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
- __m256d res;
- res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
- res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
- return res;
+ __m256d res;
+ res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+ res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+ return res;
}
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
- __m256d res;
- res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
- res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
- return res;
+ __m256d res;
+ res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
+ res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
+ return res;
}
FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
- return a.vect_f64[0];
+ return a.vect_f64[0];
}
FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
- assert(imm8 >= 0 && imm8 <= 1);
- return a.vect_f64[imm8];
-}
-FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8)
-{
- assert(imm8 == 0 || imm8 == 1);
- __m256d res;
- uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0));
- res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]);
- res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b);
- return res;
+ assert(imm8 >= 0 && imm8 <= 1);
+ return a.vect_f64[imm8];
}
+
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
- __m256d res;
- res.vect_f64[0] = a;
- return res;
+ __m256d res;
+ res.vect_f64[0] = a;
+ return res;
}
#endif /* PF_AVX_DBL_H */
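
As a small usage sketch of the emulation layer touched by this commit, the following checks the VSWAPHL contract documented above ("return [ b[0], b[1], a[2], a[3] ]") on AArch64. The load4/store4/vswaphl/main scaffolding is hypothetical and only illustrates the semantics; it is not code from the repository.

#include <arm_neon.h>
#include <stdio.h>

typedef struct { float64x2_t vect_f64[2]; } __m256d;  /* as in pf_neon_double_from_avx.h */
typedef float64x2_t __m128d;

static inline __m256d load4(const double *p)   /* like _mm256_loadu_pd */
{
    __m256d r;
    r.vect_f64[0] = vld1q_f64(p);
    r.vect_f64[1] = vld1q_f64(p + 2);
    return r;
}

static inline void store4(double *p, __m256d v)
{
    vst1q_f64(p, v.vect_f64[0]);
    vst1q_f64(p + 2, v.vect_f64[1]);
}

/* VSWAPHL(a, b): low half taken from b, high half taken from a. */
static inline __m256d vswaphl(__m256d a, __m256d b)
{
    __m256d r;
    r.vect_f64[0] = b.vect_f64[0];  /* b[0], b[1] */
    r.vect_f64[1] = a.vect_f64[1];  /* a[2], a[3] */
    return r;
}

int main(void)
{
    double a[4] = {0, 1, 2, 3}, b[4] = {10, 11, 12, 13}, out[4];
    store4(out, vswaphl(load4(a), load4(b)));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* prints: 10 11 2 3 */
    return 0;
}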