aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorhayati ayguen <h_ayguen@web.de>2020-11-11 23:09:58 +0100
committerhayati ayguen <h_ayguen@web.de>2020-11-11 23:09:58 +0100
commit29eb8477d4d7cd274b5432737725aadcd3df2767 (patch)
treec28a0903ffb5c2aad13c78a9cf2f1d0c0b416dfb
parentee17cb0c97146a888c8d925999413f20934676fb (diff)
downloadpffft-29eb8477d4d7cd274b5432737725aadcd3df2767.tar.gz
added initial PFDSP library with mixer, carrier (generation) and cic functions
portions copied from csdr library Signed-off-by: hayati ayguen <h_ayguen@web.de>
-rw-r--r--CMakeLists.txt30
-rw-r--r--bench_mixers.c517
-rw-r--r--fmv.h20
-rw-r--r--pf_carrier.cpp298
-rw-r--r--pf_carrier.h75
-rw-r--r--pf_cic.cpp252
-rw-r--r--pf_cic.h58
-rw-r--r--pf_mixer.cpp750
-rw-r--r--pf_mixer.h177
-rw-r--r--[-rwxr-xr-x]use_gcc8.inc0
10 files changed, 2177 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b33e6a7..ec92e24 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,6 +145,25 @@ set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
######################################################
+if (USE_TYPE_FLOAT)
+
+ add_library(PFDSP STATIC pf_mixer.cpp pf_mixer.h pf_carrier.cpp pf_carrier.h pf_cic.cpp pf_cic.h fmv.h )
+ target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES)
+ if (USE_DEBUG_ASAN)
+ target_compile_options(PFDSP PRIVATE "-fsanitize=address")
+ endif()
+ if (USE_SIMD AND USE_SIMD_NEON)
+ target_compile_definitions(PFDSP PRIVATE PFFFT_ENABLE_NEON=1)
+ target_compile_options(PFDSP PRIVATE "-mfpu=neon")
+ endif()
+ target_link_libraries( PFDSP ${MATHLIB} )
+ set_property(TARGET PFDSP APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+ )
+endif()
+
+######################################################
+
add_library(FFTPACK STATIC fftpack.c fftpack.h)
target_compile_definitions(FFTPACK PRIVATE _USE_MATH_DEFINES)
target_link_libraries( FFTPACK ${MATHLIB} )
@@ -282,6 +301,17 @@ endif()
######################################################
+if (USE_TYPE_FLOAT)
+ add_executable(bench_pf_mixer_float bench_mixers.c)
+ target_compile_definitions(bench_pf_mixer_float PRIVATE _USE_MATH_DEFINES)
+ target_compile_definitions(bench_pf_mixer_float PRIVATE PFFFT_ENABLE_FLOAT)
+
+ target_link_libraries( bench_pf_mixer_float PFDSP ${ASANLIB} )
+
+endif()
+
+######################################################
+
enable_testing()
if (USE_TYPE_FLOAT)
diff --git a/bench_mixers.c b/bench_mixers.c
new file mode 100644
index 0000000..b659020
--- /dev/null
+++ b/bench_mixers.c
@@ -0,0 +1,517 @@
+/*
+ Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de )
+
+ bench for mixer algorithm/implementations
+
+ */
+
+#include <pf_mixer.h>
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#define HAVE_SYS_TIMES
+
+#ifdef HAVE_SYS_TIMES
+# include <sys/times.h>
+# include <unistd.h>
+#endif
+
+#define BENCH_REF_TRIG_FUNC 1
+#define BENCH_OUT_OF_PLACE_ALGOS 1
+#define BENCH_INPLACE_ALGOS 1
+
+
+#if defined(HAVE_SYS_TIMES)
+ static double ttclk = 0.;
+
+ static double uclock_sec(void)
+ {
+ struct tms t;
+ if (ttclk == 0.)
+ ttclk = sysconf(_SC_CLK_TCK);
+ times(&t);
+ /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
+ return ((double)t.tms_utime) / ttclk;
+ }
+# else
+ double uclock_sec(void)
+ { return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+void save(complexf * d, int B, int N, const char * fn)
+{
+ if (!fn)
+ fn = "/dev/shm/bench.bin";
+ FILE * f = fopen(fn, "wb");
+ if (!f) {
+ fprintf(stderr, "error writing result to %s\n", fn);
+ return;
+ }
+ for (int off = 0; off + B <= N; off += B)
+ {
+ fwrite(d+off, sizeof(complexf), B, f);
+ }
+ fclose(f);
+}
+
+
+double bench_shift_math_cc(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ complexf *output = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(output, B, off, NULL);
+ free(input);
+ free(output);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_table_cc(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ int table_size=65536;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ complexf *output = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+
+ shift_table_data_t table_data = shift_table_init(table_size);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(output, B, off, NULL);
+ free(input);
+ free(output);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_addfast(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ int table_size=65536;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ complexf *output = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_addfast_data_t state = shift_addfast_init(-0.0009F);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ phase = shift_addfast_cc(input+off, output+off, B, &state, phase);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(output, B, off, NULL);
+ free(input);
+ free(output);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_unroll_oop(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ complexf *output = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ phase = shift_unroll_cc(input+off, output+off, B, &state, phase);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(output, B, off, NULL);
+ free(input);
+ free(output);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+double bench_shift_unroll_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ phase = shift_unroll_inp_c(input+off, B, &state, phase);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, NULL);
+ free(input);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+
+double bench_shift_limited_unroll_oop(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ complexf *output = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_limited_unroll_cc(input+off, output+off, B, &state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(output, B, off, NULL);
+ free(input);
+ free(output);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_limited_unroll_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_limited_unroll_inp_c(input+off, B, &state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, NULL);
+ free(input);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_limited_unroll_sse_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+ shift_limited_unroll_sse_data_t *state = malloc(sizeof(shift_limited_unroll_sse_data_t));
+
+ *state = shift_limited_unroll_sse_init(-0.0009F, 0.0F);
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_limited_unroll_sse_inp_c(input+off, B, state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, NULL);
+ free(input);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_rec_osc_cc_oop(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ complexf *output = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state, shift_state;
+ shift_recursive_osc_conf_t gen_conf, shift_conf;
+
+ shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ //save(input, B, off, "/dev/shm/bench_inp.bin");
+ save(output, B, off, NULL);
+ free(input);
+ free(output);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_rec_osc_cc_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state, shift_state;
+ shift_recursive_osc_conf_t gen_conf, shift_conf;
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+ shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_recursive_osc_inp_c(input+off, B, &shift_conf, &shift_state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, NULL);
+ free(input);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+double bench_shift_rec_osc_sse_c_inp(int B, int N) {
+ double t0, t1, tstop, T, nI;
+ int iter, off;
+ float phase = 0.0F;
+ complexf *input = (complexf *)malloc(N * sizeof(complexf));
+ shift_recursive_osc_t gen_state;
+ shift_recursive_osc_conf_t gen_conf;
+
+ shift_recursive_osc_sse_t *shift_state = malloc(sizeof(shift_recursive_osc_sse_t));
+ shift_recursive_osc_sse_conf_t shift_conf;
+
+ shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
+ gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
+
+ shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);
+
+ iter = 0;
+ off = 0;
+ t0 = uclock_sec();
+ tstop = t0 + 0.5; /* benchmark duration: 500 ms */
+ do {
+ // work
+ shift_recursive_osc_sse_inp_c(input+off, B, &shift_conf, shift_state);
+
+ off += B;
+ ++iter;
+ t1 = uclock_sec();
+ } while ( t1 < tstop && off + B < N );
+
+ save(input, B, off, NULL);
+ free(input);
+ printf("processed %f Msamples\n", off * 1E-6);
+ T = ( t1 - t0 ); /* duration per fft() */
+ nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */
+ return (nI / T); /* normalized iterations per second */
+}
+
+
+
+int main(int argc, char **argv)
+{
+ double rt;
+
+ // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
+ int B = 8 * 1024;
+ int N = 64 * 1024 * 1024;
+
+#if BENCH_REF_TRIG_FUNC
+ printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
+ rt = bench_shift_math_cc(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+#endif
+
+#if BENCH_OUT_OF_PLACE_ALGOS
+ printf("starting bench of shift_table_cc (out-of-place) ..\n");
+ rt = bench_shift_table_cc(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
+ rt = bench_shift_addfast(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
+ rt = bench_shift_unroll_oop(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
+ rt = bench_shift_limited_unroll_oop(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
+ rt = bench_shift_rec_osc_cc_oop(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+#endif
+
+#if BENCH_INPLACE_ALGOS
+ printf("starting bench of shift_unroll_cc in-place ..\n");
+ rt = bench_shift_unroll_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_limited_unroll_cc in-place ..\n");
+ rt = bench_shift_limited_unroll_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_limited_unroll_sse_inp_c in-place ..\n");
+ rt = bench_shift_limited_unroll_sse_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_recursive_osc_cc in-place ..\n");
+ rt = bench_shift_rec_osc_cc_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+ printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
+ rt = bench_shift_rec_osc_sse_c_inp(B, N);
+ printf(" %f MSamples/sec\n\n", rt * 1E-6);
+
+#endif
+
+ return 0;
+}
+
diff --git a/fmv.h b/fmv.h
new file mode 100644
index 0000000..0aa439d
--- /dev/null
+++ b/fmv.h
@@ -0,0 +1,20 @@
+#ifndef FMV_H
+
+#if HAVE_FUNC_ATTRIBUTE_IFUNC
+#if defined(__has_attribute)
+#if __has_attribute(target_clones)
+#if defined(__x86_64)
+
+// see https://gcc.gnu.org/wiki/FunctionMultiVersioning
+#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default")))
+#define HAVE_PF_TARGET_CLONES 1
+#endif
+#endif
+#endif
+#endif
+
+#ifndef PF_TARGET_CLONES
+#define PF_TARGET_CLONES
+#endif
+
+#endif
diff --git a/pf_carrier.cpp b/pf_carrier.cpp
new file mode 100644
index 0000000..d751a55
--- /dev/null
+++ b/pf_carrier.cpp
@@ -0,0 +1,298 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* include own header first, to see missing includes */
+#include "pf_carrier.h"
+#include "fmv.h"
+
+#include <limits.h>
+#include <assert.h>
+
+
+PF_TARGET_CLONES
+void generate_dc_f(float* output, int size)
+{
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+i*0 */
+ output[i++]=(127.0F / 128.0F);
+ output[i++]=0.0F;
+ }
+}
+
+PF_TARGET_CLONES
+void generate_dc_s16(short* output, int size)
+{
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+i*0 */
+ output[i++]=SHRT_MAX;
+ output[i++]=0;
+ }
+}
+
+PF_TARGET_CLONES
+void generate_pos_fs4_f(float* output, int size)
+{
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+i*0 */
+ output[i++]=(127.0F / 128.0F);
+ output[i++]=0.0F;
+ /* exp(i* +pi/2) = 0+i*1 */
+ output[i++]=0.0F;
+ output[i++]=(127.0F / 128.0F);
+ /* exp(i* +pi) = -1+i*0 */
+ output[i++]=(-127.0F / 128.0F);
+ output[i++]=0.0F;
+ /* exp(i* -pi/2) = 0+i*-1 */
+ output[i++]=0.0F;
+ output[i++]=(-127.0F / 128.0F);
+ }
+}
+
+PF_TARGET_CLONES
+void generate_pos_fs4_s16(short* output, int size)
+{
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+i*0 */
+ output[i++]=SHRT_MAX;
+ output[i++]=0;
+ /* exp(i* +pi/2) = 0+i*1 */
+ output[i++]=0;
+ output[i++]=SHRT_MAX;
+ /* exp(i* +pi) = -1+i*0 */
+ output[i++]=-SHRT_MAX;
+ output[i++]=0;
+ /* exp(i* -pi/2) = 0+i*-1 */
+ output[i++]=0;
+ output[i++]=-SHRT_MAX;
+ }
+}
+
+PF_TARGET_CLONES
+void generate_neg_fs4_f(float* output, int size)
+{
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+i*0 */
+ output[i++]=(127.0F / 128.0F);
+ output[i++]=0.0F;
+ /* exp(i* -pi/2) = 0+i*-1 */
+ output[i++]=0.0F;
+ output[i++]=(-127.0F / 128.0F);
+ /* exp(i* +pi) = -1+i*0 */
+ output[i++]=(-127.0F / 128.0F);
+ output[i++]=0.0F;
+ /* exp(i* +pi/2) = 0+i*1 */
+ output[i++]=0.0F;
+ output[i++]=(127.0F / 128.0F);
+ }
+}
+
+PF_TARGET_CLONES
+void generate_neg_fs4_s16(short* output, int size)
+{
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+i*0 */
+ output[i++]=SHRT_MAX;
+ output[i++]=0;
+ /* exp(i* -pi/2) = 0+i*-1 */
+ output[i++]=0;
+ output[i++]=-SHRT_MAX;
+ /* exp(i* +pi) = -1+i*0 */
+ output[i++]=-SHRT_MAX;
+ output[i++]=0;
+ /* exp(i* +pi/2) = 0+i*1 */
+ output[i++]=0;
+ output[i++]=SHRT_MAX;
+ }
+}
+
+/****************************************************/
+
+PF_TARGET_CLONES
+void generate_dc_pos_fs4_s16(short* output, int size)
+{
+ const int m = SHRT_MAX / 2;
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+1+i*0 */
+ output[i++]=m+m;
+ output[i++]=0;
+ /* exp(i* +pi/2) = 1+0+i*1 */
+ output[i++]=m+0;
+ output[i++]=m;
+ /* exp(i* +pi) = 1-1+i*0 */
+ output[i++]=m-m;
+ output[i++]=0;
+ /* exp(i* -pi/2) = 1+0+i*-1 */
+ output[i++]=m;
+ output[i++]=-m;
+ }
+}
+
+PF_TARGET_CLONES
+void generate_dc_neg_fs4_s16(short* output, int size)
+{
+ const int m = SHRT_MAX / 2;
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* exp(i*0) = 1+1+i*0 */
+ output[i++]=m+m;
+ output[i++]=0;
+ /* exp(i* -pi/2) = 1+0+i*-1 */
+ output[i++]=m+0;
+ output[i++]=-m;
+ /* exp(i* +pi) = 1-1+i*0 */
+ output[i++]=m-m;
+ output[i++]=0;
+ /* exp(i* +pi/2) = 1+0+i*1 */
+ output[i++]=m+0;
+ output[i++]=m;
+ }
+}
+
+PF_TARGET_CLONES
+void generate_pos_neg_fs4_s16(short* output, int size)
+{
+ const int m = SHRT_MAX / 2;
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* pos(0) + neg(0) = exp(i* 0 ) + exp(i* 0 ) = 1 +i* 0 + 1 +i* 0 */
+ output[i++]=m;
+ output[i++]=-m;
+
+ /* pos(1) + neg(1) = exp(i* +pi/2) + exp(i* -pi/2) = 0 +i* 1 + 0 +i* -1 */
+ output[i++]=-m;
+ output[i++]=m;
+
+ /* pos(2) + neg(2) = exp(i* +pi ) + exp(i* +pi ) = -1 +i* 0 + -1 +i* 0 */
+ output[i++]=-m;
+ output[i++]=m;
+
+ /* pos(3) + neg(3) = exp(i* -pi/2) + exp(i* +pi/2) = 0 +i* -1 + 0 +i* 1 */
+ output[i++]=m;
+ output[i++]=-m;
+ }
+}
+
+PF_TARGET_CLONES
+void generate_dc_pos_neg_fs4_s16(short* output, int size)
+{
+ const int m = SHRT_MAX / 2;
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* dc + pos(0) + neg(0) = dc + exp(i* 0 ) + exp(i* 0 ) = 1 +i* 0 + 1 +i* 0 */
+ output[i++]=m+m;
+ output[i++]=-m;
+
+ /* dc + pos(1) + neg(1) = dc + exp(i* +pi/2) + exp(i* -pi/2) = 0 +i* 1 + 0 +i* -1 */
+ output[i++]=0;
+ output[i++]=m;
+
+ /* dc + pos(2) + neg(2) = dc + exp(i* +pi ) + exp(i* +pi ) = -1 +i* 0 + -1 +i* 0 */
+ output[i++]=0;
+ output[i++]=m;
+
+ /* dc + pos(3) + neg(3) = dc + exp(i* -pi/2) + exp(i* +pi/2) = 0 +i* -1 + 0 +i* 1 */
+ output[i++]=m+m;
+ output[i++]=-m;
+ }
+}
+
+
+PF_TARGET_CLONES
+void generate_pos_neg_fs2_s16(short* output, int size)
+{
+ const int m = SHRT_MAX / 2;
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* dc + exp(i* 0 ) = +1 */
+ output[i++]=m;
+ output[i++]=0;
+ /* dc + exp(i* pi) = -1 */
+ output[i++]=-m;
+ output[i++]=0;
+ /* dc + exp(i* 0 ) = +1 */
+ output[i++]=m;
+ output[i++]=0;
+ /* dc + exp(i* pi) = -1 */
+ output[i++]=-m;
+ output[i++]=0;
+ }
+}
+
+PF_TARGET_CLONES
+void generate_dc_pos_neg_fs2_s16(short* output, int size)
+{
+ const int m = SHRT_MAX / 2;
+ /* size must be multiple of 4 */
+ assert(!(size&3));
+ for(int i=0;i<2*size;)
+ {
+ /* with dc = i*1 */
+ /* dc + exp(i* 0 ) = i*1 +1 */
+ output[i++]=m;
+ output[i++]=m;
+ /* dc + exp(i* pi) = i*1 -1 */
+ output[i++]=-m;
+ output[i++]=m;
+ /* dc + exp(i* 0 ) = i*1 +1 */
+ output[i++]=m;
+ output[i++]=m;
+ /* dc + exp(i* pi) = i*1 -1 */
+ output[i++]=-m;
+ output[i++]=m;
+ }
+}
+
+
diff --git a/pf_carrier.h b/pf_carrier.h
new file mode 100644
index 0000000..c328ce0
--- /dev/null
+++ b/pf_carrier.h
@@ -0,0 +1,75 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ _____ _
+ / ____| | |
+ | | ___ _ __ ___ _ __ | | _____ __
+ | | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
+ | |___| (_) | | | | | | |_) | | __/> <
+ \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
+ | |
+ |_|
+*/
+
+typedef struct complexf_s { float i; float q; } complexf;
+
+
+/* generation functions */
+void generate_dc_f(float* output, int size);
+void generate_dc_s16(short* output, int size);
+void generate_pos_fs4_f(float* output, int size);
+void generate_pos_fs4_s16(short* output, int size);
+void generate_neg_fs4_f(float* output, int size);
+void generate_neg_fs4_s16(short* output, int size);
+
+void generate_dc_pos_fs4_s16(short* output, int size);
+void generate_dc_neg_fs4_s16(short* output, int size);
+void generate_pos_neg_fs4_s16(short* output, int size);
+void generate_dc_pos_neg_fs4_s16(short* output, int size);
+
+void generate_pos_neg_fs2_s16(short* output, int size);
+void generate_dc_pos_neg_fs2_s16(short* output, int size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/pf_cic.cpp b/pf_cic.cpp
new file mode 100644
index 0000000..34b90c5
--- /dev/null
+++ b/pf_cic.cpp
@@ -0,0 +1,252 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* include own header first, to see missing includes */
+#include "pf_cic.h"
+#include "fmv.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+
+/*
+ ____ ___ ____ ____ ____ ____
+ / ___|_ _/ ___| | _ \| _ \ / ___|
+ | | | | | | | | | | | | |
+ | |___ | | |___ | |_| | |_| | |___
+ \____|___\____| |____/|____/ \____|
+*/
+
#define SINESHIFT 12
#define SINESIZE (1<<SINESHIFT)   /* number of table entries per full oscillator period */
typedef int64_t cic_dt; // data type used for integrators and combs
/* State of one CIC digital down-converter:
   table-driven complex mixer followed by a 3rd-order CIC decimator. */
typedef struct {
    int factor;        // decimation factor: input samples per output sample
    uint64_t phase;    // oscillator phase accumulator; full uint64 range == one turn
    float gain;        // output scale compensating table amplitude and integrator gain
    cic_dt ig0a, ig0b, ig1a, ig1b;          // integrator states ('a'/'b' = the two mixer branches)
    cic_dt comb0a, comb0b, comb1a, comb1b;  // comb (differentiator) states
    int16_t *sinetable; // cosine table, SINESIZE*5/4 entries (25% extra so the quadrature value comes from the same table)
} cicddc_t;
+
+void *cicddc_init(int factor) {
+ int i;
+ int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table
+ cicddc_t *s;
+ s = (cicddc_t *)malloc(sizeof(cicddc_t));
+ memset(s, 0, sizeof(cicddc_t));
+
+ float sineamp = 32767.0f;
+ s->factor = factor;
+ s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators
+
+ s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable));
+ double f = 2.0*M_PI / (double)SINESIZE;
+ for(i = 0; i < sinesize2; i++) {
+ s->sinetable[i] = sineamp * cos(f * i);
+ }
+ return s;
+}
+
+void cicddc_free(void *state) {
+ cicddc_t *s = (cicddc_t *)state;
+ free(s->sinetable);
+ free(s);
+}
+
+
PF_TARGET_CLONES
/* Mix real int16 input with a complex oscillator at normalized frequency
   'rate', then decimate by s->factor with a 3rd-order CIC.
   Consumes outsize*factor input samples, produces outsize complex samples.
   (Last integrator + first comb are fused into the inner-loop sum.) */
void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    /* keep all filter state in locals for the duration of the call */
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* normalized frequency -> 64-bit phase increment (2^64 == one full turn);
       assumes rate is within the representable turn range -- TODO confirm caller range */
    freq = rate * ((float)(1ULL << 63) * 2);

    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            /* table index = top SINESHIFT bits of phase; the +SINESIZE/4 offset
               reads the second quadrature component from the same cosine table */
            int sinep = phase >> (64-SINESHIFT);
            in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep];
            phase += freq;
            /* integrators:
               The calculations are ordered so that each integrator
               takes a result from previous loop iteration
               to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += factor;
        // comb filters (differentiators at the decimated rate):
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;

        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }

    /* write filter state back for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
+
PF_TARGET_CLONES
/* Same as cicddc_s16_c, but for interleaved complex int16 input:
   complex-mix each input sample with the oscillator, then CIC-decimate.
   Consumes outsize*factor complex input samples. */
void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    /* keep all filter state in locals for the duration of the call */
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* normalized frequency -> 64-bit phase increment (2^64 == one full turn) */
    freq = rate * ((float)(1ULL << 63) * 2);

    int16_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            int sinep = phase >> (64-SINESHIFT);
            m_a = inp[2*i];     // input real part
            m_b = inp[2*i+1];   // input imaginary part
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication (products stay within int32: 2*32767^2 < INT32_MAX):
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
               The calculations are ordered so that each integrator
               takes a result from previous loop iteration
               to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters (differentiators at the decimated rate):
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;

        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }

    /* write filter state back for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
+
+
/* This is almost copy paste from cicddc_cs16_c.
   I'm afraid this is going to be annoying to maintain... */
PF_TARGET_CLONES
/* Same as cicddc_cs16_c, but for interleaved complex uint8 input
   (e.g. rtl-sdr): samples are scaled to ~int16 range and bias-corrected
   before mixing and CIC decimation. */
void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) {
    cicddc_t *s = (cicddc_t *)state;
    int k;
    int factor = s->factor;
    /* keep all filter state in locals for the duration of the call */
    cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b;
    cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b;
    uint64_t phase = s->phase, freq;
    int16_t *sinetable = s->sinetable;
    float gain = s->gain;

    /* normalized frequency -> 64-bit phase increment (2^64 == one full turn) */
    freq = rate * ((float)(1ULL << 63) * 2);

    uint8_t *inp = input;
    for(k = 0; k < outsize; k++) {
        int i;
        cic_dt out0a, out0b, out1a, out1b;
        cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum
        for(i = 0; i < factor; i++) {
            cic_dt in_a, in_b;
            int32_t m_a, m_b, m_c, m_d;
            int sinep = phase >> (64-SINESHIFT);
            // scale to 16-bit range and subtract 127.4*256=32614 DC bias (good for rtl-sdr)
            m_a = (((int32_t)inp[2*i]) << 8) - 32614;
            m_b = (((int32_t)inp[2*i+1]) << 8) - 32614;
            m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))];
            m_d = (int32_t)sinetable[sinep];
            // complex multiplication:
            in_a = m_a*m_c - m_b*m_d;
            in_b = m_a*m_d + m_b*m_c;
            phase += freq;
            /* integrators:
               The calculations are ordered so that each integrator
               takes a result from previous loop iteration
               to make the code more "pipeline-friendly". */
            ig2a += ig1a; ig2b += ig1b;
            ig1a += ig0a; ig1b += ig0b;
            ig0a += in_a; ig0b += in_b;
        }
        inp += 2*factor;
        // comb filters (differentiators at the decimated rate):
        out0a = ig2a - comb0a; out0b = ig2b - comb0b;
        comb0a = ig2a; comb0b = ig2b;
        out1a = out0a - comb1a; out1b = out0b - comb1b;
        comb1a = out0a; comb1b = out0b;

        output[k].i = (float)out1a * gain;
        output[k].q = (float)out1b * gain;
    }

    /* write filter state back for the next call */
    s->ig0a = ig0a; s->ig0b = ig0b;
    s->ig1a = ig1a; s->ig1b = ig1b;
    s->comb0a = comb0a; s->comb0b = comb0b;
    s->comb1a = comb1a; s->comb1b = comb1b;
    s->phase = phase;
}
+
diff --git a/pf_cic.h b/pf_cic.h
new file mode 100644
index 0000000..681ee4f
--- /dev/null
+++ b/pf_cic.h
@@ -0,0 +1,58 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ ____ ___ ____ ____ ____ ____
+ / ___|_ _/ ___| | _ \| _ \ / ___|
+ | | | | | | | | | | | | |
+ | |___ | | |___ | |_| | |_| | |___
+ \____|___\____| |____/|____/ \____|
+*/
+
+typedef struct complexf_s { float i; float q; } complexf;
+
+void *cicddc_init(int factor);
+void cicddc_free(void *state);
+void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
+void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate);
+void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate);
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/pf_mixer.cpp b/pf_mixer.cpp
new file mode 100644
index 0000000..570ffaf
--- /dev/null
+++ b/pf_mixer.cpp
@@ -0,0 +1,750 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* include own header first, to see missing includes */
+#include "pf_mixer.h"
+#include "fmv.h"
+
+//#include <stdio.h>
+//#include <time.h>
+#include <math.h>
+#include <stdlib.h>
+//#include <string.h>
+//#include <unistd.h>
+//#include <limits.h>
+//#include <assert.h>
+//#include <stdarg.h>
+
+//they dropped M_PI in C99, so we define it:
+#define PI ((float)3.14159265358979323846)
+
+//apply to pointers:
+#define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)))
+#define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1))
+
+
+/*
+ _____ _____ _____ __ _ _
+ | __ \ / ____| __ \ / _| | | (_)
+ | | | | (___ | |__) | | |_ _ _ _ __ ___| |_ _ ___ _ __ ___
+ | | | |\___ \| ___/ | _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
+ | |__| |____) | | | | | |_| | | | | (__| |_| | (_) | | | \__ \
+ |_____/|_____/|_| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_|___/
+
+*/
+
+PF_TARGET_CLONES
+float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
+{
+ rate*=2;
+ //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
+ float phase=starting_phase;
+ float phase_increment=rate*PI;
+ float cosval, sinval;
+ for(int i=0;i<input_size; i++)
+ {
+ cosval=cos(phase);
+ sinval=sin(phase);
+ //we multiply two complex numbers.
+ //how? enter this to maxima (software) for explanation:
+ // (a+b*%i)*(c+d*%i), rectform;
+ iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
+ qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
+ phase+=phase_increment;
+ while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
+ while(phase<0) phase+=2*PI;
+ }
+ return phase;
+}
+
+
+
+shift_table_data_t shift_table_init(int table_size)
+{
+ //RTODO
+ shift_table_data_t output;
+ output.table=(float*)malloc(sizeof(float)*table_size);
+ output.table_size=table_size;
+ for(int i=0;i<table_size;i++)
+ {
+ output.table[i]=sin(((float)i/table_size)*(PI/2));
+ }
+ return output;
+}
+
+void shift_table_deinit(shift_table_data_t table_data)
+{
+ free(table_data.table);
+}
+
+
+PF_TARGET_CLONES
+float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
+{
+ //RTODO
+ rate*=2;
+ //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
+ float phase=starting_phase;
+ float phase_increment=rate*PI;
+ float cosval, sinval;
+ for(int i=0;i<input_size; i++) //@shift_math_cc
+ {
+ int sin_index, cos_index, temp_index, sin_sign, cos_sign;
+ //float vphase=fmodf(phase,PI/2); //between 0 and 90deg
+ int quadrant=phase/(PI/2); //between 0 and 3
+ float vphase=phase-quadrant*(PI/2);
+ sin_index=(vphase/(PI/2))*table_data.table_size;
+ cos_index=table_data.table_size-1-sin_index;
+ if(quadrant&1) //in quadrant 1 and 3
+ {
+ temp_index=sin_index;
+ sin_index=cos_index;
+ cos_index=temp_index;
+ }
+ sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
+ cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
+ sinval=sin_sign*table_data.table[sin_index];
+ cosval=cos_sign*table_data.table[cos_index];
+ //we multiply two complex numbers.
+ //how? enter this to maxima (software) for explanation:
+ // (a+b*%i)*(c+d*%i), rectform;
+ iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
+ qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
+ phase+=phase_increment;
+ while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
+ while(phase<0) phase+=2*PI;
+ }
+ return phase;
+}
+
+
+shift_unroll_data_t shift_unroll_init(float rate, int size)
+{
+ shift_unroll_data_t output;
+ output.phase_increment=2*rate*PI;
+ output.size = size;
+ output.dsin=(float*)malloc(sizeof(float)*size);
+ output.dcos=(float*)malloc(sizeof(float)*size);
+ float myphase = 0;
+ for(int i=0;i<size;i++)
+ {
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ output.dsin[i]=sin(myphase);
+ output.dcos[i]=cos(myphase);
+ }
+ return output;
+}
+
+void shift_unroll_deinit(shift_unroll_data_t* d)
+{
+ if (d && d->dsin)
+ {
+ free(d->dsin);
+ d->dsin = NULL;
+ }
+ if (d && d->dcos)
+ {
+ free(d->dcos);
+ d->dcos = NULL;
+ }
+}
+
+PF_TARGET_CLONES
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
+{
+ //input_size should be multiple of 4
+ //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+ float cos_start = cos(starting_phase);
+ float sin_start = sin(starting_phase);
+ register float cos_val, sin_val;
+ for(int i=0;i<input_size; i++) //@shift_unroll_cc
+ {
+ iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i);
+ qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i);
+ // calculate complex phasor for next iteration
+ cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
+ sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
+ }
+ starting_phase+=input_size*d->phase_increment;
+ while(starting_phase>PI) starting_phase-=2*PI;
+ while(starting_phase<-PI) starting_phase+=2*PI;
+ return starting_phase;
+}
+
+PF_TARGET_CLONES
+float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase)
+{
+ float cos_start = cos(starting_phase);
+ float sin_start = sin(starting_phase);
+ register float cos_val, sin_val;
+ for(int i=0;i<size; i++) //@shift_unroll_inp_c
+ {
+ register float inp_i = iof(in_out,i);
+ register float inp_q = qof(in_out,i);
+ iof(in_out,i) = cos_val*inp_i - sin_val*inp_q;
+ qof(in_out,i) = sin_val*inp_i + cos_val*inp_q;
+ // calculate complex phasor for next iteration
+ cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
+ sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
+ }
+ starting_phase += size * d->phase_increment;
+ while(starting_phase>PI) starting_phase-=2*PI;
+ while(starting_phase<-PI) starting_phase+=2*PI;
+ return starting_phase;
+}
+
+
+#define SHIFT_UNROLL_SIZE CSDR_SHIFT_LIMITED_UNROLL_SIZE
+#define SHIFT_LIMITED_SIMD_SZ CSDR_SHIFT_LIMITED_SIMD_SZ
+
+shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
+{
+ shift_limited_unroll_data_t output;
+ output.phase_increment=2*rate*PI;
+ float myphase = 0;
+ for(int i=0;i<SHIFT_UNROLL_SIZE;i++)
+ {
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ output.dcos[i] = cos(myphase);
+ output.dsin[i] = sin(myphase);
+ }
+ output.complex_phase.i = 1.0F;
+ output.complex_phase.q = 0.0F;
+ return output;
+}
+
+PF_TARGET_CLONES
+void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d)
+{
+ float cos_start = d->complex_phase.i;
+ float sin_start = d->complex_phase.q;
+ register float cos_val = cos_start, sin_val = sin_start;
+ while (size > 0) //@shift_limited_unroll_cc
+ {
+ int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size;
+ for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ )
+ {
+ for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
+ {
+ iof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j);
+ qof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j);
+ // calculate complex phasor for next iteration
+ cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
+ sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
+ }
+ }
+ input += SHIFT_UNROLL_SIZE;
+ output += SHIFT_UNROLL_SIZE;
+ size -= SHIFT_UNROLL_SIZE;
+ }
+ d->complex_phase.i = cos_val;
+ d->complex_phase.q = sin_val;
+}
+
+PF_TARGET_CLONES
+void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d)
+{
+ float inp_i[SHIFT_LIMITED_SIMD_SZ];
+ float inp_q[SHIFT_LIMITED_SIMD_SZ];
+ float cos_start = d->complex_phase.i;
+ float sin_start = d->complex_phase.q;
+ register float cos_val = cos_start, sin_val = sin_start;
+ while (size > 0) //@shift_limited_unroll_inp_c
+ {
+ int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size;
+ for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ )
+ {
+ for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
+ inp_i[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].i;
+ for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
+ inp_q[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].q;
+ for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++)
+ {
+ iof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
+ qof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
+ // calculate complex phasor for next iteration
+ cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
+ sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
+ }
+ }
+ in_out += SHIFT_UNROLL_SIZE;
+ size -= SHIFT_UNROLL_SIZE;
+ }
+ d->complex_phase.i = cos_val;
+ d->complex_phase.q = sin_val;
+}
+
+
+#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+
+#if defined(__GNUC__)
+# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+# define RESTRICT __restrict
+#elif defined(_MSC_VER)
+# define ALWAYS_INLINE(return_type) __forceinline return_type
+# define RESTRICT __restrict
+#endif
+
+# define SIMD_SZ 4
+
+typedef union v4_union {
+ __m128 v;
+ float f[4];
+} v4_union;
+
+#define VMUL(a,b) _mm_mul_ps(a,b)
+#define VADD(a,b) _mm_add_ps(a,b)
+#define VSUB(a,b) _mm_sub_ps(a,b)
+#define LD_PS1(s) _mm_set1_ps(s)
+#define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr))
+#define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr))
+#define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v)
+#define VSTORE_ALIGNED(ptr, v) _mm_store_ps((float*)(ptr), v)
+#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+
+/* num_cplx must be multiple of 4 ! */
+void aligned_vec_complex_mul(int num_cplx, float *dest_a_vec, const float *b_vec)
+{
+ __m128 inp_re, inp_im;
+ __m128 inout_re, inout_im;
+ __m128 product_re, product_im;
+ __m128 interl_prod_a, interl_prod_b;
+ __m128 * RESTRICT u = (__m128*)dest_a_vec;
+ const __m128 * RESTRICT v = (const __m128*)b_vec;
+ const int L = num_cplx / 4;
+ int k;
+ for (k=0; k < L; ++k)
+ {
+ UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inout_re, inout_im); /* inout_re = all reals; inout_im = all imags */
+ UNINTERLEAVE2(VLOAD_ALIGNED(v), VLOAD_ALIGNED(v+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
+ product_re = VSUB( VMUL(inout_re, inp_re), VMUL(inout_im, inp_im) );
+ product_im = VADD( VMUL(inout_im, inp_re), VMUL(inout_re, inp_im) );
+ INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+ VSTORE_ALIGNED(u, interl_prod_a);
+ VSTORE_ALIGNED(u+1, interl_prod_b);
+ u += 2;
+ v += 2;
+ }
+}
+
+shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad)
+{
+ shift_limited_unroll_sse_data_t output;
+ float myphase;
+
+ output.phase_increment = 2*relative_freq*PI;
+
+ myphase = 0.0F;
+ for (int i = 0; i < SHIFT_UNROLL_SIZE + SHIFT_LIMITED_SIMD_SZ; i++)
+ {
+ output.dcos[i] = cos(myphase);
+ output.dsin[i] = sin(myphase);
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ }
+
+ output.dcos_blk = 0.0F;
+ output.dsin_blk = 0.0F;
+
+ myphase = phase_start_rad;
+ for (int i = 0; i < SHIFT_LIMITED_SIMD_SZ; i++)
+ {
+ output.phase_state_i[i] = cos(myphase);
+ output.phase_state_q[i] = sin(myphase);
+ myphase += output.phase_increment;
+ while(myphase>PI) myphase-=2*PI;
+ while(myphase<-PI) myphase+=2*PI;
+ }
+ return output;
+}
+
+
+
/* In-place SSE frequency shift: 4 phasor lanes rotate 4 complex samples at a
   time; lanes are recomputed from the chunk-start phasors via the trig tables
   and re-normalized after every chunk so their magnitude cannot drift. */
void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d)
{
    const __m128 cos_starts = VLOAD_ALIGNED( &d->phase_state_i[0] );
    const __m128 sin_starts = VLOAD_ALIGNED( &d->phase_state_q[0] );
    __m128 cos_vals = cos_starts;
    __m128 sin_vals = sin_starts;
    __m128 inp_re, inp_im;
    __m128 product_re, product_im;
    __m128 interl_prod_a, interl_prod_b;
    __m128 * RESTRICT p_trig_cos_tab;
    __m128 * RESTRICT p_trig_sin_tab;
    __m128 * RESTRICT u = (__m128*)in_out;

    while (N_cplx)
    {
        const int NB = (N_cplx >= CSDR_SHIFT_LIMITED_UNROLL_SIZE) ? CSDR_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
        int B = NB;
        p_trig_cos_tab = (__m128*)( &d->dcos[0] );
        p_trig_sin_tab = (__m128*)( &d->dsin[0] );
        /* NOTE(review): with B starting at NB and stepping by 4, "B >= 0" runs
           NB/4 + 1 iterations, i.e. touches NB+4 complex samples per pass while
           only NB are subtracted from N_cplx. The trig tables do carry
           SHIFT_LIMITED_SIMD_SZ extra entries, so the table reads stay in
           range, but the sample pointer advances past NB -- TODO confirm the
           intended iteration count / required buffer padding. */
        while (B >= 0)
        {
            // complex multiplication of 4 complex values from/to in_out[]
            // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
            UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */
            product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
            product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
            INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
            VSTORE_ALIGNED(u, interl_prod_a);
            VSTORE_ALIGNED(u+1, interl_prod_b);
            u += 2;
            // calculate complex phasor for next iteration
            // cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
            // sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j];
            // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
            inp_re = VLOAD_ALIGNED(p_trig_cos_tab);
            inp_im = VLOAD_ALIGNED(p_trig_sin_tab);
            cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
            sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
            ++p_trig_cos_tab;
            ++p_trig_sin_tab;
            B -= 4;
        }
        N_cplx -= NB;
        /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
        /* re-use product_re[]/product_im[] for normalization */
        /* _mm_rsqrt_ps is an approximation (~12 bit); adequate here since it is
           only used to renormalize unit-magnitude phasors -- TODO confirm */
        product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
        product_im = _mm_rsqrt_ps(product_re);
        cos_vals = VMUL(cos_vals, product_im);
        sin_vals = VMUL(sin_vals, product_im);
    }
    VSTORE_ALIGNED( &d->phase_state_i[0], cos_vals );
    VSTORE_ALIGNED( &d->phase_state_q[0], sin_vals );
}
+
+#endif
+
+
+shift_addfast_data_t shift_addfast_init(float rate)
+{
+ shift_addfast_data_t output;
+ output.phase_increment=2*rate*PI;
+ for(int i=0;i<4;i++)
+ {
+ output.dsin[i]=sin(output.phase_increment*(i+1));
+ output.dcos[i]=cos(output.phase_increment*(i+1));
+ }
+ return output;
+}
+
+#if defined NEON_OPTS && defined__arm__
+#pragma message "Manual NEON (arm32) optimizations are ON: we have a faster shift_addfast_cc now."
+
+// #define HAVE_ADDFAST_CC_IMPL
+// removed cpu specific implementation
+
+#elif defined NEON_OPTS && defined __aarch64__
+#pragma message "Manual NEON (aarch64) optimizations are ON: we have a faster shift_addfast_cc now."
+
+// #define HAVE_ADDFAST_CC_IMPL
+// removed cpu specific implementation
+
+#endif
+
+#ifndef HAVE_ADDFAST_CC_IMPL
+
+#define SADF_L1(j) cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
+ sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
+#define SADF_L2(j) iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
+ qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
+
+PF_TARGET_CLONES
+float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
+{
+ //input_size should be multiple of 4
+ //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+ float cos_start=cos(starting_phase);
+ float sin_start=sin(starting_phase);
+ float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
+ sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
+ dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
+ dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
+
+ for(int i=0;i<input_size/4; i++) //@shift_addfast_cc
+ {
+ SADF_L1(0)
+ SADF_L1(1)
+ SADF_L1(2)
+ SADF_L1(3)
+ SADF_L2(0)
+ SADF_L2(1)
+ SADF_L2(2)
+ SADF_L2(3)
+ cos_start = cos_vals_3;
+ sin_start = sin_vals_3;
+ }
+ starting_phase+=input_size*d->phase_increment;
+ while(starting_phase>PI) starting_phase-=2*PI;
+ while(starting_phase<-PI) starting_phase+=2*PI;
+ return starting_phase;
+}
+
+#endif
+
+
+
+#define SHIFT_REC_SIMD_SZ CSDR_SHIFT_RECURSIVE_SIMD_SZ
+
+void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state)
+{
+ // constants for single phase step
+ float phase_increment_s = rate*PI;
+ float k1 = tan(0.5*phase_increment_s);
+ float k2 = 2*k1 /(1 + k1 * k1);
+ for (int j=1; j<SHIFT_REC_SIMD_SZ; j++)
+ {
+ float tmp;
+ state->u_cos[j] = state->u_cos[j-1];
+ state->v_sin[j] = state->v_sin[j-1];
+ // small steps
+ tmp = state->u_cos[j] - k1 * state->v_sin[j];
+ state->v_sin[j] += k2 * tmp;
+ state->u_cos[j] = tmp - k1 * state->v_sin[j];
+ }
+
+ // constants for SHIFT_REC_SIMD_SZ times phase step
+ float phase_increment_b = phase_increment_s * SHIFT_REC_SIMD_SZ;
+ while(phase_increment_b > PI) phase_increment_b-=2*PI;
+ while(phase_increment_b < -PI) phase_increment_b+=2*PI;
+ conf->k1 = tan(0.5*phase_increment_b);
+ conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
+}
+
+void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state)
+{
+ if (starting_phase != 0.0F)
+ {
+ state->u_cos[0] = cos(starting_phase);
+ state->v_sin[0] = sin(starting_phase);
+ }
+ else
+ {
+ state->u_cos[0] = 1.0F;
+ state->v_sin[0] = 0.0F;
+ }
+ shift_recursive_osc_update_rate(rate, conf, state);
+}
+
+
+PF_TARGET_CLONES
+void shift_recursive_osc_cc(const complexf *input, complexf* output,
+ int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
+{
+ float tmp[SHIFT_REC_SIMD_SZ];
+ float inp_i[SHIFT_REC_SIMD_SZ];
+ float inp_q[SHIFT_REC_SIMD_SZ];
+ shift_recursive_osc_t state = *state_ext;
+ const float k1 = conf->k1;
+ const float k2 = conf->k2;
+ for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_cc
+ {
+ //we multiply two complex numbers - similar to shift_math_cc
+ for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ {
+ inp_i[j] = input[SHIFT_REC_SIMD_SZ*i+j].i;
+ inp_q[j] = input[SHIFT_REC_SIMD_SZ*i+j].q;
+ }
+ for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ {
+ iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+ qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+ }
+ // update complex phasor - like incrementing phase
+ for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+ for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ state.v_sin[j] += k2 * tmp[j];
+ for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+ state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+ }
+ *state_ext = state;
+}
+
+/* In-place variant of shift_recursive_osc_cc(): mixes in_out with the
+ * recursive quadrature oscillator, overwriting the input buffer.
+ * Input samples are staged into local inp_i/inp_q arrays first, so
+ * reading and writing the same buffer is safe.
+ * NOTE(review): as in the out-of-place variant, a trailing remainder of
+ * size % SHIFT_REC_SIMD_SZ samples is left unprocessed.
+ * Oscillator phase is carried across calls via *state_ext. */
+PF_TARGET_CLONES
+void shift_recursive_osc_inp_c(complexf* in_out,
+    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
+{
+  float tmp[SHIFT_REC_SIMD_SZ];
+  float inp_i[SHIFT_REC_SIMD_SZ];
+  float inp_q[SHIFT_REC_SIMD_SZ];
+  shift_recursive_osc_t state = *state_ext;
+  const float k1 = conf->k1;
+  const float k2 = conf->k2;
+  for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
+  {
+    // stage inputs so the in-place writes below don't clobber them
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+    {
+      inp_i[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].i;
+      inp_q[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].q;
+    }
+    //we multiply two complex numbers - similar to shift_math_cc
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+    {
+      iof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+      qof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+    }
+    // update complex phasor - like incrementing phase
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+      tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+      state.v_sin[j] += k2 * tmp[j];
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+      state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+  }
+  // persist phase for the next call
+  *state_ext = state;
+}
+
+/* Generate a pure complex carrier (no input mixing): writes the current
+ * oscillator phasor (u_cos, v_sin) to output and advances it by the
+ * k1/k2 recurrence, SHIFT_REC_SIMD_SZ samples per iteration.
+ * NOTE(review): as with the shift_* variants, only
+ * (size / SHIFT_REC_SIMD_SZ) * SHIFT_REC_SIMD_SZ samples are generated.
+ * Phase is carried across calls via *state_ext. */
+PF_TARGET_CLONES
+void gen_recursive_osc_c(complexf* output,
+    int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
+{
+  float tmp[SHIFT_REC_SIMD_SZ];
+  shift_recursive_osc_t state = *state_ext;
+  const float k1 = conf->k1;
+  const float k2 = conf->k2;
+  for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@gen_recursive_osc_c
+  {
+    // output complex oscillator value
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+    {
+      iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j];
+      qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j];
+    }
+    // update complex phasor - like incrementing phase
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+      tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+      state.v_sin[j] += k2 * tmp[j];
+    for (int j=0;j<SHIFT_REC_SIMD_SZ;j++)
+      state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+  }
+  // persist phase for the next call
+  *state_ext = state;
+}
+
+
+#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+
+/* Prepare the SSE oscillator for a (new) rate:
+ * 1) derive single-step constants k1 = tan(phi/2), k2 = 2*k1/(1+k1^2)
+ *    for phase step phi = rate*PI (rate is relative to Nyquist here);
+ * 2) stagger lanes 1..3 of the state so lane j lags lane j-1 by exactly
+ *    one single step — the 4 lanes then hold 4 consecutive phases;
+ * 3) store in *conf the k1/k2 for a 4-sample stride (phase step * 4),
+ *    wrapped into [-PI, PI] so tan(0.5*phi) stays well-conditioned.
+ * Lane 0 of *state must already hold a valid phasor (see _init). */
+void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
+{
+  // constants for single phase step
+  float phase_increment_s = rate*PI;
+  float k1 = tan(0.5*phase_increment_s);
+  float k2 = 2*k1 /(1 + k1 * k1);
+  for (int j=1; j<CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
+  {
+    float tmp;
+    // copy previous lane, then advance it by one single step
+    state->u_cos[j] = state->u_cos[j-1];
+    state->v_sin[j] = state->v_sin[j-1];
+    // small steps
+    tmp = state->u_cos[j] - k1 * state->v_sin[j];
+    state->v_sin[j] += k2 * tmp;
+    state->u_cos[j] = tmp - k1 * state->v_sin[j];
+  }
+
+  // constants for CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
+  float phase_increment_b = phase_increment_s * CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ;
+  while(phase_increment_b > PI) phase_increment_b-=2*PI;
+  while(phase_increment_b < -PI) phase_increment_b+=2*PI;
+  conf->k1 = tan(0.5*phase_increment_b);
+  conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
+}
+
+
+/* Initialize the SSE recursive oscillator: seed lane 0 of the phasor with
+ * the starting phase (exact (1, 0) fast path for phase 0), then let
+ * shift_recursive_osc_sse_update_rate() stagger lanes 1..3 and compute
+ * the 4-sample-stride constants k1/k2 in *conf. */
+void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
+{
+  if (starting_phase != 0.0F)
+  {
+    state->u_cos[0] = cos(starting_phase);
+    state->v_sin[0] = sin(starting_phase);
+  }
+  else
+  {
+    state->u_cos[0] = 1.0F;
+    state->v_sin[0] = 0.0F;
+  }
+  // fills lanes 1..3 and sets conf->k1/k2 for the 4-wide stride
+  shift_recursive_osc_sse_update_rate(rate, conf, state);
+}
+
+
+/* SSE in-place mixer: multiplies 4 complex samples per iteration by the
+ * 4-lane phasor (u_cos + j*v_sin) and advances all lanes at once using the
+ * 4-sample-stride k1/k2 from *conf.
+ * VLOAD_ALIGNED/UNINTERLEAVE2/INTERLEAVE2/VMUL/... are SIMD macros defined
+ * elsewhere in the project (not visible here — presumably pffft's SSE
+ * wrappers); UNINTERLEAVE2 splits two interleaved I/Q vectors into an
+ * all-real and an all-imag vector, INTERLEAVE2 merges them back.
+ * Requirements visible from the code: in_out and state_ext must satisfy
+ * the alignment implied by the *_ALIGNED macros (16 bytes for SSE), and
+ * NOTE(review): N_cplx is decremented by 4 with a `while (N_cplx)` test —
+ * a count that is not a multiple of 4 never reaches 0 (buffer overrun /
+ * infinite loop). Callers must pass a multiple of 4.
+ * The advanced phasor is stored back so phase carries across calls. */
+void shift_recursive_osc_sse_inp_c(complexf* in_out,
+    int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
+{
+  const __m128 k1 = LD_PS1( conf->k1 );
+  const __m128 k2 = LD_PS1( conf->k2 );
+  __m128 u_cos = VLOAD_ALIGNED( &state_ext->u_cos[0] );
+  __m128 v_sin = VLOAD_ALIGNED( &state_ext->v_sin[0] );
+  __m128 inp_re, inp_im;
+  __m128 product_re, product_im;
+  __m128 interl_prod_a, interl_prod_b;
+  __m128 * RESTRICT u = (__m128*)in_out;
+
+  while (N_cplx)
+  {
+    //inp_i[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
+    //inp_q[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
+    UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
+
+    //we multiply two complex numbers - similar to shift_math_cc
+    //iof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
+    //qof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
+    product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
+    product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
+    INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
+    VSTORE_ALIGNED(u, interl_prod_a);
+    VSTORE_ALIGNED(u+1, interl_prod_b);
+    u += 2;
+
+    // update complex phasor - like incrementing phase
+    // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
+    product_re = VSUB( u_cos, VMUL(k1, v_sin) );
+    // state.v_sin[j] += k2 * tmp[j];
+    v_sin = VADD( v_sin, VMUL(k2, product_re) );
+    // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
+    u_cos = VSUB( product_re, VMUL(k1, v_sin) );
+
+    N_cplx -= 4;
+  }
+  // persist the advanced 4-lane phasor for the next call
+  VSTORE_ALIGNED( &state_ext->u_cos[0], u_cos );
+  VSTORE_ALIGNED( &state_ext->v_sin[0], v_sin );
+}
+
+#endif
+
diff --git a/pf_mixer.h b/pf_mixer.h
new file mode 100644
index 0000000..ff647b3
--- /dev/null
+++ b/pf_mixer.h
@@ -0,0 +1,177 @@
+/*
+This software is part of pffft/pfdsp, a set of simple DSP routines.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ _____ _
+ / ____| | |
+ | | ___ _ __ ___ _ __ | | _____ __
+ | | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ /
+ | |___| (_) | | | | | | |_) | | __/> <
+ \_____\___/|_| |_| |_| .__/|_|\___/_/\_\
+ | |
+ |_|
+*/
+
+/* complex sample: .i = in-phase (real), .q = quadrature (imaginary) */
+typedef struct complexf_s { float i; float q; } complexf;
+
+// =================================================================================
+
+/* straightforward sin()/cos() mixer; returns the phase to pass as
+ * starting_phase on the next call */
+float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase);
+
+
+/* mixer using a precomputed trig lookup table of table_size entries */
+typedef struct shift_table_data_s
+{
+    float* table;
+    int table_size;
+} shift_table_data_t;
+void shift_table_deinit(shift_table_data_t table_data);
+shift_table_data_t shift_table_init(int table_size);
+/* returns the phase to pass as starting_phase on the next call */
+float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase);
+
+
+/* mixer with 4 precomputed per-sample sin/cos increments
+ * (angle-addition based; see implementation for details) */
+typedef struct shift_addfast_data_s
+{
+    float dsin[4];
+    float dcos[4];
+    float phase_increment;
+} shift_addfast_data_t;
+shift_addfast_data_t shift_addfast_init(float rate);
+float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
+
+
+/* mixer with a heap-allocated table of `size` precomputed sin/cos deltas;
+ * free with shift_unroll_deinit() */
+typedef struct shift_unroll_data_s
+{
+    float* dsin;
+    float* dcos;
+    float phase_increment;
+    int size;
+} shift_unroll_data_t;
+shift_unroll_data_t shift_unroll_init(float rate, int size);
+void shift_unroll_deinit(shift_unroll_data_t* d);
+/* return the phase to pass as starting_phase on the next call;
+ * _inp_c is the in-place variant */
+float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase);
+float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase);
+
+
+/* similar to shift_unroll_cc() - but, have fixed and limited precalc size
+ * idea: smaller cache usage by table
+ * size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ (= 4)
+ */
+#define CSDR_SHIFT_LIMITED_UNROLL_SIZE 64
+#define CSDR_SHIFT_LIMITED_SIMD_SZ 4
+typedef struct shift_limited_unroll_data_s
+{
+    /* fixed-size trig delta tables */
+    float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE];
+    float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE];
+    /* running phasor - carries phase across calls */
+    complexf complex_phase;
+    float phase_increment;
+} shift_limited_unroll_data_t;
+shift_limited_unroll_data_t shift_limited_unroll_init(float rate);
+/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */
+/* starting_phase for next call is kept internal in state */
+void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d);
+void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d);
+
+
+/* SSE variant of the limited-unroll mixer; state layout is SIMD-friendly
+ * (separate I and Q lanes instead of interleaved complexf) */
+typedef struct shift_limited_unroll_sse_data_s
+{
+    /* small/limited trig table */
+    float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ];
+    float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ];
+    /* 4 times complex phase */
+    float phase_state_i[CSDR_SHIFT_LIMITED_SIMD_SZ];
+    float phase_state_q[CSDR_SHIFT_LIMITED_SIMD_SZ];
+    /* N_cplx_per_block times increment - for future parallel variants */
+    float dcos_blk;
+    float dsin_blk;
+    /* */
+    float phase_increment;
+} shift_limited_unroll_sse_data_t;
+shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad);
+void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d);
+
+
+
+/* Recursive Quadrature Oscillator functions "recursive_osc"
+ * see https://www.vicanek.de/articles/QuadOsc.pdf
+ */
+#define CSDR_SHIFT_RECURSIVE_SIMD_SZ 8
+/* oscillator state: per-lane phasor (u_cos = cos, v_sin = sin);
+ * lanes hold consecutive phases so blocks can be processed SIMD-wise */
+typedef struct shift_recursive_osc_s
+{
+    float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SZ];
+    float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SZ];
+} shift_recursive_osc_t;
+
+/* rate-dependent recurrence constants: k1 = tan(phi/2), k2 = 2*k1/(1+k1^2)
+ * for the per-block phase step phi */
+typedef struct shift_recursive_osc_conf_s
+{
+    float k1;
+    float k2;
+} shift_recursive_osc_conf_t;
+
+void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state);
+void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+
+/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */
+/* starting_phase for next call is kept internal in state */
+void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+/* NOTE(review): parameter is named `output` here but `in_out` in the
+ * implementation — it is read AND written (in-place mixer) */
+void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+/* carrier generator: writes the oscillator output itself */
+void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state);
+
+/* SSE (x86-only) variant: 4 lanes, processes 4 complex samples per step;
+ * N_cplx must be a multiple of CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ */
+#define CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ 4
+typedef struct shift_recursive_osc_sse_s
+{
+    float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+    float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ];
+} shift_recursive_osc_sse_t;
+
+typedef struct shift_recursive_osc_sse_conf_s
+{
+    float k1;
+    float k2;
+} shift_recursive_osc_sse_conf_t;
+
+void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state);
+void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state);
+void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext);
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/use_gcc8.inc b/use_gcc8.inc
index c4535f1..c4535f1 100755..100644
--- a/use_gcc8.inc
+++ b/use_gcc8.inc