diff options
author | hayati ayguen <h_ayguen@web.de> | 2020-11-14 15:12:19 +0100 |
---|---|---|
committer | hayati ayguen <h_ayguen@web.de> | 2020-11-14 15:12:50 +0100 |
commit | d64eea5c6abe2c81f37b4e5932d15276fe85d09a (patch) | |
tree | 53fd06bbe82afe8da2107e35188cfd9db7359f81 | |
parent | 2587d835de992cabfb653bfed9db80e499df4449 (diff) | |
download | pffft-d64eea5c6abe2c81f37b4e5932d15276fe85d09a.tar.gz |
added mixer algorithm variants, cleaned up code, enabled unaligned arrays
-rw-r--r-- | bench_mixers.c | 320 | ||||
-rw-r--r-- | pf_mixer.cpp | 726 | ||||
-rw-r--r-- | pf_mixer.h | 142 |
3 files changed, 930 insertions, 258 deletions
diff --git a/bench_mixers.c b/bench_mixers.c index df80818..5b22b3f 100644 --- a/bench_mixers.c +++ b/bench_mixers.c @@ -22,37 +22,102 @@ #endif #define BENCH_REF_TRIG_FUNC 1 -#define BENCH_OUT_OF_PLACE_ALGOS 1 +#define BENCH_OUT_OF_PLACE_ALGOS 0 #define BENCH_INPLACE_ALGOS 1 +#define SAVE_BY_DEFAULT 0 +#define SAVE_LIMIT_MSPS 16 + +#if 0 + #define BENCH_FILE_SHIFT_MATH_CC "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin" + #define BENCH_FILE_ADD_FAST_CC "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin" + #define BENCH_FILE_ADD_FAST_INP_C "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin" + #define BENCH_FILE_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_INP_C "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin" + #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin" + #define BENCH_FILE_REC_OSC_CC "" + #define BENCH_FILE_REC_OSC_INP_C "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin" + #define BENCH_FILE_REC_OSC_SSE_INP_C "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin" +#else + #define BENCH_FILE_SHIFT_MATH_CC "" + #define BENCH_FILE_ADD_FAST_CC "" + #define BENCH_FILE_ADD_FAST_INP_C "" + #define BENCH_FILE_UNROLL_INP_C "" + #define BENCH_FILE_LTD_UNROLL_INP_C "" + #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C "" + #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C "" + #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C "" + #define BENCH_FILE_REC_OSC_CC "" + #define BENCH_FILE_REC_OSC_INP_C "" + #define BENCH_FILE_REC_OSC_SSE_INP_C "" +#endif + + #if defined(HAVE_SYS_TIMES) static double ttclk = 0.; - static double uclock_sec(void) + static double uclock_sec(int find_start) { - struct tms t; + struct tms t0, t; if (ttclk == 0.) + { ttclk = sysconf(_SC_CLK_TCK); + fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk); + } times(&t); + if (find_start) + { + t0 = t; + while (t0.tms_utime == t.tms_utime) + times(&t); + } /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */ return ((double)t.tms_utime) / ttclk; } -# else - double uclock_sec(void) + +#elif 0 + // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes + double uclock_sec(int find_start) + { + FILETIME a, b, c, d; + if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0) + { + // Returns total user time. + // Can be tweaked to include kernel times as well. + return + (double)(d.dwLowDateTime | + ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001; + } + else { + // Handle error + return 0; + } + } + +#else + double uclock_sec(int find_start) { return (double)clock()/(double)CLOCKS_PER_SEC; } #endif void save(complexf * d, int B, int N, const char * fn) { - if (!fn) + if (!fn || !fn[0]) + { + if (! SAVE_BY_DEFAULT) + return; fn = "/dev/shm/bench.bin"; + } FILE * f = fopen(fn, "wb"); if (!f) { fprintf(stderr, "error writing result to %s\n", fn); return; } + if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 ) + N = SAVE_LIMIT_MSPS * 1024 * 1024; for (int off = 0; off + B <= N; off += B) { fwrite(d+off, sizeof(complexf), B, f); @@ -75,21 +140,22 @@ double bench_shift_math_cc(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase); off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(output, B, off, NULL); + save(output, B, off, BENCH_FILE_SHIFT_MATH_CC); + free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -112,7 +178,7 @@ double bench_shift_table_cc(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -120,14 +186,14 @@ double bench_shift_table_cc(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -136,7 +202,6 @@ double bench_shift_table_cc(int B, int N) { double bench_shift_addfast(int B, int N) { double t0, t1, tstop, T, nI; int iter, off; - int table_size=65536; float phase = 0.0F; complexf *input = (complexf *)malloc(N * sizeof(complexf)); complexf *output = (complexf *)malloc(N * sizeof(complexf)); @@ -149,7 +214,7 @@ double bench_shift_addfast(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -157,14 +222,49 @@ double bench_shift_addfast(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(output, B, off, NULL); + save(output, B, off, BENCH_FILE_ADD_FAST_CC); + free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + +double bench_shift_addfast_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_addfast_data_t state = shift_addfast_init(-0.0009F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_addfast_inp_c(input+off, B, &state, phase); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, BENCH_FILE_ADD_FAST_INP_C); + + free(input); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -185,7 +285,7 @@ double bench_shift_unroll_oop(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -193,14 +293,14 @@ double bench_shift_unroll_oop(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -219,7 +319,7 @@ double bench_shift_unroll_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -227,13 +327,14 @@ double bench_shift_unroll_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_UNROLL_INP_C); + free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -254,7 +355,7 @@ double bench_shift_limited_unroll_oop(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -262,14 +363,14 @@ double bench_shift_limited_unroll_oop(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -288,7 +389,7 @@ double bench_shift_limited_unroll_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -296,48 +397,121 @@ double bench_shift_limited_unroll_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C); + free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } -double bench_shift_limited_unroll_sse_inp(int B, int N) { +double bench_shift_limited_unroll_A_sse_inp(int B, int N) { double t0, t1, tstop, T, nI; int iter, off; complexf *input = (complexf *)malloc(N * sizeof(complexf)); shift_recursive_osc_t gen_state; shift_recursive_osc_conf_t gen_conf; - shift_limited_unroll_sse_data_t *state = malloc(sizeof(shift_limited_unroll_sse_data_t)); + shift_limited_unroll_A_sse_data_t *state = malloc(sizeof(shift_limited_unroll_A_sse_data_t)); - *state = shift_limited_unroll_sse_init(-0.0009F, 0.0F); + *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F); shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); gen_recursive_osc_c(input, N, &gen_conf, &gen_state); iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work - shift_limited_unroll_sse_inp_c(input+off, B, state); + shift_limited_unroll_A_sse_inp_c(input+off, B, state); off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C); + free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + +double bench_shift_limited_unroll_B_sse_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_B_sse_data_t *state = malloc(sizeof(shift_limited_unroll_B_sse_data_t)); + + *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_B_sse_inp_c(input+off, B, state); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C); + + free(input); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + +double bench_shift_limited_unroll_C_sse_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_C_sse_data_t *state = malloc(sizeof(shift_limited_unroll_C_sse_data_t)); + + *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(1); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_C_sse_inp_c(input+off, B, state); + + off += B; + ++iter; + t1 = uclock_sec(0); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C); + + free(input); + T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -358,7 +532,7 @@ double bench_shift_rec_osc_cc_oop(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -366,15 +540,16 @@ double bench_shift_rec_osc_cc_oop(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - //save(input, B, off, "/dev/shm/bench_inp.bin"); + save(input, B, off, BENCH_FILE_REC_OSC_CC); + save(output, B, off, NULL); free(input); free(output); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -394,7 +569,7 @@ double bench_shift_rec_osc_cc_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -402,13 +577,13 @@ double bench_shift_rec_osc_cc_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_REC_OSC_INP_C); free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -432,7 +607,7 @@ double bench_shift_rec_osc_sse_c_inp(int B, int N) { iter = 0; off = 0; - t0 = uclock_sec(); + t0 = uclock_sec(1); tstop = t0 + 0.5; /* benchmark duration: 500 ms */ do { // work @@ -440,13 +615,13 @@ double bench_shift_rec_osc_sse_c_inp(int B, int N) { off += B; ++iter; - t1 = uclock_sec(); + t1 = uclock_sec(0); } while ( t1 < tstop && off + B < N ); - save(input, B, off, NULL); + save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C); free(input); - printf("processed %f Msamples\n", off * 1E-6); T = ( t1 - t0 ); /* duration per fft() */ + printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3); nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ return (nI / T); /* normalized iterations per second */ } @@ -460,6 +635,26 @@ int main(int argc, char **argv) // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte) int B = 8 * 1024; int N = 64 * 1024 * 1024; + int showUsage = 0; + + if (argc == 1) + showUsage = 1; + + if (1 < argc) + B = atoi(argv[1]); + if (2 < argc) + N = atoi(argv[2]) * 1024 * 1024; + + if ( !B || !N || showUsage ) + { + fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]); + if ( !B || !N ) + return 0; + } + + fprintf(stderr, "processing up to N = %d MSamples with blocke length of %d samples\n", + N / (1024 * 1024), B ); + #if BENCH_REF_TRIG_FUNC printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n"); @@ -490,18 +685,31 @@ int main(int argc, char **argv) #endif #if BENCH_INPLACE_ALGOS - printf("starting bench of shift_unroll_cc in-place ..\n"); + + printf("starting bench of shift_addfast_inp_c in-place ..\n"); + rt = bench_shift_addfast_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_unroll_inp_c in-place ..\n"); rt = bench_shift_unroll_inp(B, N); printf(" %f MSamples/sec\n\n", rt * 1E-6); - printf("starting bench of shift_limited_unroll_cc in-place ..\n"); + printf("starting bench of shift_limited_unroll_inp_c in-place ..\n"); rt = bench_shift_limited_unroll_inp(B, N); printf(" %f MSamples/sec\n\n", rt * 1E-6); if ( have_sse_shift_mixer_impl() ) { - printf("starting bench of shift_limited_unroll_sse_inp_c in-place ..\n"); - rt = bench_shift_limited_unroll_sse_inp(B, N); + printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_A_sse_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_B_sse_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_C_sse_inp(B, N); printf(" %f MSamples/sec\n\n", rt * 1E-6); } diff --git a/pf_mixer.cpp b/pf_mixer.cpp index 81c78e4..0f2c310 100644 --- a/pf_mixer.cpp +++ b/pf_mixer.cpp @@ -43,6 +43,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i))) #define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1)) +#define USE_ALIGNED_ADDRESSES 0 + + /* _____ _____ _____ __ _ _ @@ -93,10 +96,11 @@ typedef union v4_union { float f[4]; } v4_union; -#define VMUL(a,b) _mm_mul_ps(a,b) -#define VADD(a,b) _mm_add_ps(a,b) -#define VSUB(a,b) _mm_sub_ps(a,b) -#define LD_PS1(s) _mm_set1_ps(s) +#define VMUL(a,b) _mm_mul_ps(a,b) +#define VDIV(a,b) _mm_div_ps(a,b) +#define VADD(a,b) _mm_add_ps(a,b) +#define VSUB(a,b) _mm_sub_ps(a,b) +#define LD_PS1(s) _mm_set1_ps(s) #define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr)) #define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr)) #define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v) @@ -104,6 +108,14 @@ typedef union v4_union { #define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } #define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } +#if USE_ALIGNED_ADDRESSES + #define VLOAD(ptr) _mm_load_ps((const float *)(ptr)) + #define VSTORE(ptr, v) _mm_store_ps((float*)(ptr), v) +#else + #define VLOAD(ptr) _mm_loadu_ps((const float *)(ptr)) + #define VSTORE(ptr, v) _mm_storeu_ps((float*)(ptr), v) +#endif + int have_sse_shift_mixer_impl() { @@ -120,6 +132,12 @@ int have_sse_shift_mixer_impl() #endif +/*********************************************************************/ + +/**************/ +/*** ALGO A ***/ +/**************/ + PF_TARGET_CLONES float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase) { @@ -144,11 +162,14 @@ float shift_math_cc(complexf *input, complexf* output, int input_size, float rat return phase; } +/*********************************************************************/ +/**************/ +/*** ALGO B ***/ +/**************/ shift_table_data_t shift_table_init(int table_size) { - //RTODO shift_table_data_t output; output.table=(float*)malloc(sizeof(float)*table_size); output.table_size=table_size; @@ -168,7 +189,6 @@ void shift_table_deinit(shift_table_data_t table_data) PF_TARGET_CLONES float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase) { - //RTODO rate*=2; //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table. float phase=starting_phase; @@ -177,7 +197,6 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra for(int i=0;i<input_size; i++) //@shift_math_cc { int sin_index, cos_index, temp_index, sin_sign, cos_sign; - //float vphase=fmodf(phase,PI/2); //between 0 and 90deg int quadrant=phase/(PI/2); //between 0 and 3 float vphase=phase-quadrant*(PI/2); sin_index=(vphase/(PI/2))*table_data.table_size; @@ -204,6 +223,112 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra return phase; } +/*********************************************************************/ + +/**************/ +/*** ALGO C ***/ +/**************/ + +shift_addfast_data_t shift_addfast_init(float rate) +{ + shift_addfast_data_t output; + output.phase_increment=2*rate*PI; + for(int i=0;i<4;i++) + { + output.dsin[i]=sin(output.phase_increment*(i+1)); + output.dcos[i]=cos(output.phase_increment*(i+1)); + } + return output; +} + +#define SADF_L1(j) \ + cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \ + sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j; +#define SADF_L2(j) \ + iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \ + qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j); + +PF_TARGET_CLONES +float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float cos_start=cos(starting_phase); + float sin_start=sin(starting_phase); + float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, + sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, + dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], + dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3]; + + for(int i=0;i<input_size/4; i++) + { + SADF_L1(0) + SADF_L1(1) + SADF_L1(2) + SADF_L1(3) + SADF_L2(0) + SADF_L2(1) + SADF_L2(2) + SADF_L2(3) + cos_start = cos_vals_3; + sin_start = sin_vals_3; + } + starting_phase+=input_size*d->phase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + +#undef SADF_L2 + + +#define SADF_L2(j) \ + tmp_inp_cos = iof(in_out,4*i+j); \ + tmp_inp_sin = qof(in_out,4*i+j); \ + iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \ + qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin; + +PF_TARGET_CLONES +float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float cos_start=cos(starting_phase); + float sin_start=sin(starting_phase); + float register tmp_inp_cos, tmp_inp_sin, + cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, + sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, + dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], + dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3]; + + for(int i=0;i<N_cplx/4; i++) + { + SADF_L1(0) + SADF_L1(1) + SADF_L1(2) + SADF_L1(3) + SADF_L2(0) + SADF_L2(1) + SADF_L2(2) + SADF_L2(3) + cos_start = cos_vals_3; + sin_start = sin_vals_3; + } + starting_phase+=N_cplx*d->phase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + +#undef SADF_L1 +#undef SADF_L2 + + +/*********************************************************************/ + +/**************/ +/*** ALGO D ***/ +/**************/ shift_unroll_data_t shift_unroll_init(float rate, int size) { @@ -226,16 +351,12 @@ shift_unroll_data_t shift_unroll_init(float rate, int size) void shift_unroll_deinit(shift_unroll_data_t* d) { - if (d && d->dsin) - { - free(d->dsin); - d->dsin = NULL; - } - if (d && d->dcos) - { - free(d->dcos); - d->dcos = NULL; - } + if (!d) + return; + free(d->dsin); + free(d->dcos); + d->dsin = NULL; + d->dcos = NULL; } PF_TARGET_CLONES @@ -245,8 +366,8 @@ float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_u //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); float cos_start = cos(starting_phase); float sin_start = sin(starting_phase); - register float cos_val, sin_val; - for(int i=0;i<input_size; i++) //@shift_unroll_cc + register float cos_val = cos_start, sin_val = sin_start; + for(int i=0;i<input_size; i++) { iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i); qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i); @@ -265,8 +386,8 @@ float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, flo { float cos_start = cos(starting_phase); float sin_start = sin(starting_phase); - register float cos_val, sin_val; - for(int i=0;i<size; i++) //@shift_unroll_inp_c + register float cos_val = cos_start, sin_val = sin_start; + for(int i=0;i<size; i++) { register float inp_i = iof(in_out,i); register float inp_q = qof(in_out,i); @@ -283,15 +404,18 @@ float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, flo } -#define SHIFT_UNROLL_SIZE CSDR_SHIFT_LIMITED_UNROLL_SIZE -#define SHIFT_LIMITED_SIMD_SZ CSDR_SHIFT_LIMITED_SIMD_SZ +/*********************************************************************/ + +/**************/ +/*** ALGO E ***/ +/**************/ shift_limited_unroll_data_t shift_limited_unroll_init(float rate) { shift_limited_unroll_data_t output; output.phase_increment=2*rate*PI; float myphase = 0; - for(int i=0;i<SHIFT_UNROLL_SIZE;i++) + for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++) { myphase += output.phase_increment; while(myphase>PI) myphase-=2*PI; @@ -309,87 +433,119 @@ void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, { float cos_start = d->complex_phase.i; float sin_start = d->complex_phase.q; - register float cos_val = cos_start, sin_val = sin_start; - while (size > 0) //@shift_limited_unroll_cc + register float cos_val = cos_start, sin_val = sin_start, mag; + while (size > 0) { - int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size; - for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ ) + int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size; + for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ ) { - for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) + for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++) { - iof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j); - qof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j); + iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j); + qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j); // calculate complex phasor for next iteration - cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; - sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; } } - input += SHIFT_UNROLL_SIZE; - output += SHIFT_UNROLL_SIZE; - size -= SHIFT_UNROLL_SIZE; + // "starts := vals := vals / |vals|" + mag = sqrtf(cos_val * cos_val + sin_val * sin_val); + cos_val /= mag; + sin_val /= mag; + cos_start = cos_val; + sin_start = sin_val; + + input += PF_SHIFT_LIMITED_UNROLL_SIZE; + output += PF_SHIFT_LIMITED_UNROLL_SIZE; + size -= PF_SHIFT_LIMITED_UNROLL_SIZE; } d->complex_phase.i = cos_val; d->complex_phase.q = sin_val; } PF_TARGET_CLONES -void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d) +void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d) { - float inp_i[SHIFT_LIMITED_SIMD_SZ]; - float inp_q[SHIFT_LIMITED_SIMD_SZ]; + float inp_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float inp_q[PF_SHIFT_LIMITED_SIMD_SZ]; + // "vals := starts := phase_state" float cos_start = d->complex_phase.i; float sin_start = d->complex_phase.q; - register float cos_val = cos_start, sin_val = sin_start; - while (size > 0) //@shift_limited_unroll_inp_c + register float cos_val = cos_start, sin_val = sin_start, mag; + while (N_cplx) { - int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size; - for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ ) + int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ ) { - for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) - inp_i[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].i; - for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) - inp_q[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].q; - for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) + for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++) + inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i; + for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++) + inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q; + for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++) { - iof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j]; - qof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j]; + // "out[] = inp[] * vals" + iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j]; + qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j]; // calculate complex phasor for next iteration - cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; - sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + // "vals := d[] * starts" + cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; } } - in_out += SHIFT_UNROLL_SIZE; - size -= SHIFT_UNROLL_SIZE; + // "starts := vals := vals / |vals|" + mag = sqrtf(cos_val * cos_val + sin_val * sin_val); + cos_val /= mag; + sin_val /= mag; + cos_start = cos_val; + sin_start = sin_val; + + in_out += PF_SHIFT_LIMITED_UNROLL_SIZE; + N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE; } - d->complex_phase.i = cos_val; - d->complex_phase.q = sin_val; + // "phase_state := starts" + d->complex_phase.i = cos_start; + d->complex_phase.q = sin_start; } #ifdef HAVE_SSE_INTRINSICS -shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad) +/*********************************************************************/ + +/**************/ +/*** ALGO F ***/ +/**************/ + +shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) { - shift_limited_unroll_sse_data_t output; + shift_limited_unroll_A_sse_data_t output; float myphase; output.phase_increment = 2*relative_freq*PI; myphase = 0.0F; - for (int i = 0; i < SHIFT_UNROLL_SIZE + SHIFT_LIMITED_SIMD_SZ; i++) + for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) { + for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } output.dcos[i] = cos(myphase); output.dsin[i] = sin(myphase); - myphase += output.phase_increment; - while(myphase>PI) myphase-=2*PI; - while(myphase<-PI) myphase+=2*PI; + for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + output.dcos[i+k] = output.dcos[i]; + output.dsin[i+k] = output.dsin[i]; + } } output.dcos_blk = 0.0F; output.dsin_blk = 0.0F; myphase = phase_start_rad; - for (int i = 0; i < SHIFT_LIMITED_SIMD_SZ; i++) + for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) { output.phase_state_i[i] = cos(myphase); output.phase_state_q[i] = sin(myphase); @@ -402,10 +558,11 @@ shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_fre PF_TARGET_CLONES -void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d) +void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) { - const __m128 cos_starts = VLOAD_ALIGNED( &d->phase_state_i[0] ); - const __m128 sin_starts = VLOAD_ALIGNED( &d->phase_state_q[0] ); + // "vals := starts := phase_state" + __m128 cos_starts = VLOAD( &d->phase_state_i[0] ); + __m128 sin_starts = VLOAD( &d->phase_state_q[0] ); __m128 cos_vals = cos_starts; __m128 sin_vals = sin_starts; __m128 inp_re, inp_im; @@ -417,27 +574,29 @@ void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_ while (N_cplx) { - const int NB = (N_cplx >= CSDR_SHIFT_LIMITED_UNROLL_SIZE) ? CSDR_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; int B = NB; p_trig_cos_tab = (__m128*)( &d->dcos[0] ); p_trig_sin_tab = (__m128*)( &d->dsin[0] ); - while (B >= 0) + while (B) { // complex multiplication of 4 complex values from/to in_out[] // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): - UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + // "out[] = inp[] * vals" + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); - VSTORE_ALIGNED(u, interl_prod_a); - VSTORE_ALIGNED(u+1, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); u += 2; // calculate complex phasor for next iteration - // cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; - // sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-) - inp_re = VLOAD_ALIGNED(p_trig_cos_tab); - inp_im = VLOAD_ALIGNED(p_trig_sin_tab); + // "vals := d[] * starts" + inp_re = VLOAD(p_trig_cos_tab); + inp_im = VLOAD(p_trig_sin_tab); cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); ++p_trig_cos_tab; @@ -447,97 +606,294 @@ void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_ N_cplx -= NB; /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ /* re-use product_re[]/product_im[] for normalization */ + // "starts := vals := vals / |vals|" product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); +#if 0 + // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64 + // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16 product_im = _mm_rsqrt_ps(product_re); - cos_vals = VMUL(cos_vals, product_im); - sin_vals = VMUL(sin_vals, product_im); + cos_starts = cos_vals = VMUL(cos_vals, product_im); + sin_starts = sin_vals = VMUL(sin_vals, product_im); +#else + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower! + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again + product_im = _mm_sqrt_ps(product_re); + cos_starts = cos_vals = VDIV(cos_vals, product_im); + sin_starts = sin_vals = VDIV(sin_vals, product_im); +#endif } - VSTORE_ALIGNED( &d->phase_state_i[0], cos_vals ); - VSTORE_ALIGNED( &d->phase_state_q[0], sin_vals ); + // "phase_state := starts" + VSTORE( &d->phase_state_i[0], cos_starts ); + VSTORE( &d->phase_state_q[0], sin_starts ); } -#else -int have_sse_shift_mixer_impl() -{ - return 0; -} +/*********************************************************************/ +/**************/ +/*** ALGO G ***/ +/**************/ -shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad) +shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) { - assert(0); - shift_limited_unroll_sse_data_t r; - return r; + shift_limited_unroll_B_sse_data_t output; + float myphase; + + output.phase_increment = 2*relative_freq*PI; + + myphase = 0.0F; + for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) + { + for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + output.dtrig[i+0] = cos(myphase); + output.dtrig[i+1] = sin(myphase); + output.dtrig[i+2] = output.dtrig[i+0]; + output.dtrig[i+3] = output.dtrig[i+1]; + } + + output.dcos_blk = 0.0F; + output.dsin_blk = 0.0F; + + myphase = phase_start_rad; + for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) + { + output.phase_state_i[i] = cos(myphase); + output.phase_state_q[i] = sin(myphase); + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + return output; } -void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d) + +PF_TARGET_CLONES +void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) { - assert(0); -} + // "vals := starts := phase_state" + __m128 cos_starts = VLOAD( &d->phase_state_i[0] ); + __m128 sin_starts = VLOAD( &d->phase_state_q[0] ); + __m128 cos_vals = cos_starts; + __m128 sin_vals = sin_starts; + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT p_trig_tab; + __m128 * RESTRICT u = (__m128*)in_out; + while (N_cplx) + { + const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + int B = NB; + p_trig_tab = (__m128*)( &d->dtrig[0] ); + while (B) + { + // complex multiplication of 4 complex values from/to in_out[] + // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): + // "out[] = inp[] * vals" + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); + product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); + u += 2; + // calculate complex phasor for next iteration + // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-) + // "vals := d[] * starts" + product_re = VLOAD(p_trig_tab); + UNINTERLEAVE2(product_re, product_re, inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); + sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); + ++p_trig_tab; + B -= 4; + } + N_cplx -= NB; + /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ + /* re-use product_re[]/product_im[] for normalization */ + // "starts := vals := vals / |vals|" + product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); +#if 0 + // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64 + // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16 + product_im = _mm_rsqrt_ps(product_re); + cos_starts = cos_vals = VMUL(cos_vals, product_im); + sin_starts = sin_vals = VMUL(sin_vals, product_im); +#else + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower! + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again + product_im = _mm_sqrt_ps(product_re); + cos_starts = cos_vals = VDIV(cos_vals, product_im); + sin_starts = sin_vals = VDIV(sin_vals, product_im); #endif + } + // "phase_state := starts" + VSTORE( &d->phase_state_i[0], cos_starts ); + VSTORE( &d->phase_state_q[0], sin_starts ); +} +/*********************************************************************/ -shift_addfast_data_t shift_addfast_init(float rate) + +/**************/ +/*** ALGO H ***/ +/**************/ + +shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) { - shift_addfast_data_t output; - output.phase_increment=2*rate*PI; - for(int i=0;i<4;i++) + shift_limited_unroll_C_sse_data_t output; + float myphase; + + output.phase_increment = 2*relative_freq*PI; + + myphase = 0.0F; + for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ) { - output.dsin[i]=sin(output.phase_increment*(i+1)); - output.dcos[i]=cos(output.phase_increment*(i+1)); + for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + output.dinterl_trig[2*i] = cos(myphase); + output.dinterl_trig[2*i+4] = sin(myphase); + for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++) + { + output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i]; + output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4]; + } } - return output; -} + output.dcos_blk = 0.0F; + output.dsin_blk = 0.0F; -#ifndef HAVE_ADDFAST_CC_IMPL + myphase = phase_start_rad; + for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++) + { + output.phase_state_i[i] = cos(myphase); + output.phase_state_q[i] = sin(myphase); + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + return output; +} -#define SADF_L1(j) \ - cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \ - sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j; -#define SADF_L2(j) \ - iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \ - qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j); PF_TARGET_CLONES -float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) +void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) { - //input_size should be multiple of 4 - //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); - float cos_start=cos(starting_phase); - float sin_start=sin(starting_phase); - float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, - sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, - dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], - dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3]; + // "vals := starts := phase_state" + __m128 cos_starts = VLOAD( &d->phase_state_i[0] ); + __m128 sin_starts = VLOAD( &d->phase_state_q[0] ); + __m128 cos_vals = cos_starts; + __m128 sin_vals = sin_starts; + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT p_trig_tab; + __m128 * RESTRICT u = (__m128*)in_out; - for(int i=0;i<input_size/4; i++) //@shift_addfast_cc + while (N_cplx) { - SADF_L1(0) - SADF_L1(1) - SADF_L1(2) - SADF_L1(3) - SADF_L2(0) - SADF_L2(1) - SADF_L2(2) - SADF_L2(3) - cos_start = cos_vals_3; - sin_start = sin_vals_3; + const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + int B = NB; + p_trig_tab = (__m128*)( &d->dinterl_trig[0] ); + while (B) + { + // complex multiplication of 4 complex values from/to in_out[] + // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): + // "out[] = inp[] * vals" + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); + product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); + u += 2; + // calculate complex phasor for next iteration + // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j]; + // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-) + // "vals := d[] * starts" + inp_re = VLOAD(p_trig_tab); + inp_im = VLOAD(p_trig_tab+1); + cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); + sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); + p_trig_tab += 2; + B -= 4; + } + N_cplx -= NB; + /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ + /* re-use product_re[]/product_im[] for normalization */ + // "starts := vals := vals / |vals|" + product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); +#if 0 + // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64 + // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16 + product_im = _mm_rsqrt_ps(product_re); + cos_starts = cos_vals = VMUL(cos_vals, product_im); + sin_starts = sin_vals = VMUL(sin_vals, product_im); +#else + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower! + // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again + product_im = _mm_sqrt_ps(product_re); + cos_starts = cos_vals = VDIV(cos_vals, product_im); + sin_starts = sin_vals = VDIV(sin_vals, product_im); +#endif } - starting_phase+=input_size*d->phase_increment; - while(starting_phase>PI) starting_phase-=2*PI; - while(starting_phase<-PI) starting_phase+=2*PI; - return starting_phase; + // "phase_state := starts" + VSTORE( &d->phase_state_i[0], cos_starts ); + VSTORE( &d->phase_state_q[0], sin_starts ); +} + + +#else + +/*********************************************************************/ + +shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) { + assert(0); + shift_limited_unroll_A_sse_data_t r; + return r; +} +shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) { + assert(0); + shift_limited_unroll_B_sse_data_t r; + return r; +} +shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) { + assert(0); + shift_limited_unroll_C_sse_data_t r; + return r; +} + +void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) { + assert(0); +} +void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) { + assert(0); +} +void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) { + assert(0); } #endif +/*********************************************************************/ -#define SHIFT_REC_SIMD_SZ CSDR_SHIFT_RECURSIVE_SIMD_SZ +/**************/ +/*** ALGO I ***/ +/**************/ void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state) { @@ -545,7 +901,7 @@ void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *con float phase_increment_s = rate*PI; float k1 = tan(0.5*phase_increment_s); float k2 = 2*k1 /(1 + k1 * k1); - for (int j=1; j<SHIFT_REC_SIMD_SZ; j++) + for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++) { float tmp; state->u_cos[j] = state->u_cos[j-1]; @@ -556,8 +912,8 @@ void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *con state->u_cos[j] = tmp - k1 * state->v_sin[j]; } - // constants for SHIFT_REC_SIMD_SZ times phase step - float phase_increment_b = phase_increment_s * SHIFT_REC_SIMD_SZ; + // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step + float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ; while(phase_increment_b > PI) phase_increment_b-=2*PI; while(phase_increment_b < -PI) phase_increment_b+=2*PI; conf->k1 = tan(0.5*phase_increment_b); @@ -584,31 +940,31 @@ PF_TARGET_CLONES void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext) { - float tmp[SHIFT_REC_SIMD_SZ]; - float inp_i[SHIFT_REC_SIMD_SZ]; - float inp_q[SHIFT_REC_SIMD_SZ]; + float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ]; shift_recursive_osc_t state = *state_ext; const float k1 = conf->k1; const float k2 = conf->k2; - for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_cc + for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc { //we multiply two complex numbers - similar to shift_math_cc - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) { - inp_i[j] = input[SHIFT_REC_SIMD_SZ*i+j].i; - inp_q[j] = input[SHIFT_REC_SIMD_SZ*i+j].q; + inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i; + inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q; } - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) { - iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; - qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; + iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; + qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; } // update complex phasor - like incrementing phase - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) state.v_sin[j] += k2 * tmp[j]; - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; } *state_ext = state; @@ -618,31 +974,31 @@ PF_TARGET_CLONES void shift_recursive_osc_inp_c(complexf* in_out, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext) { - float tmp[SHIFT_REC_SIMD_SZ]; - float inp_i[SHIFT_REC_SIMD_SZ]; - float inp_q[SHIFT_REC_SIMD_SZ]; + float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ]; shift_recursive_osc_t state = *state_ext; const float k1 = conf->k1; const float k2 = conf->k2; - for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_inp_c + for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c { - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) { - inp_i[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].i; - inp_q[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].q; + inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i; + inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q; } //we multiply two complex numbers - similar to shift_math_cc - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) { - iof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; - qof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; + iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; + qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; } // update complex phasor - like incrementing phase - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) state.v_sin[j] += k2 * tmp[j]; - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; } *state_ext = state; @@ -652,24 +1008,24 @@ PF_TARGET_CLONES void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext) { - float tmp[SHIFT_REC_SIMD_SZ]; + float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ]; shift_recursive_osc_t state = *state_ext; const float k1 = conf->k1; const float k2 = conf->k2; - for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@gen_recursive_osc_c + for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c { // output complex oscillator value - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) { - iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j]; - qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j]; + iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j]; + qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j]; } // update complex phasor - like incrementing phase - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) state.v_sin[j] += k2 * tmp[j]; - for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++) state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; } *state_ext = state; @@ -678,13 +1034,19 @@ void gen_recursive_osc_c(complexf* output, #ifdef HAVE_SSE_INTRINSICS +/*********************************************************************/ + +/**************/ +/*** ALGO J ***/ +/**************/ + void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state) { // constants for single phase step float phase_increment_s = rate*PI; float k1 = tan(0.5*phase_increment_s); float k2 = 2*k1 /(1 + k1 * k1); - for (int j=1; j<CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++) + for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++) { float tmp; state->u_cos[j] = state->u_cos[j-1]; @@ -695,8 +1057,8 @@ void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_con state->u_cos[j] = tmp - k1 * state->v_sin[j]; } - // constants for CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step - float phase_increment_b = phase_increment_s * CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ; + // constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step + float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; while(phase_increment_b > PI) phase_increment_b-=2*PI; while(phase_increment_b < -PI) phase_increment_b+=2*PI; conf->k1 = tan(0.5*phase_increment_b); @@ -726,8 +1088,8 @@ void shift_recursive_osc_sse_inp_c(complexf* in_out, { const __m128 k1 = LD_PS1( conf->k1 ); const __m128 k2 = LD_PS1( conf->k2 ); - __m128 u_cos = VLOAD_ALIGNED( &state_ext->u_cos[0] ); - __m128 v_sin = VLOAD_ALIGNED( &state_ext->v_sin[0] ); + __m128 u_cos = VLOAD( &state_ext->u_cos[0] ); + __m128 v_sin = VLOAD( &state_ext->v_sin[0] ); __m128 inp_re, inp_im; __m128 product_re, product_im; __m128 interl_prod_a, interl_prod_b; @@ -735,18 +1097,18 @@ void shift_recursive_osc_sse_inp_c(complexf* in_out, while (N_cplx) { - //inp_i[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i; - //inp_q[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q; - UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i; + //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q; + UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ //we multiply two complex numbers - similar to shift_math_cc - //iof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; - //qof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; + //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; + //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) ); product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) ); INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); - VSTORE_ALIGNED(u, interl_prod_a); - VSTORE_ALIGNED(u+1, interl_prod_b); + VSTORE(u, interl_prod_a); + VSTORE(u+1, interl_prod_b); u += 2; // update complex phasor - like incrementing phase @@ -759,8 +1121,8 @@ void shift_recursive_osc_sse_inp_c(complexf* in_out, N_cplx -= 4; } - VSTORE_ALIGNED( &state_ext->u_cos[0], u_cos ); - VSTORE_ALIGNED( &state_ext->v_sin[0], v_sin ); + VSTORE( &state_ext->u_cos[0], u_cos ); + VSTORE( &state_ext->v_sin[0], v_sin ); } #else @@ -55,18 +55,37 @@ typedef struct complexf_s { float i; float q; } complexf; int have_sse_shift_mixer_impl(); + +/*********************************************************************/ + +/**************/ +/*** ALGO A ***/ +/**************/ + float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase); +/*********************************************************************/ + +/**************/ +/*** ALGO B ***/ +/**************/ + typedef struct shift_table_data_s { float* table; int table_size; } shift_table_data_t; + void shift_table_deinit(shift_table_data_t table_data); shift_table_data_t shift_table_init(int table_size); float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase); +/*********************************************************************/ + +/**************/ +/*** ALGO C ***/ +/**************/ typedef struct shift_addfast_data_s { @@ -74,9 +93,17 @@ typedef struct shift_addfast_data_s float dcos[4]; float phase_increment; } shift_addfast_data_t; + shift_addfast_data_t shift_addfast_init(float rate); float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase); +float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase); + +/*********************************************************************/ + +/**************/ +/*** ALGO D ***/ +/**************/ typedef struct shift_unroll_data_s { @@ -85,59 +112,128 @@ typedef struct shift_unroll_data_s float phase_increment; int size; } shift_unroll_data_t; + shift_unroll_data_t shift_unroll_init(float rate, int size); void shift_unroll_deinit(shift_unroll_data_t* d); float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase); float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase); +/*********************************************************************/ + +/**************/ +/*** ALGO E ***/ +/**************/ + /* similar to shift_unroll_cc() - but, have fixed and limited precalc size * idea: smaller cache usage by table * size must be multiple of CSDR_SHIFT_LIMITED_SIMD (= 4) */ -#define CSDR_SHIFT_LIMITED_UNROLL_SIZE 64 -#define CSDR_SHIFT_LIMITED_SIMD_SZ 4 +#define PF_SHIFT_LIMITED_UNROLL_SIZE 128 +#define PF_SHIFT_LIMITED_SIMD_SZ 4 + typedef struct shift_limited_unroll_data_s { - float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE]; - float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE]; + float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE]; + float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE]; complexf complex_phase; float phase_increment; } shift_limited_unroll_data_t; + shift_limited_unroll_data_t shift_limited_unroll_init(float rate); -/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */ +/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */ /* starting_phase for next call is kept internal in state */ void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d); void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d); -typedef struct shift_limited_unroll_sse_data_s +/*********************************************************************/ + +/**************/ +/*** ALGO F ***/ +/**************/ + +typedef struct shift_limited_unroll_A_sse_data_s +{ + /* small/limited trig table */ + float dcos[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ]; + float dsin[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ]; + /* 4 times complex phase */ + float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ]; + /* N_cplx_per_block times increment - for future parallel variants */ + float dcos_blk; + float dsin_blk; + /* */ + float phase_increment; +} shift_limited_unroll_A_sse_data_t; + +shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad); +void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d); + + +/*********************************************************************/ + +/**************/ +/*** ALGO G ***/ +/**************/ + +typedef struct shift_limited_unroll_B_sse_data_s { /* small/limited trig table */ - float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ]; - float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ]; + float dtrig[PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ]; /* 4 times complex phase */ - float phase_state_i[CSDR_SHIFT_LIMITED_SIMD_SZ]; - float phase_state_q[CSDR_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ]; /* N_cplx_per_block times increment - for future parallel variants */ float dcos_blk; float dsin_blk; /* */ float phase_increment; -} shift_limited_unroll_sse_data_t; -shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad); -void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d); +} shift_limited_unroll_B_sse_data_t; + +shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad); +void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d); +/*********************************************************************/ +/**************/ +/*** ALGO H ***/ +/**************/ + +typedef struct shift_limited_unroll_C_sse_data_s +{ + /* small/limited trig table - interleaved: 4 cos, 4 sin, 4 cos, .. */ + float dinterl_trig[2*(PF_SHIFT_LIMITED_UNROLL_SIZE+PF_SHIFT_LIMITED_SIMD_SZ)]; + /* 4 times complex phase */ + float phase_state_i[PF_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_q[PF_SHIFT_LIMITED_SIMD_SZ]; + /* N_cplx_per_block times increment - for future parallel variants */ + float dcos_blk; + float dsin_blk; + /* */ + float phase_increment; +} shift_limited_unroll_C_sse_data_t; + +shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad); +void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d); + + + +/*********************************************************************/ + +/**************/ +/*** ALGO I ***/ +/**************/ /* Recursive Quadrature Oscillator functions "recursive_osc" * see https://www.vicanek.de/articles/QuadOsc.pdf */ -#define CSDR_SHIFT_RECURSIVE_SIMD_SZ 8 +#define PF_SHIFT_RECURSIVE_SIMD_SZ 8 typedef struct shift_recursive_osc_s { - float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SZ]; - float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SZ]; + float u_cos[PF_SHIFT_RECURSIVE_SIMD_SZ]; + float v_sin[PF_SHIFT_RECURSIVE_SIMD_SZ]; } shift_recursive_osc_t; typedef struct shift_recursive_osc_conf_s @@ -149,17 +245,23 @@ typedef struct shift_recursive_osc_conf_s void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state); void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); -/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */ +/* size must be multiple of PF_SHIFT_LIMITED_SIMD_SZ */ /* starting_phase for next call is kept internal in state */ void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); -#define CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ 4 +/*********************************************************************/ + +/**************/ +/*** ALGO J ***/ +/**************/ + +#define PF_SHIFT_RECURSIVE_SIMD_SSE_SZ 4 typedef struct shift_recursive_osc_sse_s { - float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ]; - float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ]; + float u_cos[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ]; + float v_sin[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ]; } shift_recursive_osc_sse_t; typedef struct shift_recursive_osc_sse_conf_s |