summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Duff <bduff@google.com>2013-11-30 21:15:50 -0800
committerBrian Duff <bduff@google.com>2013-11-30 21:17:46 -0800
commit598e0aeda91e8ed536b4b2ed670be0055dde3289 (patch)
tree348efaaa434a28c5df76570c815fbd67eb3c7a22
parent85ba2a83c6cd8a10c26124550736a89a2e03f14b (diff)
downloadfft2d-598e0aeda91e8ed536b4b2ed670be0055dde3289.tar.gz
Initial commit of fft2d.
Bug: 11946001 Change-Id: I79609cc5a1a011e25a1dfb597a7ddcec532366cb
-rw-r--r--Android.mk18
-rw-r--r--CleanSpec.mk49
-rw-r--r--MODULE_LICENSE_NOTICE3
-rw-r--r--README.android42
-rw-r--r--src/fft2d/fft.h24
-rw-r--r--src/fft2d/fft2d.h24
-rw-r--r--src/fft2d/fft2d/alloc.c153
-rw-r--r--src/fft2d/fft2d/alloc.h20
-rw-r--r--src/fft2d/fft2d/fft4f2d.c1705
-rw-r--r--src/fft2d/fft2d/fft4f2d.f1591
-rw-r--r--src/fft2d/fft2d/fftsg.c3314
-rw-r--r--src/fft2d/fft2d/fftsg.f2967
-rw-r--r--src/fft2d/fft2d/fftsg2d.c1190
-rw-r--r--src/fft2d/fft2d/fftsg2d.f562
-rw-r--r--src/fft2d/fft2d/fftsg3d.c1695
-rw-r--r--src/fft2d/fft2d/fftsg3d.f926
-rw-r--r--src/fft2d/fft2d/readme2d.txt71
-rw-r--r--src/fft2d/fft2d/sample2d/Makefile72
-rw-r--r--src/fft2d/fft2d/sample2d/Makefile.f7760
-rw-r--r--src/fft2d/fft2d/sample2d/Makefile.pth54
-rw-r--r--src/fft2d/fft2d/sample2d/alloc.h20
-rw-r--r--src/fft2d/fft2d/sample2d/fft4f2dt.c109
-rw-r--r--src/fft2d/fft2d/sample2d/fft4f2dt.f95
-rw-r--r--src/fft2d/fft2d/sample2d/fftsg2dt.c107
-rw-r--r--src/fft2d/fft2d/sample2d/fftsg2dt.f94
-rw-r--r--src/fft2d/fft2d/sample2d/fftsg3dt.c128
-rw-r--r--src/fft2d/fft2d/sample2d/fftsg3dt.f119
-rw-r--r--src/fft2d/fft2d/sample2d/shrtdctt.c68
-rw-r--r--src/fft2d/fft2d/shrtdct.c538
29 files changed, 15818 insertions, 0 deletions
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..ea70b19
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,18 @@
+# Copyright 2013 Google, Inc.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libfft2d_static
+LOCAL_MODULE_TAGS := optional
+
+LOCAL_SDK_VERSION := 9
+
+LOCAL_SRC_FILES := \
+ src/fft2d/fft2d/fftsg.c \
+ src/fft2d/fft2d/fftsg2d.c \
+ src/fft2d/fft2d/alloc.c \
+ src/fft2d/fft2d/shrtdct.c
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/CleanSpec.mk b/CleanSpec.mk
new file mode 100644
index 0000000..b84e1b6
--- /dev/null
+++ b/CleanSpec.mk
@@ -0,0 +1,49 @@
+# Copyright (C) 2007 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# If you don't need to do a full clean build but would like to touch
+# a file or delete some intermediate files, add a clean step to the end
+# of the list. These steps will only be run once, if they haven't been
+# run before.
+#
+# E.g.:
+# $(call add-clean-step, touch -c external/sqlite/sqlite3.h)
+# $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libz_intermediates)
+#
+# Always use "touch -c" and "rm -f" or "rm -rf" to gracefully deal with
+# files that are missing or have been moved.
+#
+# Use $(PRODUCT_OUT) to get to the "out/target/product/blah/" directory.
+# Use $(OUT_DIR) to refer to the "out" directory.
+#
+# If you need to re-do something that's already mentioned, just copy
+# the command and add it to the bottom of the list. E.g., if a change
+# that you made last week required touching a file and a change you
+# made today requires touching the same file, just copy the old
+# touch step and add it to the end of the list.
+#
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# For example:
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/AndroidTests_intermediates)
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/JAVA_LIBRARIES/core_intermediates)
+#$(call add-clean-step, find $(OUT_DIR) -type f -name "IGTalkSession*" -print0 | xargs -0 rm -f)
+#$(call add-clean-step, rm -rf $(PRODUCT_OUT)/data/*)
+
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
diff --git a/MODULE_LICENSE_NOTICE b/MODULE_LICENSE_NOTICE
new file mode 100644
index 0000000..2bd8550
--- /dev/null
+++ b/MODULE_LICENSE_NOTICE
@@ -0,0 +1,3 @@
+Copyright(C) 1997,2001 Takuya OOURA (email: ooura@kurims.kyoto-u.ac.jp).
+You may use, copy, modify this code for any purpose and
+without fee. You may distribute this ORIGINAL package.
diff --git a/README.android b/README.android
new file mode 100644
index 0000000..2575079
--- /dev/null
+++ b/README.android
@@ -0,0 +1,42 @@
+ Third-party FFT (fast Fourier transform) code
+
+This directory contains the third-party one- and two-dimensional
+fast Fourier transform and related sinusoidal transform code written
+by Takuya Ooura.
+
+Depending on your needs, there may be better choices than this
+package, e.g.:
+
+1. David Talkin has written a fast and flexible radix-2 FFT class:
+ google3/speech/dsp/fft.h
+
+2. fftw (http://www.fftw.org/) likely offers speed advantages over the
+routines in this package and undoubtedly has a much larger user base.
+However, it's a much bigger and more complex package.
+
+Package origin URL:
+-------------------
+ http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html
+
+Version:
+--------
+ "fft2d.tgz (53KB) updated: 2001/11/22"
+ md5sum fft2d.tgz: c02e1826a34a06ad1413209843c8e80f fft2d.tgz
+
+License:
+--------
+
+ See the bottom of fft2d/readme2d.txt.
+
+ My (popat's) reading of it is
+
+ licenses = ["notice"],
+
+Downloading and unpacking history:
+----------------------------------
+
+ Tue Jun 6 14:29:50 2006:
+ wget http://momonga.t.u-tokyo.ac.jp/~ooura/fft2d.tgz
+ tar xzf fft2d.tgz
+ rm fft2d.tgz
+
diff --git a/src/fft2d/fft.h b/src/fft2d/fft.h
new file mode 100644
index 0000000..c64d31c
--- /dev/null
+++ b/src/fft2d/fft.h
@@ -0,0 +1,24 @@
+// Copyright 2006 Google Inc. All Rights Reserved.
+// Author: popat@google.com (Ashok C. Popat)
+//
+// Declarations for third-party 1D FFT routines in third_party/fft2d/fft2d.
+
+#ifndef THIRD_PARTY_FFT2D_FFT_H__
+#define THIRD_PARTY_FFT2D_FFT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void cdft(int, int, double *, int *, double *);
+extern void rdft(int, int, double *, int *, double *);
+extern void ddct(int, int, double *, int *, double *);
+extern void ddst(int, int, double *, int *, double *);
+extern void dfct(int, double *, double *, int *, double *);
+extern void dfst(int, double *, double *, int *, double *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // THIRD_PARTY_FFT2D_FFT_H__
diff --git a/src/fft2d/fft2d.h b/src/fft2d/fft2d.h
new file mode 100644
index 0000000..fa32a20
--- /dev/null
+++ b/src/fft2d/fft2d.h
@@ -0,0 +1,24 @@
+// Copyright 2006 Google Inc. All Rights Reserved.
+// Author: popat@google.com (Ashok C. Popat)
+//
+// Declarations for third-party 2D FFT routines in third_party/fft2d/fft2d.
+
+#ifndef THIRD_PARTY_FFT2D_FFT2D_H__
+#define THIRD_PARTY_FFT2D_FFT2D_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void cdft2d(int, int, int, double **, double *, int *, double *);
+extern void rdft2d(int, int, int, double **, double *, int *, double *);
+extern void ddct2d(int, int, int, double **, double *, int *, double *);
+extern void ddst2d(int, int, int, double **, double *, int *, double *);
+extern void ddct8x8s(int isgn, double **a);
+extern void ddct16x16s(int isgn, double **a);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // THIRD_PARTY_FFT2D_FFT2D_H__
diff --git a/src/fft2d/fft2d/alloc.c b/src/fft2d/fft2d/alloc.c
new file mode 100644
index 0000000..7833d88
--- /dev/null
+++ b/src/fft2d/fft2d/alloc.c
@@ -0,0 +1,153 @@
+/* ---- memory allocation ---- */
+#include "alloc.h"
+
+
+#define alloc_error_check(p) { \
+ if ((p) == NULL) { \
+ fprintf(stderr, "Allocation Failure!\n"); \
+ exit(1); \
+ } \
+}
+
+
+int *alloc_1d_int(int n1)
+{
+ int *i;
+
+ i = (int *) malloc(sizeof(int) * n1);
+ alloc_error_check(i);
+ return i;
+}
+
+
+void free_1d_int(int *i)
+{
+ free(i);
+}
+
+
+double *alloc_1d_double(int n1)
+{
+ double *d;
+
+ d = (double *) malloc(sizeof(double) * n1);
+ alloc_error_check(d);
+ return d;
+}
+
+
+void free_1d_double(double *d)
+{
+ free(d);
+}
+
+
+int **alloc_2d_int(int n1, int n2)
+{
+ int **ii, *i;
+ int j;
+
+ ii = (int **) malloc(sizeof(int *) * n1);
+ alloc_error_check(ii);
+ i = (int *) malloc(sizeof(int) * n1 * n2);
+ alloc_error_check(i);
+ ii[0] = i;
+ for (j = 1; j < n1; j++) {
+ ii[j] = ii[j - 1] + n2;
+ }
+ return ii;
+}
+
+
+void free_2d_int(int **ii)
+{
+ free(ii[0]);
+ free(ii);
+}
+
+
+double **alloc_2d_double(int n1, int n2)
+{
+ double **dd, *d;
+ int j;
+
+ dd = (double **) malloc(sizeof(double *) * n1);
+ alloc_error_check(dd);
+ d = (double *) malloc(sizeof(double) * n1 * n2);
+ alloc_error_check(d);
+ dd[0] = d;
+ for (j = 1; j < n1; j++) {
+ dd[j] = dd[j - 1] + n2;
+ }
+ return dd;
+}
+
+
+void free_2d_double(double **dd)
+{
+ free(dd[0]);
+ free(dd);
+}
+
+
+int ***alloc_3d_int(int n1, int n2, int n3)
+{
+ int ***iii, **ii, *i;
+ int j;
+
+ iii = (int ***) malloc(sizeof(int **) * n1);
+ alloc_error_check(iii);
+ ii = (int **) malloc(sizeof(int *) * n1 * n2);
+ alloc_error_check(ii);
+ iii[0] = ii;
+ for (j = 1; j < n1; j++) {
+ iii[j] = iii[j - 1] + n2;
+ }
+ i = (int *) malloc(sizeof(int) * n1 * n2 * n3);
+ alloc_error_check(i);
+ ii[0] = i;
+ for (j = 1; j < n1 * n2; j++) {
+ ii[j] = ii[j - 1] + n3;
+ }
+ return iii;
+}
+
+
+void free_3d_int(int ***iii)
+{
+ free(iii[0][0]);
+ free(iii[0]);
+ free(iii);
+}
+
+
+double ***alloc_3d_double(int n1, int n2, int n3)
+{
+ double ***ddd, **dd, *d;
+ int j;
+
+ ddd = (double ***) malloc(sizeof(double **) * n1);
+ alloc_error_check(ddd);
+ dd = (double **) malloc(sizeof(double *) * n1 * n2);
+ alloc_error_check(dd);
+ ddd[0] = dd;
+ for (j = 1; j < n1; j++) {
+ ddd[j] = ddd[j - 1] + n2;
+ }
+ d = (double *) malloc(sizeof(double) * n1 * n2 * n3);
+ alloc_error_check(d);
+ dd[0] = d;
+ for (j = 1; j < n1 * n2; j++) {
+ dd[j] = dd[j - 1] + n3;
+ }
+ return ddd;
+}
+
+
+void free_3d_double(double ***ddd)
+{
+ free(ddd[0][0]);
+ free(ddd[0]);
+ free(ddd);
+}
+
diff --git a/src/fft2d/fft2d/alloc.h b/src/fft2d/fft2d/alloc.h
new file mode 100644
index 0000000..3467cc4
--- /dev/null
+++ b/src/fft2d/fft2d/alloc.h
@@ -0,0 +1,20 @@
+/* ---- memory allocation ---- */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+int *alloc_1d_int(int n1);
+void free_1d_int(int *i);
+double *alloc_1d_double(int n1);
+void free_1d_double(double *d);
+int **alloc_2d_int(int n1, int n2);
+void free_2d_int(int **ii);
+double **alloc_2d_double(int n1, int n2);
+void free_2d_double(double **dd);
+int ***alloc_3d_int(int n1, int n2, int n3);
+void free_3d_int(int ***iii);
+double ***alloc_3d_double(int n1, int n2, int n3);
+void free_3d_double(double ***ddd);
+
diff --git a/src/fft2d/fft2d/fft4f2d.c b/src/fft2d/fft2d/fft4f2d.c
new file mode 100644
index 0000000..354c80e
--- /dev/null
+++ b/src/fft2d/fft2d/fft4f2d.c
@@ -0,0 +1,1705 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+ dimension :two
+ data length :power of 2
+ decimation :frequency
+ radix :4, 2, row-column
+ data :inplace
+ table :use
+functions
+ cdft2d: Complex Discrete Fourier Transform
+ rdft2d: Real Discrete Fourier Transform
+ ddct2d: Discrete Cosine Transform
+ ddst2d: Discrete Sine Transform
+function prototypes
+ void cdft2d(int, int, int, double **, int *, double *);
+ void rdft2d(int, int, int, double **, int *, double *);
+ void ddct2d(int, int, int, double **, double **, int *, double *);
+ void ddst2d(int, int, int, double **, double **, int *, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+ [definition]
+ <case1>
+ X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
+ exp(2*pi*i*j1*k1/n1) *
+ exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+ <case2>
+ X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
+ exp(-2*pi*i*j1*k1/n1) *
+ exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+ (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ cdft2d(n1, 2*n2, 1, a, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ cdft2d(n1, 2*n2, -1, a, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 1, n1 = power of 2
+ 2*n2 :data length (int)
+ n2 >= 1, n2 = power of 2
+ a[0...n1-1][0...2*n2-1]
+ :input/output data (double **)
+ input data
+ a[j1][2*j2] = Re(x[j1][j2]),
+ a[j1][2*j2+1] = Im(x[j1][j2]),
+ 0<=j1<n1, 0<=j2<n2
+ output data
+ a[k1][2*k2] = Re(X[k1][k2]),
+ a[k1][2*k2+1] = Im(X[k1][k2]),
+ 0<=k1<n1, 0<=k2<n2
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ cdft2d(n1, 2*n2, -1, a, ip, w);
+ is
+ cdft2d(n1, 2*n2, 1, a, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
+ a[j1][j2] *= 1.0 / (n1 * n2);
+ }
+ }
+ .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+ [definition]
+ <case1> RDFT
+ R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+ 0<=k1<n1, 0<=k2<n2
+ I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+ 0<=k1<n1, 0<=k2<n2
+ <case2> IRDFT (excluding scale)
+ a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+ (R[j1][j2] *
+ cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) +
+ I[j1][j2] *
+ sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)),
+ 0<=k1<n1, 0<=k2<n2
+ (notes: R[n1-k1][n2-k2] = R[k1][k2],
+ I[n1-k1][n2-k2] = -I[k1][k2],
+ R[n1-k1][0] = R[k1][0],
+ I[n1-k1][0] = -I[k1][0],
+ R[0][n2-k2] = R[0][k2],
+ I[0][n2-k2] = -I[0][k2],
+ 0<k1<n1, 0<k2<n2)
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ rdft2d(n1, n2, 1, a, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ rdft2d(n1, n2, -1, a, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ a[0...n1-1][0...n2-1]
+ :input/output data (double **)
+ <case1>
+ output data
+ a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2],
+ a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2],
+ 0<k1<n1, 0<k2<n2/2,
+ a[0][2*k2] = R[0][k2] = R[0][n2-k2],
+ a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2],
+ 0<k2<n2/2,
+ a[k1][0] = R[k1][0] = R[n1-k1][0],
+ a[k1][1] = I[k1][0] = -I[n1-k1][0],
+ a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2],
+ a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2],
+ 0<k1<n1/2,
+ a[0][0] = R[0][0],
+ a[0][1] = R[0][n2/2],
+ a[n1/2][0] = R[n1/2][0],
+ a[n1/2][1] = R[n1/2][n2/2]
+ <case2>
+ input data
+ a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2],
+ a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2],
+ 0<j1<n1, 0<j2<n2/2,
+ a[0][2*j2] = R[0][j2] = R[0][n2-j2],
+ a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2],
+ 0<j2<n2/2,
+ a[j1][0] = R[j1][0] = R[n1-j1][0],
+ a[j1][1] = I[j1][0] = -I[n1-j1][0],
+ a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2],
+ a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2],
+ 0<j1<n1/2,
+ a[0][0] = R[0][0],
+ a[0][1] = R[0][n2/2],
+ a[n1/2][0] = R[n1/2][0],
+ a[n1/2][1] = R[n1/2][n2/2]
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/4) + n2/4
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ rdft2d(n1, n2, 1, a, ip, w);
+ is
+ rdft2d(n1, n2, -1, a, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] *= 2.0 / (n1 * n2);
+ }
+ }
+ .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+ [definition]
+ <case1> IDCT (excluding scale)
+ C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ cos(pi*j1*(k1+1/2)/n1) *
+ cos(pi*j2*(k2+1/2)/n2),
+ 0<=k1<n1, 0<=k2<n2
+ <case2> DCT
+ C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ cos(pi*(j1+1/2)*k1/n1) *
+ cos(pi*(j2+1/2)*k2/n2),
+ 0<=k1<n1, 0<=k2<n2
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddct2d(n1, n2, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddct2d(n1, n2, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ a[0...n1-1][0...n2-1]
+ :input/output data (double **)
+ output data
+ a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
+ t[0...n1-1][0...n2-1]
+ :work area (double **)
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/4) + max(n1, n2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddct2d(n1, n2, -1, a, t, ip, w);
+ is
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ a[j1][0] *= 0.5;
+ }
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[0][j2] *= 0.5;
+ }
+ ddct2d(n1, n2, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] *= 4.0 / (n1 * n2);
+ }
+ }
+ .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+ [definition]
+ <case1> IDST (excluding scale)
+ S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] *
+ sin(pi*j1*(k1+1/2)/n1) *
+ sin(pi*j2*(k2+1/2)/n2),
+ 0<=k1<n1, 0<=k2<n2
+ <case2> DST
+ S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ sin(pi*(j1+1/2)*k1/n1) *
+ sin(pi*(j2+1/2)*k2/n2),
+ 0<k1<=n1, 0<k2<=n2
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddst2d(n1, n2, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddst2d(n1, n2, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ a[0...n1-1][0...n2-1]
+ :input/output data (double **)
+ <case1>
+ input data
+ a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2,
+ a[j1][0] = A[j1][n2], 0<j1<n1,
+ a[0][j2] = A[n1][j2], 0<j2<n2,
+ a[0][0] = A[n1][n2]
+ (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
+ output data
+ a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
+ <case2>
+ output data
+ a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2,
+ a[k1][0] = S[k1][n2], 0<k1<n1,
+ a[0][k2] = S[n1][k2], 0<k2<n2,
+ a[0][0] = S[n1][n2]
+ (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
+ t[0...n1-1][0...n2-1]
+ :work area (double **)
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/4) + max(n1, n2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddst2d(n1, n2, -1, a, t, ip, w);
+ is
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ a[j1][0] *= 0.5;
+ }
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[0][j2] *= 0.5;
+ }
+ ddst2d(n1, n2, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] *= 4.0 / (n1 * n2);
+ }
+ }
+ .
+*/
+
+
+void cdft2d(int n1, int n2, int isgn, double **a, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void bitrv2col(int n1, int n, int *ip, double **a);
+ void bitrv2row(int n, int n2, int *ip, double **a);
+ void cftbcol(int n1, int n, double **a, double *w);
+ void cftbrow(int n, int n2, double **a, double *w);
+ void cftfcol(int n1, int n, double **a, double *w);
+ void cftfrow(int n, int n2, double **a, double *w);
+ int n;
+
+ n = n1 << 1;
+ if (n < n2) {
+ n = n2;
+ }
+ if (n > (ip[0] << 2)) {
+ makewt(n >> 2, ip, w);
+ }
+ if (n2 > 4) {
+ bitrv2col(n1, n2, ip + 2, a);
+ }
+ if (n1 > 2) {
+ bitrv2row(n1, n2, ip + 2, a);
+ }
+ if (isgn < 0) {
+ cftfcol(n1, n2, a, w);
+ cftfrow(n1, n2, a, w);
+ } else {
+ cftbcol(n1, n2, a, w);
+ cftbrow(n1, n2, a, w);
+ }
+}
+
+
+void rdft2d(int n1, int n2, int isgn, double **a, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void bitrv2col(int n1, int n, int *ip, double **a);
+ void bitrv2row(int n, int n2, int *ip, double **a);
+ void cftbcol(int n1, int n, double **a, double *w);
+ void cftbrow(int n, int n2, double **a, double *w);
+ void cftfcol(int n1, int n, double **a, double *w);
+ void cftfrow(int n, int n2, double **a, double *w);
+ void rftbcol(int n1, int n, double **a, int nc, double *c);
+ void rftfcol(int n1, int n, double **a, int nc, double *c);
+ int n, nw, nc, n1h, i, j;
+ double xi;
+
+ n = n1 << 1;
+ if (n < n2) {
+ n = n2;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n2 > (nc << 2)) {
+ nc = n2 >> 2;
+ makect(nc, ip, w + nw);
+ }
+ n1h = n1 >> 1;
+ if (isgn < 0) {
+ for (i = 1; i <= n1h - 1; i++) {
+ j = n1 - i;
+ xi = a[i][0] - a[j][0];
+ a[i][0] += a[j][0];
+ a[j][0] = xi;
+ xi = a[j][1] - a[i][1];
+ a[i][1] += a[j][1];
+ a[j][1] = xi;
+ }
+ if (n1 > 2) {
+ bitrv2row(n1, n2, ip + 2, a);
+ }
+ cftfrow(n1, n2, a, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ a[i][1] = 0.5 * (a[i][0] - a[i][1]);
+ a[i][0] -= a[i][1];
+ }
+ if (n2 > 4) {
+ rftfcol(n1, n2, a, nc, w + nw);
+ bitrv2col(n1, n2, ip + 2, a);
+ }
+ cftfcol(n1, n2, a, w);
+ } else {
+ if (n2 > 4) {
+ bitrv2col(n1, n2, ip + 2, a);
+ }
+ cftbcol(n1, n2, a, w);
+ if (n2 > 4) {
+ rftbcol(n1, n2, a, nc, w + nw);
+ }
+ for (i = 0; i <= n1 - 1; i++) {
+ xi = a[i][0] - a[i][1];
+ a[i][0] += a[i][1];
+ a[i][1] = xi;
+ }
+ if (n1 > 2) {
+ bitrv2row(n1, n2, ip + 2, a);
+ }
+ cftbrow(n1, n2, a, w);
+ for (i = 1; i <= n1h - 1; i++) {
+ j = n1 - i;
+ a[j][0] = 0.5 * (a[i][0] - a[j][0]);
+ a[i][0] -= a[j][0];
+ a[j][1] = 0.5 * (a[i][1] + a[j][1]);
+ a[i][1] -= a[j][1];
+ }
+ }
+}
+
+
+void ddct2d(int n1, int n2, int isgn, double **a, double **t,
+ int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void bitrv2col(int n1, int n, int *ip, double **a);
+ void bitrv2row(int n, int n2, int *ip, double **a);
+ void cftbcol(int n1, int n, double **a, double *w);
+ void cftbrow(int n, int n2, double **a, double *w);
+ void cftfcol(int n1, int n, double **a, double *w);
+ void cftfrow(int n, int n2, double **a, double *w);
+ void rftbcol(int n1, int n, double **a, int nc, double *c);
+ void rftfcol(int n1, int n, double **a, int nc, double *c);
+ void dctbsub(int n1, int n2, double **a, int nc, double *c);
+ void dctfsub(int n1, int n2, double **a, int nc, double *c);
+ int n, nw, nc, n1h, n2h, i, ix, ic, j, jx, jc;
+ double xi;
+
+ n = n1 << 1;
+ if (n < n2) {
+ n = n2;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n1 > nc || n2 > nc) {
+ if (n1 > n2) {
+ nc = n1;
+ } else {
+ nc = n2;
+ }
+ makect(nc, ip, w + nw);
+ }
+ n1h = n1 >> 1;
+ n2h = n2 >> 1;
+ if (isgn >= 0) {
+ for (i = 0; i <= n1 - 1; i++) {
+ for (j = 1; j <= n2h - 1; j++) {
+ jx = j << 1;
+ t[i][jx] = a[i][j];
+ t[i][jx + 1] = a[i][n2 - j];
+ }
+ }
+ t[0][0] = a[0][0];
+ t[0][1] = a[0][n2h];
+ t[n1h][0] = a[n1h][0];
+ t[n1h][1] = a[n1h][n2h];
+ for (i = 1; i <= n1h - 1; i++) {
+ ic = n1 - i;
+ t[i][0] = a[i][0];
+ t[ic][1] = a[i][n2h];
+ t[i][1] = a[ic][0];
+ t[ic][0] = a[ic][n2h];
+ }
+ dctfsub(n1, n2, t, nc, w + nw);
+ if (n1 > 2) {
+ bitrv2row(n1, n2, ip + 2, t);
+ }
+ cftfrow(n1, n2, t, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ t[i][1] = 0.5 * (t[i][0] - t[i][1]);
+ t[i][0] -= t[i][1];
+ }
+ if (n2 > 4) {
+ rftfcol(n1, n2, t, nc, w + nw);
+ bitrv2col(n1, n2, ip + 2, t);
+ }
+ cftfcol(n1, n2, t, w);
+ for (i = 0; i <= n1h - 1; i++) {
+ ix = i << 1;
+ ic = n1 - 1 - i;
+ for (j = 0; j <= n2h - 1; j++) {
+ jx = j << 1;
+ jc = n2 - 1 - j;
+ a[ix][jx] = t[i][j];
+ a[ix][jx + 1] = t[i][jc];
+ a[ix + 1][jx] = t[ic][j];
+ a[ix + 1][jx + 1] = t[ic][jc];
+ }
+ }
+ } else {
+ for (i = 0; i <= n1h - 1; i++) {
+ ix = i << 1;
+ ic = n1 - 1 - i;
+ for (j = 0; j <= n2h - 1; j++) {
+ jx = j << 1;
+ jc = n2 - 1 - j;
+ t[i][j] = a[ix][jx];
+ t[i][jc] = a[ix][jx + 1];
+ t[ic][j] = a[ix + 1][jx];
+ t[ic][jc] = a[ix + 1][jx + 1];
+ }
+ }
+ if (n2 > 4) {
+ bitrv2col(n1, n2, ip + 2, t);
+ }
+ cftbcol(n1, n2, t, w);
+ if (n2 > 4) {
+ rftbcol(n1, n2, t, nc, w + nw);
+ }
+ for (i = 0; i <= n1 - 1; i++) {
+ xi = t[i][0] - t[i][1];
+ t[i][0] += t[i][1];
+ t[i][1] = xi;
+ }
+ if (n1 > 2) {
+ bitrv2row(n1, n2, ip + 2, t);
+ }
+ cftbrow(n1, n2, t, w);
+ dctbsub(n1, n2, t, nc, w + nw);
+ for (i = 0; i <= n1 - 1; i++) {
+ for (j = 1; j <= n2h - 1; j++) {
+ jx = j << 1;
+ a[i][j] = t[i][jx];
+ a[i][n2 - j] = t[i][jx + 1];
+ }
+ }
+ a[0][0] = t[0][0];
+ a[0][n2h] = t[0][1];
+ a[n1h][0] = t[n1h][0];
+ a[n1h][n2h] = t[n1h][1];
+ for (i = 1; i <= n1h - 1; i++) {
+ ic = n1 - i;
+ a[i][0] = t[i][0];
+ a[i][n2h] = t[ic][1];
+ a[ic][0] = t[i][1];
+ a[ic][n2h] = t[ic][0];
+ }
+ }
+}
+
+
+void ddst2d(int n1, int n2, int isgn, double **a, double **t,
+ int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void bitrv2col(int n1, int n, int *ip, double **a);
+ void bitrv2row(int n, int n2, int *ip, double **a);
+ void cftbcol(int n1, int n, double **a, double *w);
+ void cftbrow(int n, int n2, double **a, double *w);
+ void cftfcol(int n1, int n, double **a, double *w);
+ void cftfrow(int n, int n2, double **a, double *w);
+ void rftbcol(int n1, int n, double **a, int nc, double *c);
+ void rftfcol(int n1, int n, double **a, int nc, double *c);
+ void dstbsub(int n1, int n2, double **a, int nc, double *c);
+ void dstfsub(int n1, int n2, double **a, int nc, double *c);
+ int n, nw, nc, n1h, n2h, i, ix, ic, j, jx, jc;
+ double xi;
+
+ n = n1 << 1;
+ if (n < n2) {
+ n = n2;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n1 > nc || n2 > nc) {
+ if (n1 > n2) {
+ nc = n1;
+ } else {
+ nc = n2;
+ }
+ makect(nc, ip, w + nw);
+ }
+ n1h = n1 >> 1;
+ n2h = n2 >> 1;
+ if (isgn >= 0) {
+ for (i = 0; i <= n1 - 1; i++) {
+ for (j = 1; j <= n2h - 1; j++) {
+ jx = j << 1;
+ t[i][jx] = a[i][j];
+ t[i][jx + 1] = a[i][n2 - j];
+ }
+ }
+ t[0][0] = a[0][0];
+ t[0][1] = a[0][n2h];
+ t[n1h][0] = a[n1h][0];
+ t[n1h][1] = a[n1h][n2h];
+ for (i = 1; i <= n1h - 1; i++) {
+ ic = n1 - i;
+ t[i][0] = a[i][0];
+ t[ic][1] = a[i][n2h];
+ t[i][1] = a[ic][0];
+ t[ic][0] = a[ic][n2h];
+ }
+ dstfsub(n1, n2, t, nc, w + nw);
+ if (n1 > 2) {
+ bitrv2row(n1, n2, ip + 2, t);
+ }
+ cftfrow(n1, n2, t, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ t[i][1] = 0.5 * (t[i][0] - t[i][1]);
+ t[i][0] -= t[i][1];
+ }
+ if (n2 > 4) {
+ rftfcol(n1, n2, t, nc, w + nw);
+ bitrv2col(n1, n2, ip + 2, t);
+ }
+ cftfcol(n1, n2, t, w);
+ for (i = 0; i <= n1h - 1; i++) {
+ ix = i << 1;
+ ic = n1 - 1 - i;
+ for (j = 0; j <= n2h - 1; j++) {
+ jx = j << 1;
+ jc = n2 - 1 - j;
+ a[ix][jx] = t[i][j];
+ a[ix][jx + 1] = -t[i][jc];
+ a[ix + 1][jx] = -t[ic][j];
+ a[ix + 1][jx + 1] = t[ic][jc];
+ }
+ }
+ } else {
+ for (i = 0; i <= n1h - 1; i++) {
+ ix = i << 1;
+ ic = n1 - 1 - i;
+ for (j = 0; j <= n2h - 1; j++) {
+ jx = j << 1;
+ jc = n2 - 1 - j;
+ t[i][j] = a[ix][jx];
+ t[i][jc] = -a[ix][jx + 1];
+ t[ic][j] = -a[ix + 1][jx];
+ t[ic][jc] = a[ix + 1][jx + 1];
+ }
+ }
+ if (n2 > 4) {
+ bitrv2col(n1, n2, ip + 2, t);
+ }
+ cftbcol(n1, n2, t, w);
+ if (n2 > 4) {
+ rftbcol(n1, n2, t, nc, w + nw);
+ }
+ for (i = 0; i <= n1 - 1; i++) {
+ xi = t[i][0] - t[i][1];
+ t[i][0] += t[i][1];
+ t[i][1] = xi;
+ }
+ if (n1 > 2) {
+ bitrv2row(n1, n2, ip + 2, t);
+ }
+ cftbrow(n1, n2, t, w);
+ dstbsub(n1, n2, t, nc, w + nw);
+ for (i = 0; i <= n1 - 1; i++) {
+ for (j = 1; j <= n2h - 1; j++) {
+ jx = j << 1;
+ a[i][j] = t[i][jx];
+ a[i][n2 - j] = t[i][jx + 1];
+ }
+ }
+ a[0][0] = t[0][0];
+ a[0][n2h] = t[0][1];
+ a[n1h][0] = t[n1h][0];
+ a[n1h][n2h] = t[n1h][1];
+ for (i = 1; i <= n1h - 1; i++) {
+ ic = n1 - i;
+ a[i][0] = t[i][0];
+ a[i][n2h] = t[ic][1];
+ a[ic][0] = t[i][1];
+ a[ic][n2h] = t[ic][0];
+ }
+ }
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w)
+{
+ void bitrv2(int n, int *ip, double *a);
+ int nwh, j;
+ double delta, x, y;
+
+ ip[0] = nw;
+ ip[1] = 1;
+ if (nw > 2) {
+ nwh = nw >> 1;
+ delta = atan(1.0) / nwh;
+ w[0] = 1;
+ w[1] = 0;
+ w[nwh] = cos(delta * nwh);
+ w[nwh + 1] = w[nwh];
+ for (j = 2; j <= nwh - 2; j += 2) {
+ x = cos(delta * j);
+ y = sin(delta * j);
+ w[j] = x;
+ w[j + 1] = y;
+ w[nw - j] = y;
+ w[nw - j + 1] = x;
+ }
+ bitrv2(nw, ip + 2, w);
+ }
+}
+
+
+void makect(int nc, int *ip, double *c)
+{
+ int nch, j;
+ double delta;
+
+ ip[1] = nc;
+ if (nc > 1) {
+ nch = nc >> 1;
+ delta = atan(1.0) / nch;
+ c[0] = 0.5;
+ c[nch] = 0.5 * cos(delta * nch);
+ for (j = 1; j <= nch - 1; j++) {
+ c[j] = 0.5 * cos(delta * j);
+ c[nc - j] = 0.5 * sin(delta * j);
+ }
+ }
+}
+
+
+/* -------- child routines -------- */
+
+
+void bitrv2(int n, int *ip, double *a)
+{
+ int j, j1, k, k1, l, m, m2;
+ double xr, xi;
+
+ ip[0] = 0;
+ l = n;
+ m = 1;
+ while ((m << 2) < l) {
+ l >>= 1;
+ for (j = 0; j <= m - 1; j++) {
+ ip[m + j] = ip[j] + l;
+ }
+ m <<= 1;
+ }
+ if ((m << 2) > l) {
+ for (k = 1; k <= m - 1; k++) {
+ for (j = 0; j <= k - 1; j++) {
+ j1 = (j << 1) + ip[k];
+ k1 = (k << 1) + ip[j];
+ xr = a[j1];
+ xi = a[j1 + 1];
+ a[j1] = a[k1];
+ a[j1 + 1] = a[k1 + 1];
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ }
+ } else {
+ m2 = m << 1;
+ for (k = 1; k <= m - 1; k++) {
+ for (j = 0; j <= k - 1; j++) {
+ j1 = (j << 1) + ip[k];
+ k1 = (k << 1) + ip[j];
+ xr = a[j1];
+ xi = a[j1 + 1];
+ a[j1] = a[k1];
+ a[j1 + 1] = a[k1 + 1];
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += m2;
+ k1 += m2;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ a[j1] = a[k1];
+ a[j1 + 1] = a[k1 + 1];
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ }
+ }
+}
+
+
+void bitrv2col(int n1, int n, int *ip, double **a)
+{
+ int i, j, j1, k, k1, l, m, m2;
+ double xr, xi;
+
+ ip[0] = 0;
+ l = n;
+ m = 1;
+ while ((m << 2) < l) {
+ l >>= 1;
+ for (j = 0; j <= m - 1; j++) {
+ ip[m + j] = ip[j] + l;
+ }
+ m <<= 1;
+ }
+ if ((m << 2) > l) {
+ for (i = 0; i <= n1 - 1; i++) {
+ for (k = 1; k <= m - 1; k++) {
+ for (j = 0; j <= k - 1; j++) {
+ j1 = (j << 1) + ip[k];
+ k1 = (k << 1) + ip[j];
+ xr = a[i][j1];
+ xi = a[i][j1 + 1];
+ a[i][j1] = a[i][k1];
+ a[i][j1 + 1] = a[i][k1 + 1];
+ a[i][k1] = xr;
+ a[i][k1 + 1] = xi;
+ }
+ }
+ }
+ } else {
+ m2 = m << 1;
+ for (i = 0; i <= n1 - 1; i++) {
+ for (k = 1; k <= m - 1; k++) {
+ for (j = 0; j <= k - 1; j++) {
+ j1 = (j << 1) + ip[k];
+ k1 = (k << 1) + ip[j];
+ xr = a[i][j1];
+ xi = a[i][j1 + 1];
+ a[i][j1] = a[i][k1];
+ a[i][j1 + 1] = a[i][k1 + 1];
+ a[i][k1] = xr;
+ a[i][k1 + 1] = xi;
+ j1 += m2;
+ k1 += m2;
+ xr = a[i][j1];
+ xi = a[i][j1 + 1];
+ a[i][j1] = a[i][k1];
+ a[i][j1 + 1] = a[i][k1 + 1];
+ a[i][k1] = xr;
+ a[i][k1 + 1] = xi;
+ }
+ }
+ }
+ }
+}
+
+
+void bitrv2row(int n, int n2, int *ip, double **a)
+{
+ int i, j, j1, k, k1, l, m;
+ double xr, xi;
+
+ ip[0] = 0;
+ l = n;
+ m = 1;
+ while ((m << 1) < l) {
+ l >>= 1;
+ for (j = 0; j <= m - 1; j++) {
+ ip[m + j] = ip[j] + l;
+ }
+ m <<= 1;
+ }
+ if ((m << 1) > l) {
+ for (k = 1; k <= m - 1; k++) {
+ for (j = 0; j <= k - 1; j++) {
+ j1 = j + ip[k];
+ k1 = k + ip[j];
+ for (i = 0; i <= n2 - 2; i += 2) {
+ xr = a[j1][i];
+ xi = a[j1][i + 1];
+ a[j1][i] = a[k1][i];
+ a[j1][i + 1] = a[k1][i + 1];
+ a[k1][i] = xr;
+ a[k1][i + 1] = xi;
+ }
+ }
+ }
+ } else {
+ for (k = 1; k <= m - 1; k++) {
+ for (j = 0; j <= k - 1; j++) {
+ j1 = j + ip[k];
+ k1 = k + ip[j];
+ for (i = 0; i <= n2 - 2; i += 2) {
+ xr = a[j1][i];
+ xi = a[j1][i + 1];
+ a[j1][i] = a[k1][i];
+ a[j1][i + 1] = a[k1][i + 1];
+ a[k1][i] = xr;
+ a[k1][i + 1] = xi;
+ }
+ j1 += m;
+ k1 += m;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ xr = a[j1][i];
+ xi = a[j1][i + 1];
+ a[j1][i] = a[k1][i];
+ a[j1][i + 1] = a[k1][i + 1];
+ a[k1][i] = xr;
+ a[k1][i + 1] = xi;
+ }
+ }
+ }
+ }
+}
+
+
+void cftbcol(int n1, int n, double **a, double *w)
+{
+ int i, j, j1, j2, j3, k, k1, ks, l, m;
+ double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+ for (i = 0; i <= n1 - 1; i++) {
+ l = 2;
+ while ((l << 1) < n) {
+ m = l << 2;
+ for (j = 0; j <= l - 2; j += 2) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ x0r = a[i][j] + a[i][j1];
+ x0i = a[i][j + 1] + a[i][j1 + 1];
+ x1r = a[i][j] - a[i][j1];
+ x1i = a[i][j + 1] - a[i][j1 + 1];
+ x2r = a[i][j2] + a[i][j3];
+ x2i = a[i][j2 + 1] + a[i][j3 + 1];
+ x3r = a[i][j2] - a[i][j3];
+ x3i = a[i][j2 + 1] - a[i][j3 + 1];
+ a[i][j] = x0r + x2r;
+ a[i][j + 1] = x0i + x2i;
+ a[i][j2] = x0r - x2r;
+ a[i][j2 + 1] = x0i - x2i;
+ a[i][j1] = x1r - x3i;
+ a[i][j1 + 1] = x1i + x3r;
+ a[i][j3] = x1r + x3i;
+ a[i][j3 + 1] = x1i - x3r;
+ }
+ if (m < n) {
+ wk1r = w[2];
+ for (j = m; j <= l + m - 2; j += 2) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ x0r = a[i][j] + a[i][j1];
+ x0i = a[i][j + 1] + a[i][j1 + 1];
+ x1r = a[i][j] - a[i][j1];
+ x1i = a[i][j + 1] - a[i][j1 + 1];
+ x2r = a[i][j2] + a[i][j3];
+ x2i = a[i][j2 + 1] + a[i][j3 + 1];
+ x3r = a[i][j2] - a[i][j3];
+ x3i = a[i][j2 + 1] - a[i][j3 + 1];
+ a[i][j] = x0r + x2r;
+ a[i][j + 1] = x0i + x2i;
+ a[i][j2] = x2i - x0i;
+ a[i][j2 + 1] = x0r - x2r;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[i][j1] = wk1r * (x0r - x0i);
+ a[i][j1 + 1] = wk1r * (x0r + x0i);
+ x0r = x3i + x1r;
+ x0i = x3r - x1i;
+ a[i][j3] = wk1r * (x0i - x0r);
+ a[i][j3 + 1] = wk1r * (x0i + x0r);
+ }
+ k1 = 1;
+ ks = -1;
+ for (k = (m << 1); k <= n - m; k += m) {
+ k1++;
+ ks = -ks;
+ wk1r = w[k1 << 1];
+ wk1i = w[(k1 << 1) + 1];
+ wk2r = ks * w[k1];
+ wk2i = w[k1 + ks];
+ wk3r = wk1r - 2 * wk2i * wk1i;
+ wk3i = 2 * wk2i * wk1r - wk1i;
+ for (j = k; j <= l + k - 2; j += 2) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ x0r = a[i][j] + a[i][j1];
+ x0i = a[i][j + 1] + a[i][j1 + 1];
+ x1r = a[i][j] - a[i][j1];
+ x1i = a[i][j + 1] - a[i][j1 + 1];
+ x2r = a[i][j2] + a[i][j3];
+ x2i = a[i][j2 + 1] + a[i][j3 + 1];
+ x3r = a[i][j2] - a[i][j3];
+ x3i = a[i][j2 + 1] - a[i][j3 + 1];
+ a[i][j] = x0r + x2r;
+ a[i][j + 1] = x0i + x2i;
+ x0r -= x2r;
+ x0i -= x2i;
+ a[i][j2] = wk2r * x0r - wk2i * x0i;
+ a[i][j2 + 1] = wk2r * x0i + wk2i * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[i][j1] = wk1r * x0r - wk1i * x0i;
+ a[i][j1 + 1] = wk1r * x0i + wk1i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[i][j3] = wk3r * x0r - wk3i * x0i;
+ a[i][j3 + 1] = wk3r * x0i + wk3i * x0r;
+ }
+ }
+ }
+ l = m;
+ }
+ if (l < n) {
+ for (j = 0; j <= l - 2; j += 2) {
+ j1 = j + l;
+ x0r = a[i][j] - a[i][j1];
+ x0i = a[i][j + 1] - a[i][j1 + 1];
+ a[i][j] += a[i][j1];
+ a[i][j + 1] += a[i][j1 + 1];
+ a[i][j1] = x0r;
+ a[i][j1 + 1] = x0i;
+ }
+ }
+ }
+}
+
+
+void cftbrow(int n, int n2, double **a, double *w)
+{
+ int i, j, j1, j2, j3, k, k1, ks, l, m;
+ double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+ l = 1;
+ while ((l << 1) < n) {
+ m = l << 2;
+ for (j = 0; j <= l - 1; j++) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] + a[j1][i];
+ x0i = a[j][i + 1] + a[j1][i + 1];
+ x1r = a[j][i] - a[j1][i];
+ x1i = a[j][i + 1] - a[j1][i + 1];
+ x2r = a[j2][i] + a[j3][i];
+ x2i = a[j2][i + 1] + a[j3][i + 1];
+ x3r = a[j2][i] - a[j3][i];
+ x3i = a[j2][i + 1] - a[j3][i + 1];
+ a[j][i] = x0r + x2r;
+ a[j][i + 1] = x0i + x2i;
+ a[j2][i] = x0r - x2r;
+ a[j2][i + 1] = x0i - x2i;
+ a[j1][i] = x1r - x3i;
+ a[j1][i + 1] = x1i + x3r;
+ a[j3][i] = x1r + x3i;
+ a[j3][i + 1] = x1i - x3r;
+ }
+ }
+ if (m < n) {
+ wk1r = w[2];
+ for (j = m; j <= l + m - 1; j++) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] + a[j1][i];
+ x0i = a[j][i + 1] + a[j1][i + 1];
+ x1r = a[j][i] - a[j1][i];
+ x1i = a[j][i + 1] - a[j1][i + 1];
+ x2r = a[j2][i] + a[j3][i];
+ x2i = a[j2][i + 1] + a[j3][i + 1];
+ x3r = a[j2][i] - a[j3][i];
+ x3i = a[j2][i + 1] - a[j3][i + 1];
+ a[j][i] = x0r + x2r;
+ a[j][i + 1] = x0i + x2i;
+ a[j2][i] = x2i - x0i;
+ a[j2][i + 1] = x0r - x2r;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j1][i] = wk1r * (x0r - x0i);
+ a[j1][i + 1] = wk1r * (x0r + x0i);
+ x0r = x3i + x1r;
+ x0i = x3r - x1i;
+ a[j3][i] = wk1r * (x0i - x0r);
+ a[j3][i + 1] = wk1r * (x0i + x0r);
+ }
+ }
+ k1 = 1;
+ ks = -1;
+ for (k = (m << 1); k <= n - m; k += m) {
+ k1++;
+ ks = -ks;
+ wk1r = w[k1 << 1];
+ wk1i = w[(k1 << 1) + 1];
+ wk2r = ks * w[k1];
+ wk2i = w[k1 + ks];
+ wk3r = wk1r - 2 * wk2i * wk1i;
+ wk3i = 2 * wk2i * wk1r - wk1i;
+ for (j = k; j <= l + k - 1; j++) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] + a[j1][i];
+ x0i = a[j][i + 1] + a[j1][i + 1];
+ x1r = a[j][i] - a[j1][i];
+ x1i = a[j][i + 1] - a[j1][i + 1];
+ x2r = a[j2][i] + a[j3][i];
+ x2i = a[j2][i + 1] + a[j3][i + 1];
+ x3r = a[j2][i] - a[j3][i];
+ x3i = a[j2][i + 1] - a[j3][i + 1];
+ a[j][i] = x0r + x2r;
+ a[j][i + 1] = x0i + x2i;
+ x0r -= x2r;
+ x0i -= x2i;
+ a[j2][i] = wk2r * x0r - wk2i * x0i;
+ a[j2][i + 1] = wk2r * x0i + wk2i * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j1][i] = wk1r * x0r - wk1i * x0i;
+ a[j1][i + 1] = wk1r * x0i + wk1i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3][i] = wk3r * x0r - wk3i * x0i;
+ a[j3][i + 1] = wk3r * x0i + wk3i * x0r;
+ }
+ }
+ }
+ }
+ l = m;
+ }
+ if (l < n) {
+ for (j = 0; j <= l - 1; j++) {
+ j1 = j + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] - a[j1][i];
+ x0i = a[j][i + 1] - a[j1][i + 1];
+ a[j][i] += a[j1][i];
+ a[j][i + 1] += a[j1][i + 1];
+ a[j1][i] = x0r;
+ a[j1][i + 1] = x0i;
+ }
+ }
+ }
+}
+
+
+void cftfcol(int n1, int n, double **a, double *w)
+{
+ int i, j, j1, j2, j3, k, k1, ks, l, m;
+ double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+ for (i = 0; i <= n1 - 1; i++) {
+ l = 2;
+ while ((l << 1) < n) {
+ m = l << 2;
+ for (j = 0; j <= l - 2; j += 2) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ x0r = a[i][j] + a[i][j1];
+ x0i = a[i][j + 1] + a[i][j1 + 1];
+ x1r = a[i][j] - a[i][j1];
+ x1i = a[i][j + 1] - a[i][j1 + 1];
+ x2r = a[i][j2] + a[i][j3];
+ x2i = a[i][j2 + 1] + a[i][j3 + 1];
+ x3r = a[i][j2] - a[i][j3];
+ x3i = a[i][j2 + 1] - a[i][j3 + 1];
+ a[i][j] = x0r + x2r;
+ a[i][j + 1] = x0i + x2i;
+ a[i][j2] = x0r - x2r;
+ a[i][j2 + 1] = x0i - x2i;
+ a[i][j1] = x1r + x3i;
+ a[i][j1 + 1] = x1i - x3r;
+ a[i][j3] = x1r - x3i;
+ a[i][j3 + 1] = x1i + x3r;
+ }
+ if (m < n) {
+ wk1r = w[2];
+ for (j = m; j <= l + m - 2; j += 2) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ x0r = a[i][j] + a[i][j1];
+ x0i = a[i][j + 1] + a[i][j1 + 1];
+ x1r = a[i][j] - a[i][j1];
+ x1i = a[i][j + 1] - a[i][j1 + 1];
+ x2r = a[i][j2] + a[i][j3];
+ x2i = a[i][j2 + 1] + a[i][j3 + 1];
+ x3r = a[i][j2] - a[i][j3];
+ x3i = a[i][j2 + 1] - a[i][j3 + 1];
+ a[i][j] = x0r + x2r;
+ a[i][j + 1] = x0i + x2i;
+ a[i][j2] = x0i - x2i;
+ a[i][j2 + 1] = x2r - x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[i][j1] = wk1r * (x0i + x0r);
+ a[i][j1 + 1] = wk1r * (x0i - x0r);
+ x0r = x3i - x1r;
+ x0i = x3r + x1i;
+ a[i][j3] = wk1r * (x0r + x0i);
+ a[i][j3 + 1] = wk1r * (x0r - x0i);
+ }
+ k1 = 1;
+ ks = -1;
+ for (k = (m << 1); k <= n - m; k += m) {
+ k1++;
+ ks = -ks;
+ wk1r = w[k1 << 1];
+ wk1i = w[(k1 << 1) + 1];
+ wk2r = ks * w[k1];
+ wk2i = w[k1 + ks];
+ wk3r = wk1r - 2 * wk2i * wk1i;
+ wk3i = 2 * wk2i * wk1r - wk1i;
+ for (j = k; j <= l + k - 2; j += 2) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ x0r = a[i][j] + a[i][j1];
+ x0i = a[i][j + 1] + a[i][j1 + 1];
+ x1r = a[i][j] - a[i][j1];
+ x1i = a[i][j + 1] - a[i][j1 + 1];
+ x2r = a[i][j2] + a[i][j3];
+ x2i = a[i][j2 + 1] + a[i][j3 + 1];
+ x3r = a[i][j2] - a[i][j3];
+ x3i = a[i][j2 + 1] - a[i][j3 + 1];
+ a[i][j] = x0r + x2r;
+ a[i][j + 1] = x0i + x2i;
+ x0r -= x2r;
+ x0i -= x2i;
+ a[i][j2] = wk2r * x0r + wk2i * x0i;
+ a[i][j2 + 1] = wk2r * x0i - wk2i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[i][j1] = wk1r * x0r + wk1i * x0i;
+ a[i][j1 + 1] = wk1r * x0i - wk1i * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[i][j3] = wk3r * x0r + wk3i * x0i;
+ a[i][j3 + 1] = wk3r * x0i - wk3i * x0r;
+ }
+ }
+ }
+ l = m;
+ }
+ if (l < n) {
+ for (j = 0; j <= l - 2; j += 2) {
+ j1 = j + l;
+ x0r = a[i][j] - a[i][j1];
+ x0i = a[i][j + 1] - a[i][j1 + 1];
+ a[i][j] += a[i][j1];
+ a[i][j + 1] += a[i][j1 + 1];
+ a[i][j1] = x0r;
+ a[i][j1 + 1] = x0i;
+ }
+ }
+ }
+}
+
+
+void cftfrow(int n, int n2, double **a, double *w)
+{
+ int i, j, j1, j2, j3, k, k1, ks, l, m;
+ double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+ l = 1;
+ while ((l << 1) < n) {
+ m = l << 2;
+ for (j = 0; j <= l - 1; j++) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] + a[j1][i];
+ x0i = a[j][i + 1] + a[j1][i + 1];
+ x1r = a[j][i] - a[j1][i];
+ x1i = a[j][i + 1] - a[j1][i + 1];
+ x2r = a[j2][i] + a[j3][i];
+ x2i = a[j2][i + 1] + a[j3][i + 1];
+ x3r = a[j2][i] - a[j3][i];
+ x3i = a[j2][i + 1] - a[j3][i + 1];
+ a[j][i] = x0r + x2r;
+ a[j][i + 1] = x0i + x2i;
+ a[j2][i] = x0r - x2r;
+ a[j2][i + 1] = x0i - x2i;
+ a[j1][i] = x1r + x3i;
+ a[j1][i + 1] = x1i - x3r;
+ a[j3][i] = x1r - x3i;
+ a[j3][i + 1] = x1i + x3r;
+ }
+ }
+ if (m < n) {
+ wk1r = w[2];
+ for (j = m; j <= l + m - 1; j++) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] + a[j1][i];
+ x0i = a[j][i + 1] + a[j1][i + 1];
+ x1r = a[j][i] - a[j1][i];
+ x1i = a[j][i + 1] - a[j1][i + 1];
+ x2r = a[j2][i] + a[j3][i];
+ x2i = a[j2][i + 1] + a[j3][i + 1];
+ x3r = a[j2][i] - a[j3][i];
+ x3i = a[j2][i + 1] - a[j3][i + 1];
+ a[j][i] = x0r + x2r;
+ a[j][i + 1] = x0i + x2i;
+ a[j2][i] = x0i - x2i;
+ a[j2][i + 1] = x2r - x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j1][i] = wk1r * (x0i + x0r);
+ a[j1][i + 1] = wk1r * (x0i - x0r);
+ x0r = x3i - x1r;
+ x0i = x3r + x1i;
+ a[j3][i] = wk1r * (x0r + x0i);
+ a[j3][i + 1] = wk1r * (x0r - x0i);
+ }
+ }
+ k1 = 1;
+ ks = -1;
+ for (k = (m << 1); k <= n - m; k += m) {
+ k1++;
+ ks = -ks;
+ wk1r = w[k1 << 1];
+ wk1i = w[(k1 << 1) + 1];
+ wk2r = ks * w[k1];
+ wk2i = w[k1 + ks];
+ wk3r = wk1r - 2 * wk2i * wk1i;
+ wk3i = 2 * wk2i * wk1r - wk1i;
+ for (j = k; j <= l + k - 1; j++) {
+ j1 = j + l;
+ j2 = j1 + l;
+ j3 = j2 + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] + a[j1][i];
+ x0i = a[j][i + 1] + a[j1][i + 1];
+ x1r = a[j][i] - a[j1][i];
+ x1i = a[j][i + 1] - a[j1][i + 1];
+ x2r = a[j2][i] + a[j3][i];
+ x2i = a[j2][i + 1] + a[j3][i + 1];
+ x3r = a[j2][i] - a[j3][i];
+ x3i = a[j2][i + 1] - a[j3][i + 1];
+ a[j][i] = x0r + x2r;
+ a[j][i + 1] = x0i + x2i;
+ x0r -= x2r;
+ x0i -= x2i;
+ a[j2][i] = wk2r * x0r + wk2i * x0i;
+ a[j2][i + 1] = wk2r * x0i - wk2i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j1][i] = wk1r * x0r + wk1i * x0i;
+ a[j1][i + 1] = wk1r * x0i - wk1i * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j3][i] = wk3r * x0r + wk3i * x0i;
+ a[j3][i + 1] = wk3r * x0i - wk3i * x0r;
+ }
+ }
+ }
+ }
+ l = m;
+ }
+ if (l < n) {
+ for (j = 0; j <= l - 1; j++) {
+ j1 = j + l;
+ for (i = 0; i <= n2 - 2; i += 2) {
+ x0r = a[j][i] - a[j1][i];
+ x0i = a[j][i + 1] - a[j1][i + 1];
+ a[j][i] += a[j1][i];
+ a[j][i + 1] += a[j1][i + 1];
+ a[j1][i] = x0r;
+ a[j1][i + 1] = x0i;
+ }
+ }
+ }
+}
+
+
+void rftbcol(int n1, int n, double **a, int nc, double *c)
+{
+ int i, j, k, kk, ks;
+ double wkr, wki, xr, xi, yr, yi;
+
+ ks = (nc << 2) / n;
+ for (i = 0; i <= n1 - 1; i++) {
+ kk = 0;
+ for (k = (n >> 1) - 2; k >= 2; k -= 2) {
+ j = n - k;
+ kk += ks;
+ wkr = 0.5 - c[kk];
+ wki = c[nc - kk];
+ xr = a[i][k] - a[i][j];
+ xi = a[i][k + 1] + a[i][j + 1];
+ yr = wkr * xr - wki * xi;
+ yi = wkr * xi + wki * xr;
+ a[i][k] -= yr;
+ a[i][k + 1] -= yi;
+ a[i][j] += yr;
+ a[i][j + 1] -= yi;
+ }
+ }
+}
+
+
+void rftfcol(int n1, int n, double **a, int nc, double *c)
+{
+ int i, j, k, kk, ks;
+ double wkr, wki, xr, xi, yr, yi;
+
+ ks = (nc << 2) / n;
+ for (i = 0; i <= n1 - 1; i++) {
+ kk = 0;
+ for (k = (n >> 1) - 2; k >= 2; k -= 2) {
+ j = n - k;
+ kk += ks;
+ wkr = 0.5 - c[kk];
+ wki = c[nc - kk];
+ xr = a[i][k] - a[i][j];
+ xi = a[i][k + 1] + a[i][j + 1];
+ yr = wkr * xr + wki * xi;
+ yi = wkr * xi - wki * xr;
+ a[i][k] -= yr;
+ a[i][k + 1] -= yi;
+ a[i][j] += yr;
+ a[i][j + 1] -= yi;
+ }
+ }
+}
+
+
+void dctbsub(int n1, int n2, double **a, int nc, double *c)
+{
+ int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+ double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+
+ ks1 = nc / n1;
+ ks2 = nc / n2;
+ n1h = n1 >> 1;
+ kk1 = ks1;
+ for (k1 = 1; k1 <= n1h - 1; k1++) {
+ j1 = n1 - k1;
+ w1r = 2 * c[kk1];
+ w1i = 2 * c[nc - kk1];
+ kk1 += ks1;
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ x0r = w1r * c[kk2];
+ x0i = w1i * c[kk2];
+ x1r = w1r * c[nc - kk2];
+ x1i = w1i * c[nc - kk2];
+ wkr = x0r - x1i;
+ wki = x0i + x1r;
+ wji = x0r + x1i;
+ wjr = x0i - x1r;
+ kk2 += ks2;
+ x0r = wkr * a[k1][k2] - wki * a[k1][k2 + 1];
+ x0i = wkr * a[k1][k2 + 1] + wki * a[k1][k2];
+ x1r = wjr * a[j1][k2] - wji * a[j1][k2 + 1];
+ x1i = wjr * a[j1][k2 + 1] + wji * a[j1][k2];
+ a[k1][k2] = x0r + x1i;
+ a[k1][k2 + 1] = x0i - x1r;
+ a[j1][k2] = x1r + x0i;
+ a[j1][k2 + 1] = x1i - x0r;
+ }
+ wkr = w1r * 0.5;
+ wki = w1i * 0.5;
+ wjr = w1r * c[kk2];
+ wji = w1i * c[kk2];
+ x0r = a[k1][0] + a[j1][0];
+ x0i = a[k1][1] - a[j1][1];
+ x1r = a[k1][0] - a[j1][0];
+ x1i = a[k1][1] + a[j1][1];
+ a[k1][0] = wkr * x0r - wki * x0i;
+ a[k1][1] = wkr * x0i + wki * x0r;
+ a[j1][0] = -wjr * x1r + wji * x1i;
+ a[j1][1] = wjr * x1i + wji * x1r;
+ }
+ w1r = 2 * c[kk1];
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ wkr = 2 * c[kk2];
+ wki = 2 * c[nc - kk2];
+ wjr = w1r * wkr;
+ wji = w1r * wki;
+ kk2 += ks2;
+ x0i = wkr * a[0][k2 + 1] + wki * a[0][k2];
+ a[0][k2] = wkr * a[0][k2] - wki * a[0][k2 + 1];
+ a[0][k2 + 1] = x0i;
+ x0i = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
+ a[n1h][k2] = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
+ a[n1h][k2 + 1] = x0i;
+ }
+ a[0][1] *= w1r;
+ a[n1h][0] *= w1r;
+ a[n1h][1] *= 0.5;
+}
+
+
+void dctfsub(int n1, int n2, double **a, int nc, double *c)
+{
+ int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+ double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+
+ ks1 = nc / n1;
+ ks2 = nc / n2;
+ n1h = n1 >> 1;
+ kk1 = ks1;
+ for (k1 = 1; k1 <= n1h - 1; k1++) {
+ j1 = n1 - k1;
+ w1r = 2 * c[kk1];
+ w1i = 2 * c[nc - kk1];
+ kk1 += ks1;
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ x0r = w1r * c[kk2];
+ x0i = w1i * c[kk2];
+ x1r = w1r * c[nc - kk2];
+ x1i = w1i * c[nc - kk2];
+ wkr = x0r - x1i;
+ wki = x0i + x1r;
+ wji = x0r + x1i;
+ wjr = x0i - x1r;
+ kk2 += ks2;
+ x0r = a[k1][k2] - a[j1][k2 + 1];
+ x0i = a[j1][k2] + a[k1][k2 + 1];
+ x1r = a[j1][k2] - a[k1][k2 + 1];
+ x1i = a[k1][k2] + a[j1][k2 + 1];
+ a[k1][k2] = wkr * x0r + wki * x0i;
+ a[k1][k2 + 1] = wkr * x0i - wki * x0r;
+ a[j1][k2] = wjr * x1r + wji * x1i;
+ a[j1][k2 + 1] = wjr * x1i - wji * x1r;
+ }
+ x0r = 2 * c[kk2];
+ wjr = x0r * w1r;
+ wji = x0r * w1i;
+ x0r = w1r * a[k1][0] + w1i * a[k1][1];
+ x0i = w1r * a[k1][1] - w1i * a[k1][0];
+ x1r = -wjr * a[j1][0] + wji * a[j1][1];
+ x1i = wjr * a[j1][1] + wji * a[j1][0];
+ a[k1][0] = x0r + x1r;
+ a[k1][1] = x1i + x0i;
+ a[j1][0] = x0r - x1r;
+ a[j1][1] = x1i - x0i;
+ }
+ w1r = 2 * c[kk1];
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ wkr = 2 * c[kk2];
+ wki = 2 * c[nc - kk2];
+ wjr = w1r * wkr;
+ wji = w1r * wki;
+ kk2 += ks2;
+ x0i = wkr * a[0][k2 + 1] - wki * a[0][k2];
+ a[0][k2] = wkr * a[0][k2] + wki * a[0][k2 + 1];
+ a[0][k2 + 1] = x0i;
+ x0i = wjr * a[n1h][k2 + 1] - wji * a[n1h][k2];
+ a[n1h][k2] = wjr * a[n1h][k2] + wji * a[n1h][k2 + 1];
+ a[n1h][k2 + 1] = x0i;
+ }
+ w1r *= 2;
+ a[0][0] *= 2;
+ a[0][1] *= w1r;
+ a[n1h][0] *= w1r;
+}
+
+
+void dstbsub(int n1, int n2, double **a, int nc, double *c)
+{
+ int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+ double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+
+ ks1 = nc / n1;
+ ks2 = nc / n2;
+ n1h = n1 >> 1;
+ kk1 = ks1;
+ for (k1 = 1; k1 <= n1h - 1; k1++) {
+ j1 = n1 - k1;
+ w1r = 2 * c[kk1];
+ w1i = 2 * c[nc - kk1];
+ kk1 += ks1;
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ x0r = w1r * c[kk2];
+ x0i = w1i * c[kk2];
+ x1r = w1r * c[nc - kk2];
+ x1i = w1i * c[nc - kk2];
+ wkr = x0r - x1i;
+ wki = x0i + x1r;
+ wji = x0r + x1i;
+ wjr = x0i - x1r;
+ kk2 += ks2;
+ x0r = wkr * a[k1][k2] - wki * a[k1][k2 + 1];
+ x0i = wkr * a[k1][k2 + 1] + wki * a[k1][k2];
+ x1r = wjr * a[j1][k2] - wji * a[j1][k2 + 1];
+ x1i = wjr * a[j1][k2 + 1] + wji * a[j1][k2];
+ a[k1][k2] = x1i - x0r;
+ a[k1][k2 + 1] = x1r + x0i;
+ a[j1][k2] = x0i - x1r;
+ a[j1][k2 + 1] = x0r + x1i;
+ }
+ wkr = w1r * 0.5;
+ wki = w1i * 0.5;
+ wjr = w1r * c[kk2];
+ wji = w1i * c[kk2];
+ x0r = a[k1][0] + a[j1][0];
+ x0i = a[k1][1] - a[j1][1];
+ x1r = a[k1][0] - a[j1][0];
+ x1i = a[k1][1] + a[j1][1];
+ a[k1][1] = wkr * x0r - wki * x0i;
+ a[k1][0] = wkr * x0i + wki * x0r;
+ a[j1][1] = -wjr * x1r + wji * x1i;
+ a[j1][0] = wjr * x1i + wji * x1r;
+ }
+ w1r = 2 * c[kk1];
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ wkr = 2 * c[kk2];
+ wki = 2 * c[nc - kk2];
+ wjr = w1r * wkr;
+ wji = w1r * wki;
+ kk2 += ks2;
+ x0i = wkr * a[0][k2 + 1] + wki * a[0][k2];
+ a[0][k2 + 1] = wkr * a[0][k2] - wki * a[0][k2 + 1];
+ a[0][k2] = x0i;
+ x0i = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
+ a[n1h][k2 + 1] = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
+ a[n1h][k2] = x0i;
+ }
+ a[0][1] *= w1r;
+ a[n1h][0] *= w1r;
+ a[n1h][1] *= 0.5;
+}
+
+
+void dstfsub(int n1, int n2, double **a, int nc, double *c)
+{
+ int kk1, kk2, ks1, ks2, n1h, j1, k1, k2;
+ double w1r, w1i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i;
+
+ ks1 = nc / n1;
+ ks2 = nc / n2;
+ n1h = n1 >> 1;
+ kk1 = ks1;
+ for (k1 = 1; k1 <= n1h - 1; k1++) {
+ j1 = n1 - k1;
+ w1r = 2 * c[kk1];
+ w1i = 2 * c[nc - kk1];
+ kk1 += ks1;
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ x0r = w1r * c[kk2];
+ x0i = w1i * c[kk2];
+ x1r = w1r * c[nc - kk2];
+ x1i = w1i * c[nc - kk2];
+ wkr = x0r - x1i;
+ wki = x0i + x1r;
+ wji = x0r + x1i;
+ wjr = x0i - x1r;
+ kk2 += ks2;
+ x0r = a[j1][k2 + 1] - a[k1][k2];
+ x0i = a[k1][k2 + 1] + a[j1][k2];
+ x1r = a[k1][k2 + 1] - a[j1][k2];
+ x1i = a[j1][k2 + 1] + a[k1][k2];
+ a[k1][k2] = wkr * x0r + wki * x0i;
+ a[k1][k2 + 1] = wkr * x0i - wki * x0r;
+ a[j1][k2] = wjr * x1r + wji * x1i;
+ a[j1][k2 + 1] = wjr * x1i - wji * x1r;
+ }
+ x0r = 2 * c[kk2];
+ wjr = x0r * w1r;
+ wji = x0r * w1i;
+ x0r = w1r * a[k1][1] + w1i * a[k1][0];
+ x0i = w1r * a[k1][0] - w1i * a[k1][1];
+ x1r = -wjr * a[j1][1] + wji * a[j1][0];
+ x1i = wjr * a[j1][0] + wji * a[j1][1];
+ a[k1][0] = x0r + x1r;
+ a[k1][1] = x1i + x0i;
+ a[j1][0] = x0r - x1r;
+ a[j1][1] = x1i - x0i;
+ }
+ w1r = 2 * c[kk1];
+ kk2 = ks2;
+ for (k2 = 2; k2 <= n2 - 2; k2 += 2) {
+ wkr = 2 * c[kk2];
+ wki = 2 * c[nc - kk2];
+ wjr = w1r * wkr;
+ wji = w1r * wki;
+ kk2 += ks2;
+ x0i = wkr * a[0][k2] - wki * a[0][k2 + 1];
+ a[0][k2] = wkr * a[0][k2 + 1] + wki * a[0][k2];
+ a[0][k2 + 1] = x0i;
+ x0i = wjr * a[n1h][k2] - wji * a[n1h][k2 + 1];
+ a[n1h][k2] = wjr * a[n1h][k2 + 1] + wji * a[n1h][k2];
+ a[n1h][k2 + 1] = x0i;
+ }
+ w1r *= 2;
+ a[0][0] *= 2;
+ a[0][1] *= w1r;
+ a[n1h][0] *= w1r;
+}
+
diff --git a/src/fft2d/fft2d/fft4f2d.f b/src/fft2d/fft2d/fft4f2d.f
new file mode 100644
index 0000000..af529e3
--- /dev/null
+++ b/src/fft2d/fft2d/fft4f2d.f
@@ -0,0 +1,1591 @@
+! Fast Fourier/Cosine/Sine Transform
+! dimension :two
+! data length :power of 2
+! decimation :frequency
+! radix :4, 2, row-column
+! data :inplace
+! table :use
+! subroutines
+! cdft2d: Complex Discrete Fourier Transform
+! rdft2d: Real Discrete Fourier Transform
+! ddct2d: Discrete Cosine Transform
+! ddst2d: Discrete Sine Transform
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+! [definition]
+! <case1>
+! X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) *
+! exp(2*pi*i*j1*k1/n1) *
+! exp(2*pi*i*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2>
+! X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) *
+! exp(-2*pi*i*j1*k1/n1) *
+! exp(-2*pi*i*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call cdft2d(n1max, 2*n1, n2, 1, a, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call cdft2d(n1max, 2*n1, n2, -1, a, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! 2*n1 :data length (integer)
+! n1 >= 1, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 1, n2 = power of 2
+! a(0:2*n1-1,0:n2-1)
+! :input/output data (real*8)
+! input data
+! a(2*j1,j2) = Re(x(j1,j2)),
+! a(2*j1+1,j2) = Im(x(j1,j2)),
+! 0<=j1<n1, 0<=j2<n2
+! output data
+! a(2*k1,k2) = Re(X(k1,k2)),
+! a(2*k1+1,k2) = Im(X(k1,k2)),
+! 0<=k1<n1, 0<=k2<n2
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1, n2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/2, n2/2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call cdft2d(n1max, 2*n1, n2, -1, a, ip, w)
+! is
+! call cdft2d(n1max, 2*n1, n2, 1, a, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, 2 * n1 - 1
+! a(j1, j2) = a(j1, j2) * (1.0d0 / (n1 * n2))
+! end do
+! end do
+! .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+! [definition]
+! <case1> RDFT
+! R(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! I(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2> IRDFT (excluding scale)
+! a(k1,k2) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+! (R(j1,j2) *
+! cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) +
+! I(j1,j2) *
+! sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)),
+! 0<=k1<n1, 0<=k2<n2
+! (notes: R(n1-k1,n2-k2) = R(k1,k2),
+! I(n1-k1,n2-k2) = -I(k1,k2),
+! R(n1-k1,0) = R(k1,0),
+! I(n1-k1,0) = -I(k1,0),
+! R(0,n2-k2) = R(0,k2),
+! I(0,n2-k2) = -I(0,k2),
+! 0<k1<n1, 0<k2<n2)
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call rdft2d(n1max, n1, n2, 1, a, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call rdft2d(n1max, n1, n2, -1, a, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! a(0:n1-1,0:n2-1)
+! :input/output data (real*8)
+! <case1>
+! output data
+! a(2*k1,k2) = R(k1,k2) = R(n1-k1,n2-k2),
+! a(2*k1+1,k2) = I(k1,k2) = -I(n1-k1,n2-k2),
+! 0<k1<n1/2, 0<k2<n2,
+! a(2*k1,0) = R(k1,0) = R(n1-k1,0),
+! a(2*k1+1,0) = I(k1,0) = -I(n1-k1,0),
+! 0<k1<n1/2,
+! a(0,k2) = R(0,k2) = R(0,n2-k2),
+! a(1,k2) = I(0,k2) = -I(0,n2-k2),
+! a(1,n2-k2) = R(n1/2,k2) = R(n1/2,n2-k2),
+! a(0,n2-k2) = -I(n1/2,k2) = I(n1/2,n2-k2),
+! 0<k2<n2/2,
+! a(0,0) = R(0,0),
+! a(1,0) = R(n1/2,0),
+! a(0,n2/2) = R(0,n2/2),
+! a(1,n2/2) = R(n1/2,n2/2)
+! <case2>
+! input data
+! a(2*j1,j2) = R(j1,j2) = R(n1-j1,n2-j2),
+! a(2*j1+1,j2) = I(j1,j2) = -I(n1-j1,n2-j2),
+! 0<j1<n1/2, 0<j2<n2,
+! a(2*j1,0) = R(j1,0) = R(n1-j1,0),
+! a(2*j1+1,0) = I(j1,0) = -I(n1-j1,0),
+! 0<j1<n1/2,
+! a(0,j2) = R(0,j2) = R(0,n2-j2),
+! a(1,j2) = I(0,j2) = -I(0,n2-j2),
+! a(1,n2-j2) = R(n1/2,j2) = R(n1/2,n2-j2),
+! a(0,n2-j2) = -I(n1/2,j2) = I(n1/2,n2-j2),
+! 0<j2<n2/2,
+! a(0,0) = R(0,0),
+! a(1,0) = R(n1/2,0),
+! a(0,n2/2) = R(0,n2/2),
+! a(1,n2/2) = R(n1/2,n2/2)
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/4, n2/2) + n1/4
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call rdft2d(n1max, n1, n2, 1, a, ip, w)
+! is
+! call rdft2d(n1max, n1, n2, -1, a, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2) = a(j1, j2) * (2.0d0 / (n1 * n2))
+! end do
+! end do
+! .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+! [definition]
+! <case1> IDCT (excluding scale)
+! C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! cos(pi*j1*(k1+1/2)/n1) *
+! cos(pi*j2*(k2+1/2)/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2> DCT
+! C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! cos(pi*(j1+1/2)*k1/n1) *
+! cos(pi*(j2+1/2)*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! a(0:n1-1,0:n2-1)
+! :input/output data (real*8)
+! output data
+! a(k1,k2) = C(k1,k2), 0<=k1<n1, 0<=k2<n2
+! t(0:n1-1,0:n2-1)
+! :work area (real*8)
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/4, n2/2) + max(n1, n2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+! is
+! do j1 = 0, n1 - 1
+! a(j1, 0) = a(j1, 0) * 0.5d0
+! end do
+! do j2 = 0, n2 - 1
+! a(0, j2) = a(0, j2) * 0.5d0
+! end do
+! call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2) = a(j1, j2) * (4.0d0 / (n1 * n2))
+! end do
+! end do
+! .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+! [definition]
+! <case1> IDST (excluding scale)
+! S(k1,k2) = sum_j1=1^n1 sum_j2=1^n2 A(j1,j2) *
+! sin(pi*j1*(k1+1/2)/n1) *
+! sin(pi*j2*(k2+1/2)/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2> DST
+! S(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! sin(pi*(j1+1/2)*k1/n1) *
+! sin(pi*(j2+1/2)*k2/n2),
+! 0<k1<=n1, 0<k2<=n2
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! a(0:n1-1,0:n2-1)
+! :input/output data (real*8)
+! <case1>
+! input data
+! a(j1,j2) = A(j1,j2), 0<j1<n1, 0<j2<n2,
+! a(j1,0) = A(j1,n2), 0<j1<n1,
+! a(0,j2) = A(n1,j2), 0<j2<n2,
+! a(0,0) = A(n1,n2)
+! (i.e. A(j1,j2) = a(mod(j1,n1),mod(j2,n2)))
+! output data
+! a(k1,k2) = S(k1,k2), 0<=k1<n1, 0<=k2<n2
+! <case2>
+! output data
+! a(k1,k2) = S(k1,k2), 0<k1<n1, 0<k2<n2,
+! a(k1,0) = S(k1,n2), 0<k1<n1,
+! a(0,k2) = S(n1,k2), 0<k2<n2,
+! a(0,0) = S(n1,n2)
+! (i.e. S(k1,k2) = a(mod(k1,n1),mod(k2,n2)))
+! t(0:n1-1,0:n2-1)
+! :work area (real*8)
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/4, n2/2) + max(n1, n2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+! is
+! do j1 = 0, n1 - 1
+! a(j1, 0) = a(j1, 0) * 0.5d0
+! end do
+! do j2 = 0, n2 - 1
+! a(0, j2) = a(0, j2) * 0.5d0
+! end do
+! call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2) = a(j1, j2) * (4.0d0 / (n1 * n2))
+! end do
+! end do
+! .
+!
+!
+ subroutine cdft2d(n1max, n1, n2, isgn, a, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
+ n = max(n1, 2 * n2)
+ if (n .gt. 4 * ip(0)) then
+ call makewt(n / 4, ip, w)
+ end if
+ if (n1 .gt. 4) then
+ call bitrv2row(n1max, n1, n2, ip(2), a)
+ end if
+ if (n2 .gt. 2) then
+ call bitrv2col(n1max, n1, n2, ip(2), a)
+ end if
+ if (isgn .lt. 0) then
+ call cftfrow(n1max, n1, n2, a, w)
+ call cftfcol(n1max, n1, n2, a, w)
+ else
+ call cftbrow(n1max, n1, n2, a, w)
+ call cftbcol(n1max, n1, n2, a, w)
+ end if
+ end
+!
+ subroutine rdft2d(n1max, n1, n2, isgn, a, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n2h, i, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
+ n = max(n1, 2 * n2)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n1 .gt. 4 * nc) then
+ nc = n1 / 4
+ call makect(nc, ip, w(nw))
+ end if
+ n2h = n2 / 2
+ if (isgn .lt. 0) then
+ do i = 1, n2h - 1
+ j = n2 - i
+ xi = a(0, i) - a(0, j)
+ a(0, i) = a(0, i) + a(0, j)
+ a(0, j) = xi
+ xi = a(1, j) - a(1, i)
+ a(1, i) = a(1, i) + a(1, j)
+ a(1, j) = xi
+ end do
+ if (n2 .gt. 2) then
+ call bitrv2col(n1max, n1, n2, ip(2), a)
+ end if
+ call cftfcol(n1max, n1, n2, a, w)
+ do i = 0, n2 - 1
+ a(1, i) = 0.5d0 * (a(0, i) - a(1, i))
+ a(0, i) = a(0, i) - a(1, i)
+ end do
+ if (n1 .gt. 4) then
+ call rftfrow(n1max, n1, n2, a, nc, w(nw))
+ call bitrv2row(n1max, n1, n2, ip(2), a)
+ end if
+ call cftfrow(n1max, n1, n2, a, w)
+ else
+ if (n1 .gt. 4) then
+ call bitrv2row(n1max, n1, n2, ip(2), a)
+ end if
+ call cftbrow(n1max, n1, n2, a, w)
+ if (n1 .gt. 4) then
+ call rftbrow(n1max, n1, n2, a, nc, w(nw))
+ end if
+ do i = 0, n2 - 1
+ xi = a(0, i) - a(1, i)
+ a(0, i) = a(0, i) + a(1, i)
+ a(1, i) = xi
+ end do
+ if (n2 .gt. 2) then
+ call bitrv2col(n1max, n1, n2, ip(2), a)
+ end if
+ call cftbcol(n1max, n1, n2, a, w)
+ do i = 1, n2h - 1
+ j = n2 - i
+ a(0, j) = 0.5d0 * (a(0, i) - a(0, j))
+ a(0, i) = a(0, i) - a(0, j)
+ a(1, j) = 0.5d0 * (a(1, i) + a(1, j))
+ a(1, i) = a(1, i) - a(1, j)
+ end do
+ end if
+ end
+!
+ subroutine ddct2d(n1max, n1, n2, isgn, a, t, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n1h, n2h,
+ & i, ix, ic, j, jx, jc
+ real*8 a(0 : n1max - 1, 0 : n2 - 1),
+ & t(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
+ n = max(n1, 2 * n2)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n1 .gt. nc .or. n2 .gt. nc) then
+ nc = max(n1, n2)
+ call makect(nc, ip, w(nw))
+ end if
+ n1h = n1 / 2
+ n2h = n2 / 2
+ if (isgn .ge. 0) then
+ do i = 0, n2 - 1
+ do j = 1, n1h - 1
+ jx = 2 * j
+ t(jx, i) = a(j, i)
+ t(jx + 1, i) = a(n1 - j, i)
+ end do
+ end do
+ t(0, 0) = a(0, 0)
+ t(1, 0) = a(n1h, 0)
+ t(0, n2h) = a(0, n2h)
+ t(1, n2h) = a(n1h, n2h)
+ do i = 1, n2h - 1
+ ic = n2 - i
+ t(0, i) = a(0, i)
+ t(1, ic) = a(n1h, i)
+ t(1, i) = a(0, ic)
+ t(0, ic) = a(n1h, ic)
+ end do
+ call dctfsub(n1max, n1, n2, t, nc, w(nw))
+ if (n2 .gt. 2) then
+ call bitrv2col(n1max, n1, n2, ip(2), t)
+ end if
+ call cftfcol(n1max, n1, n2, t, w)
+ do i = 0, n2 - 1
+ t(1, i) = 0.5d0 * (t(0, i) - t(1, i))
+ t(0, i) = t(0, i) - t(1, i)
+ end do
+ if (n1 .gt. 4) then
+ call rftfrow(n1max, n1, n2, t, nc, w(nw))
+ call bitrv2row(n1max, n1, n2, ip(2), t)
+ end if
+ call cftfrow(n1max, n1, n2, t, w)
+ do i = 0, n2h - 1
+ ix = 2 * i
+ ic = n2 - 1 - i
+ do j = 0, n1h - 1
+ jx = 2 * j
+ jc = n1 - 1 - j
+ a(jx, ix) = t(j, i)
+ a(jx + 1, ix) = t(jc, i)
+ a(jx, ix + 1) = t(j, ic)
+ a(jx + 1, ix + 1) = t(jc, ic)
+ end do
+ end do
+ else
+ do i = 0, n2h - 1
+ ix = 2 * i
+ ic = n2 - 1 - i
+ do j = 0, n1h - 1
+ jx = 2 * j
+ jc = n1 - 1 - j
+ t(j, i) = a(jx, ix)
+ t(jc, i) = a(jx + 1, ix)
+ t(j, ic) = a(jx, ix + 1)
+ t(jc, ic) = a(jx + 1, ix + 1)
+ end do
+ end do
+ if (n1 .gt. 4) then
+ call bitrv2row(n1max, n1, n2, ip(2), t)
+ end if
+ call cftbrow(n1max, n1, n2, t, w)
+ if (n1 .gt. 4) then
+ call rftbrow(n1max, n1, n2, t, nc, w(nw))
+ end if
+ do i = 0, n2 - 1
+ xi = t(0, i) - t(1, i)
+ t(0, i) = t(0, i) + t(1, i)
+ t(1, i) = xi
+ end do
+ if (n2 .gt. 2) then
+ call bitrv2col(n1max, n1, n2, ip(2), t)
+ end if
+ call cftbcol(n1max, n1, n2, t, w)
+ call dctbsub(n1max, n1, n2, t, nc, w(nw))
+ do i = 0, n2 - 1
+ do j = 1, n1h - 1
+ jx = 2 * j
+ a(j, i) = t(jx, i)
+ a(n1 - j, i) = t(jx + 1, i)
+ end do
+ end do
+ a(0, 0) = t(0, 0)
+ a(n1h, 0) = t(1, 0)
+ a(0, n2h) = t(0, n2h)
+ a(n1h, n2h) = t(1, n2h)
+ do i = 1, n2h - 1
+ ic = n2 - i
+ a(0, i) = t(0, i)
+ a(n1h, i) = t(1, ic)
+ a(0, ic) = t(1, i)
+ a(n1h, ic) = t(0, ic)
+ end do
+ end if
+ end
+!
+ subroutine ddst2d(n1max, n1, n2, isgn, a, t, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, n1h, n2h,
+ & i, ix, ic, j, jx, jc
+ real*8 a(0 : n1max - 1, 0 : n2 - 1),
+ & t(0 : n1max - 1, 0 : n2 - 1), w(0 : *), xi
+ n = max(n1, 2 * n2)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n1 .gt. nc .or. n2 .gt. nc) then
+ nc = max(n1, n2)
+ call makect(nc, ip, w(nw))
+ end if
+ n1h = n1 / 2
+ n2h = n2 / 2
+ if (isgn .ge. 0) then
+ do i = 0, n2 - 1
+ do j = 1, n1h - 1
+ jx = 2 * j
+ t(jx, i) = a(j, i)
+ t(jx + 1, i) = a(n1 - j, i)
+ end do
+ end do
+ t(0, 0) = a(0, 0)
+ t(1, 0) = a(n1h, 0)
+ t(0, n2h) = a(0, n2h)
+ t(1, n2h) = a(n1h, n2h)
+ do i = 1, n2h - 1
+ ic = n2 - i
+ t(0, i) = a(0, i)
+ t(1, ic) = a(n1h, i)
+ t(1, i) = a(0, ic)
+ t(0, ic) = a(n1h, ic)
+ end do
+ call dstfsub(n1max, n1, n2, t, nc, w(nw))
+ if (n2 .gt. 2) then
+ call bitrv2col(n1max, n1, n2, ip(2), t)
+ end if
+ call cftfcol(n1max, n1, n2, t, w)
+ do i = 0, n2 - 1
+ t(1, i) = 0.5d0 * (t(0, i) - t(1, i))
+ t(0, i) = t(0, i) - t(1, i)
+ end do
+ if (n1 .gt. 4) then
+ call rftfrow(n1max, n1, n2, t, nc, w(nw))
+ call bitrv2row(n1max, n1, n2, ip(2), t)
+ end if
+ call cftfrow(n1max, n1, n2, t, w)
+ do i = 0, n2h - 1
+ ix = 2 * i
+ ic = n2 - 1 - i
+ do j = 0, n1h - 1
+ jx = 2 * j
+ jc = n1 - 1 - j
+ a(jx, ix) = t(j, i)
+ a(jx + 1, ix) = -t(jc, i)
+ a(jx, ix + 1) = -t(j, ic)
+ a(jx + 1, ix + 1) = t(jc, ic)
+ end do
+ end do
+ else
+ do i = 0, n2h - 1
+ ix = 2 * i
+ ic = n2 - 1 - i
+ do j = 0, n1h - 1
+ jx = 2 * j
+ jc = n1 - 1 - j
+ t(j, i) = a(jx, ix)
+ t(jc, i) = -a(jx + 1, ix)
+ t(j, ic) = -a(jx, ix + 1)
+ t(jc, ic) = a(jx + 1, ix + 1)
+ end do
+ end do
+ if (n1 .gt. 4) then
+ call bitrv2row(n1max, n1, n2, ip(2), t)
+ end if
+ call cftbrow(n1max, n1, n2, t, w)
+ if (n1 .gt. 4) then
+ call rftbrow(n1max, n1, n2, t, nc, w(nw))
+ end if
+ do i = 0, n2 - 1
+ xi = t(0, i) - t(1, i)
+ t(0, i) = t(0, i) + t(1, i)
+ t(1, i) = xi
+ end do
+ if (n2 .gt. 2) then
+ call bitrv2col(n1max, n1, n2, ip(2), t)
+ end if
+ call cftbcol(n1max, n1, n2, t, w)
+ call dstbsub(n1max, n1, n2, t, nc, w(nw))
+ do i = 0, n2 - 1
+ do j = 1, n1h - 1
+ jx = 2 * j
+ a(j, i) = t(jx, i)
+ a(n1 - j, i) = t(jx + 1, i)
+ end do
+ end do
+ a(0, 0) = t(0, 0)
+ a(n1h, 0) = t(1, 0)
+ a(0, n2h) = t(0, n2h)
+ a(n1h, n2h) = t(1, n2h)
+ do i = 1, n2h - 1
+ ic = n2 - i
+ a(0, i) = t(0, i)
+ a(n1h, i) = t(1, ic)
+ a(0, ic) = t(1, i)
+ a(n1h, ic) = t(0, ic)
+ end do
+ end if
+ end
+!
+! -------- initializing routines --------
+!
+ subroutine makewt(nw, ip, w)
+ integer nw, ip(0 : *), nwh, j
+ real*8 w(0 : nw - 1), delta, x, y
+ ip(0) = nw
+ ip(1) = 1
+ if (nw .gt. 2) then
+ nwh = nw / 2
+ delta = atan(1.0d0) / nwh
+ w(0) = 1
+ w(1) = 0
+ w(nwh) = cos(delta * nwh)
+ w(nwh + 1) = w(nwh)
+ do j = 2, nwh - 2, 2
+ x = cos(delta * j)
+ y = sin(delta * j)
+ w(j) = x
+ w(j + 1) = y
+ w(nw - j) = y
+ w(nw - j + 1) = x
+ end do
+ call bitrv2(nw, ip(2), w)
+ end if
+ end
+!
+ subroutine makect(nc, ip, c)
+ integer nc, ip(0 : *), nch, j
+ real*8 c(0 : nc - 1), delta
+ ip(1) = nc
+ if (nc .gt. 1) then
+ nch = nc / 2
+ delta = atan(1.0d0) / nch
+ c(0) = 0.5d0
+ c(nch) = 0.5d0 * cos(delta * nch)
+ do j = 1, nch - 1
+ c(j) = 0.5d0 * cos(delta * j)
+ c(nc - j) = 0.5d0 * sin(delta * j)
+ end do
+ end if
+ end
+!
+! -------- child routines --------
+!
+ subroutine bitrv2(n, ip, a)
+ integer n, ip(0 : *), j, j1, k, k1, l, m, m2
+ real*8 a(0 : n - 1), xr, xi
+ ip(0) = 0
+ l = n
+ m = 1
+ do while (4 * m .lt. l)
+ l = l / 2
+ do j = 0, m - 1
+ ip(m + j) = ip(j) + l
+ end do
+ m = m * 2
+ end do
+ if (4 * m .gt. l) then
+ do k = 1, m - 1
+ do j = 0, k - 1
+ j1 = 2 * j + ip(k)
+ k1 = 2 * k + ip(j)
+ xr = a(j1)
+ xi = a(j1 + 1)
+ a(j1) = a(k1)
+ a(j1 + 1) = a(k1 + 1)
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ end do
+ else
+ m2 = 2 * m
+ do k = 1, m - 1
+ do j = 0, k - 1
+ j1 = 2 * j + ip(k)
+ k1 = 2 * k + ip(j)
+ xr = a(j1)
+ xi = a(j1 + 1)
+ a(j1) = a(k1)
+ a(j1 + 1) = a(k1 + 1)
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + m2
+ k1 = k1 + m2
+ xr = a(j1)
+ xi = a(j1 + 1)
+ a(j1) = a(k1)
+ a(j1 + 1) = a(k1 + 1)
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ end do
+ end if
+ end
+!
+ subroutine bitrv2row(n1max, n, n2, ip, a)
+ integer n1max, n, n2, ip(0 : *), i, j, j1, k, k1, l, m, m2
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), xr, xi
+ ip(0) = 0
+ l = n
+ m = 1
+ do while (4 * m .lt. l)
+ l = l / 2
+ do j = 0, m - 1
+ ip(m + j) = ip(j) + l
+ end do
+ m = m * 2
+ end do
+ if (4 * m .gt. l) then
+ do i = 0, n2 - 1
+ do k = 1, m - 1
+ do j = 0, k - 1
+ j1 = 2 * j + ip(k)
+ k1 = 2 * k + ip(j)
+ xr = a(j1, i)
+ xi = a(j1 + 1, i)
+ a(j1, i) = a(k1, i)
+ a(j1 + 1, i) = a(k1 + 1, i)
+ a(k1, i) = xr
+ a(k1 + 1, i) = xi
+ end do
+ end do
+ end do
+ else
+ m2 = 2 * m
+ do i = 0, n2 - 1
+ do k = 1, m - 1
+ do j = 0, k - 1
+ j1 = 2 * j + ip(k)
+ k1 = 2 * k + ip(j)
+ xr = a(j1, i)
+ xi = a(j1 + 1, i)
+ a(j1, i) = a(k1, i)
+ a(j1 + 1, i) = a(k1 + 1, i)
+ a(k1, i) = xr
+ a(k1 + 1, i) = xi
+ j1 = j1 + m2
+ k1 = k1 + m2
+ xr = a(j1, i)
+ xi = a(j1 + 1, i)
+ a(j1, i) = a(k1, i)
+ a(j1 + 1, i) = a(k1 + 1, i)
+ a(k1, i) = xr
+ a(k1 + 1, i) = xi
+ end do
+ end do
+ end do
+ end if
+ end
+!
+ subroutine bitrv2col(n1max, n1, n, ip, a)
+ integer n1max, n1, n, ip(0 : *), i, j, j1, k, k1, l, m
+ real*8 a(0 : n1max - 1, 0 : n - 1), xr, xi
+ ip(0) = 0
+ l = n
+ m = 1
+ do while (2 * m .lt. l)
+ l = l / 2
+ do j = 0, m - 1
+ ip(m + j) = ip(j) + l
+ end do
+ m = m * 2
+ end do
+ if (2 * m .gt. l) then
+ do k = 1, m - 1
+ do j = 0, k - 1
+ j1 = j + ip(k)
+ k1 = k + ip(j)
+ do i = 0, n1 - 2, 2
+ xr = a(i, j1)
+ xi = a(i + 1, j1)
+ a(i, j1) = a(i, k1)
+ a(i + 1, j1) = a(i + 1, k1)
+ a(i, k1) = xr
+ a(i + 1, k1) = xi
+ end do
+ end do
+ end do
+ else
+ do k = 1, m - 1
+ do j = 0, k - 1
+ j1 = j + ip(k)
+ k1 = k + ip(j)
+ do i = 0, n1 - 2, 2
+ xr = a(i, j1)
+ xi = a(i + 1, j1)
+ a(i, j1) = a(i, k1)
+ a(i + 1, j1) = a(i + 1, k1)
+ a(i, k1) = xr
+ a(i + 1, k1) = xi
+ end do
+ j1 = j1 + m
+ k1 = k1 + m
+ do i = 0, n1 - 2, 2
+ xr = a(i, j1)
+ xi = a(i + 1, j1)
+ a(i, j1) = a(i, k1)
+ a(i + 1, j1) = a(i + 1, k1)
+ a(i, k1) = xr
+ a(i + 1, k1) = xi
+ end do
+ end do
+ end do
+ end if
+ end
+!
+ subroutine cftbrow(n1max, n, n2, a, w)
+ integer n1max, n, n2, i, j, j1, j2, j3, k, k1, ks, l, m
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
+ real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ do i = 0, n2 - 1
+ l = 2
+ do while (2 * l .lt. n)
+ m = 4 * l
+ do j = 0, l - 2, 2
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ x0r = a(j, i) + a(j1, i)
+ x0i = a(j + 1, i) + a(j1 + 1, i)
+ x1r = a(j, i) - a(j1, i)
+ x1i = a(j + 1, i) - a(j1 + 1, i)
+ x2r = a(j2, i) + a(j3, i)
+ x2i = a(j2 + 1, i) + a(j3 + 1, i)
+ x3r = a(j2, i) - a(j3, i)
+ x3i = a(j2 + 1, i) - a(j3 + 1, i)
+ a(j, i) = x0r + x2r
+ a(j + 1, i) = x0i + x2i
+ a(j2, i) = x0r - x2r
+ a(j2 + 1, i) = x0i - x2i
+ a(j1, i) = x1r - x3i
+ a(j1 + 1, i) = x1i + x3r
+ a(j3, i) = x1r + x3i
+ a(j3 + 1, i) = x1i - x3r
+ end do
+ if (m .lt. n) then
+ wk1r = w(2)
+ do j = m, l + m - 2, 2
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ x0r = a(j, i) + a(j1, i)
+ x0i = a(j + 1, i) + a(j1 + 1, i)
+ x1r = a(j, i) - a(j1, i)
+ x1i = a(j + 1, i) - a(j1 + 1, i)
+ x2r = a(j2, i) + a(j3, i)
+ x2i = a(j2 + 1, i) + a(j3 + 1, i)
+ x3r = a(j2, i) - a(j3, i)
+ x3i = a(j2 + 1, i) - a(j3 + 1, i)
+ a(j, i) = x0r + x2r
+ a(j + 1, i) = x0i + x2i
+ a(j2, i) = x2i - x0i
+ a(j2 + 1, i) = x0r - x2r
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j1, i) = wk1r * (x0r - x0i)
+ a(j1 + 1, i) = wk1r * (x0r + x0i)
+ x0r = x3i + x1r
+ x0i = x3r - x1i
+ a(j3, i) = wk1r * (x0i - x0r)
+ a(j3 + 1, i) = wk1r * (x0i + x0r)
+ end do
+ k1 = 1
+ ks = -1
+ do k = 2 * m, n - m, m
+ k1 = k1 + 1
+ ks = -ks
+ wk1r = w(2 * k1)
+ wk1i = w(2 * k1 + 1)
+ wk2r = ks * w(k1)
+ wk2i = w(k1 + ks)
+ wk3r = wk1r - 2 * wk2i * wk1i
+ wk3i = 2 * wk2i * wk1r - wk1i
+ do j = k, l + k - 2, 2
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ x0r = a(j, i) + a(j1, i)
+ x0i = a(j + 1, i) + a(j1 + 1, i)
+ x1r = a(j, i) - a(j1, i)
+ x1i = a(j + 1, i) - a(j1 + 1, i)
+ x2r = a(j2, i) + a(j3, i)
+ x2i = a(j2 + 1, i) + a(j3 + 1, i)
+ x3r = a(j2, i) - a(j3, i)
+ x3i = a(j2 + 1, i) - a(j3 + 1, i)
+ a(j, i) = x0r + x2r
+ a(j + 1, i) = x0i + x2i
+ x0r = x0r - x2r
+ x0i = x0i - x2i
+ a(j2, i) = wk2r * x0r - wk2i * x0i
+ a(j2 + 1, i) = wk2r * x0i + wk2i * x0r
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j1, i) = wk1r * x0r - wk1i * x0i
+ a(j1 + 1, i) = wk1r * x0i + wk1i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3, i) = wk3r * x0r - wk3i * x0i
+ a(j3 + 1, i) = wk3r * x0i + wk3i * x0r
+ end do
+ end do
+ end if
+ l = m
+ end do
+ if (l .lt. n) then
+ do j = 0, l - 2, 2
+ j1 = j + l
+ x0r = a(j, i) - a(j1, i)
+ x0i = a(j + 1, i) - a(j1 + 1, i)
+ a(j, i) = a(j, i) + a(j1, i)
+ a(j + 1, i) = a(j + 1, i) + a(j1 + 1, i)
+ a(j1, i) = x0r
+ a(j1 + 1, i) = x0i
+ end do
+ end if
+ end do
+ end
+!
+ subroutine cftbcol(n1max, n1, n, a, w)
+ integer n1max, n1, n, i, j, j1, j2, j3, k, k1, ks, l, m
+ real*8 a(0 : n1max - 1, 0 : n - 1), w(0 : *)
+ real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ l = 1
+ do while (2 * l .lt. n)
+ m = 4 * l
+ do j = 0, l - 1
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) + a(i, j1)
+ x0i = a(i + 1, j) + a(i + 1, j1)
+ x1r = a(i, j) - a(i, j1)
+ x1i = a(i + 1, j) - a(i + 1, j1)
+ x2r = a(i, j2) + a(i, j3)
+ x2i = a(i + 1, j2) + a(i + 1, j3)
+ x3r = a(i, j2) - a(i, j3)
+ x3i = a(i + 1, j2) - a(i + 1, j3)
+ a(i, j) = x0r + x2r
+ a(i + 1, j) = x0i + x2i
+ a(i, j2) = x0r - x2r
+ a(i + 1, j2) = x0i - x2i
+ a(i, j1) = x1r - x3i
+ a(i + 1, j1) = x1i + x3r
+ a(i, j3) = x1r + x3i
+ a(i + 1, j3) = x1i - x3r
+ end do
+ end do
+ if (m .lt. n) then
+ wk1r = w(2)
+ do j = m, l + m - 1
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) + a(i, j1)
+ x0i = a(i + 1, j) + a(i + 1, j1)
+ x1r = a(i, j) - a(i, j1)
+ x1i = a(i + 1, j) - a(i + 1, j1)
+ x2r = a(i, j2) + a(i, j3)
+ x2i = a(i + 1, j2) + a(i + 1, j3)
+ x3r = a(i, j2) - a(i, j3)
+ x3i = a(i + 1, j2) - a(i + 1, j3)
+ a(i, j) = x0r + x2r
+ a(i + 1, j) = x0i + x2i
+ a(i, j2) = x2i - x0i
+ a(i + 1, j2) = x0r - x2r
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(i, j1) = wk1r * (x0r - x0i)
+ a(i + 1, j1) = wk1r * (x0r + x0i)
+ x0r = x3i + x1r
+ x0i = x3r - x1i
+ a(i, j3) = wk1r * (x0i - x0r)
+ a(i + 1, j3) = wk1r * (x0i + x0r)
+ end do
+ end do
+ k1 = 1
+ ks = -1
+ do k = 2 * m, n - m, m
+ k1 = k1 + 1
+ ks = -ks
+ wk1r = w(2 * k1)
+ wk1i = w(2 * k1 + 1)
+ wk2r = ks * w(k1)
+ wk2i = w(k1 + ks)
+ wk3r = wk1r - 2 * wk2i * wk1i
+ wk3i = 2 * wk2i * wk1r - wk1i
+ do j = k, l + k - 1
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) + a(i, j1)
+ x0i = a(i + 1, j) + a(i + 1, j1)
+ x1r = a(i, j) - a(i, j1)
+ x1i = a(i + 1, j) - a(i + 1, j1)
+ x2r = a(i, j2) + a(i, j3)
+ x2i = a(i + 1, j2) + a(i + 1, j3)
+ x3r = a(i, j2) - a(i, j3)
+ x3i = a(i + 1, j2) - a(i + 1, j3)
+ a(i, j) = x0r + x2r
+ a(i + 1, j) = x0i + x2i
+ x0r = x0r - x2r
+ x0i = x0i - x2i
+ a(i, j2) = wk2r * x0r - wk2i * x0i
+ a(i + 1, j2) = wk2r * x0i + wk2i * x0r
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(i, j1) = wk1r * x0r - wk1i * x0i
+ a(i + 1, j1) = wk1r * x0i + wk1i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(i, j3) = wk3r * x0r - wk3i * x0i
+ a(i + 1, j3) = wk3r * x0i + wk3i * x0r
+ end do
+ end do
+ end do
+ end if
+ l = m
+ end do
+ if (l .lt. n) then
+ do j = 0, l - 1
+ j1 = j + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) - a(i, j1)
+ x0i = a(i + 1, j) - a(i + 1, j1)
+ a(i, j) = a(i, j) + a(i, j1)
+ a(i + 1, j) = a(i + 1, j) + a(i + 1, j1)
+ a(i, j1) = x0r
+ a(i + 1, j1) = x0i
+ end do
+ end do
+ end if
+ end
+!
+ subroutine cftfrow(n1max, n, n2, a, w)
+ integer n1max, n, n2, i, j, j1, j2, j3, k, k1, ks, l, m
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), w(0 : *)
+ real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ do i = 0, n2 - 1
+ l = 2
+ do while (2 * l .lt. n)
+ m = 4 * l
+ do j = 0, l - 2, 2
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ x0r = a(j, i) + a(j1, i)
+ x0i = a(j + 1, i) + a(j1 + 1, i)
+ x1r = a(j, i) - a(j1, i)
+ x1i = a(j + 1, i) - a(j1 + 1, i)
+ x2r = a(j2, i) + a(j3, i)
+ x2i = a(j2 + 1, i) + a(j3 + 1, i)
+ x3r = a(j2, i) - a(j3, i)
+ x3i = a(j2 + 1, i) - a(j3 + 1, i)
+ a(j, i) = x0r + x2r
+ a(j + 1, i) = x0i + x2i
+ a(j2, i) = x0r - x2r
+ a(j2 + 1, i) = x0i - x2i
+ a(j1, i) = x1r + x3i
+ a(j1 + 1, i) = x1i - x3r
+ a(j3, i) = x1r - x3i
+ a(j3 + 1, i) = x1i + x3r
+ end do
+ if (m .lt. n) then
+ wk1r = w(2)
+ do j = m, l + m - 2, 2
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ x0r = a(j, i) + a(j1, i)
+ x0i = a(j + 1, i) + a(j1 + 1, i)
+ x1r = a(j, i) - a(j1, i)
+ x1i = a(j + 1, i) - a(j1 + 1, i)
+ x2r = a(j2, i) + a(j3, i)
+ x2i = a(j2 + 1, i) + a(j3 + 1, i)
+ x3r = a(j2, i) - a(j3, i)
+ x3i = a(j2 + 1, i) - a(j3 + 1, i)
+ a(j, i) = x0r + x2r
+ a(j + 1, i) = x0i + x2i
+ a(j2, i) = x0i - x2i
+ a(j2 + 1, i) = x2r - x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j1, i) = wk1r * (x0i + x0r)
+ a(j1 + 1, i) = wk1r * (x0i - x0r)
+ x0r = x3i - x1r
+ x0i = x3r + x1i
+ a(j3, i) = wk1r * (x0r + x0i)
+ a(j3 + 1, i) = wk1r * (x0r - x0i)
+ end do
+ k1 = 1
+ ks = -1
+ do k = 2 * m, n - m, m
+ k1 = k1 + 1
+ ks = -ks
+ wk1r = w(2 * k1)
+ wk1i = w(2 * k1 + 1)
+ wk2r = ks * w(k1)
+ wk2i = w(k1 + ks)
+ wk3r = wk1r - 2 * wk2i * wk1i
+ wk3i = 2 * wk2i * wk1r - wk1i
+ do j = k, l + k - 2, 2
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ x0r = a(j, i) + a(j1, i)
+ x0i = a(j + 1, i) + a(j1 + 1, i)
+ x1r = a(j, i) - a(j1, i)
+ x1i = a(j + 1, i) - a(j1 + 1, i)
+ x2r = a(j2, i) + a(j3, i)
+ x2i = a(j2 + 1, i) + a(j3 + 1, i)
+ x3r = a(j2, i) - a(j3, i)
+ x3i = a(j2 + 1, i) - a(j3 + 1, i)
+ a(j, i) = x0r + x2r
+ a(j + 1, i) = x0i + x2i
+ x0r = x0r - x2r
+ x0i = x0i - x2i
+ a(j2, i) = wk2r * x0r + wk2i * x0i
+ a(j2 + 1, i) = wk2r * x0i - wk2i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j1, i) = wk1r * x0r + wk1i * x0i
+ a(j1 + 1, i) = wk1r * x0i - wk1i * x0r
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j3, i) = wk3r * x0r + wk3i * x0i
+ a(j3 + 1, i) = wk3r * x0i - wk3i * x0r
+ end do
+ end do
+ end if
+ l = m
+ end do
+ if (l .lt. n) then
+ do j = 0, l - 2, 2
+ j1 = j + l
+ x0r = a(j, i) - a(j1, i)
+ x0i = a(j + 1, i) - a(j1 + 1, i)
+ a(j, i) = a(j, i) + a(j1, i)
+ a(j + 1, i) = a(j + 1, i) + a(j1 + 1, i)
+ a(j1, i) = x0r
+ a(j1 + 1, i) = x0i
+ end do
+ end if
+ end do
+ end
+!
+ subroutine cftfcol(n1max, n1, n, a, w)
+ integer n1max, n1, n, i, j, j1, j2, j3, k, k1, ks, l, m
+ real*8 a(0 : n1max - 1, 0 : n - 1), w(0 : *)
+ real*8 wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ l = 1
+ do while (2 * l .lt. n)
+ m = 4 * l
+ do j = 0, l - 1
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) + a(i, j1)
+ x0i = a(i + 1, j) + a(i + 1, j1)
+ x1r = a(i, j) - a(i, j1)
+ x1i = a(i + 1, j) - a(i + 1, j1)
+ x2r = a(i, j2) + a(i, j3)
+ x2i = a(i + 1, j2) + a(i + 1, j3)
+ x3r = a(i, j2) - a(i, j3)
+ x3i = a(i + 1, j2) - a(i + 1, j3)
+ a(i, j) = x0r + x2r
+ a(i + 1, j) = x0i + x2i
+ a(i, j2) = x0r - x2r
+ a(i + 1, j2) = x0i - x2i
+ a(i, j1) = x1r + x3i
+ a(i + 1, j1) = x1i - x3r
+ a(i, j3) = x1r - x3i
+ a(i + 1, j3) = x1i + x3r
+ end do
+ end do
+ if (m .lt. n) then
+ wk1r = w(2)
+ do j = m, l + m - 1
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) + a(i, j1)
+ x0i = a(i + 1, j) + a(i + 1, j1)
+ x1r = a(i, j) - a(i, j1)
+ x1i = a(i + 1, j) - a(i + 1, j1)
+ x2r = a(i, j2) + a(i, j3)
+ x2i = a(i + 1, j2) + a(i + 1, j3)
+ x3r = a(i, j2) - a(i, j3)
+ x3i = a(i + 1, j2) - a(i + 1, j3)
+ a(i, j) = x0r + x2r
+ a(i + 1, j) = x0i + x2i
+ a(i, j2) = x0i - x2i
+ a(i + 1, j2) = x2r - x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(i, j1) = wk1r * (x0i + x0r)
+ a(i + 1, j1) = wk1r * (x0i - x0r)
+ x0r = x3i - x1r
+ x0i = x3r + x1i
+ a(i, j3) = wk1r * (x0r + x0i)
+ a(i + 1, j3) = wk1r * (x0r - x0i)
+ end do
+ end do
+ k1 = 1
+ ks = -1
+ do k = 2 * m, n - m, m
+ k1 = k1 + 1
+ ks = -ks
+ wk1r = w(2 * k1)
+ wk1i = w(2 * k1 + 1)
+ wk2r = ks * w(k1)
+ wk2i = w(k1 + ks)
+ wk3r = wk1r - 2 * wk2i * wk1i
+ wk3i = 2 * wk2i * wk1r - wk1i
+ do j = k, l + k - 1
+ j1 = j + l
+ j2 = j1 + l
+ j3 = j2 + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) + a(i, j1)
+ x0i = a(i + 1, j) + a(i + 1, j1)
+ x1r = a(i, j) - a(i, j1)
+ x1i = a(i + 1, j) - a(i + 1, j1)
+ x2r = a(i, j2) + a(i, j3)
+ x2i = a(i + 1, j2) + a(i + 1, j3)
+ x3r = a(i, j2) - a(i, j3)
+ x3i = a(i + 1, j2) - a(i + 1, j3)
+ a(i, j) = x0r + x2r
+ a(i + 1, j) = x0i + x2i
+ x0r = x0r - x2r
+ x0i = x0i - x2i
+ a(i, j2) = wk2r * x0r + wk2i * x0i
+ a(i + 1, j2) = wk2r * x0i - wk2i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(i, j1) = wk1r * x0r + wk1i * x0i
+ a(i + 1, j1) = wk1r * x0i - wk1i * x0r
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(i, j3) = wk3r * x0r + wk3i * x0i
+ a(i + 1, j3) = wk3r * x0i - wk3i * x0r
+ end do
+ end do
+ end do
+ end if
+ l = m
+ end do
+ if (l .lt. n) then
+ do j = 0, l - 1
+ j1 = j + l
+ do i = 0, n1 - 2, 2
+ x0r = a(i, j) - a(i, j1)
+ x0i = a(i + 1, j) - a(i + 1, j1)
+ a(i, j) = a(i, j) + a(i, j1)
+ a(i + 1, j) = a(i + 1, j) + a(i + 1, j1)
+ a(i, j1) = x0r
+ a(i + 1, j1) = x0i
+ end do
+ end do
+ end if
+ end
+!
+ subroutine rftbrow(n1max, n, n2, a, nc, c)
+ integer n1max, n, n2, nc, i, j, k, kk, ks
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1),
+ & wkr, wki, xr, xi, yr, yi
+ ks = 4 * nc / n
+ do i = 0, n2 - 1
+ kk = 0
+ do k = n / 2 - 2, 2, -2
+ j = n - k
+ kk = kk + ks
+ wkr = 0.5d0 - c(kk)
+ wki = c(nc - kk)
+ xr = a(k, i) - a(j, i)
+ xi = a(k + 1, i) + a(j + 1, i)
+ yr = wkr * xr - wki * xi
+ yi = wkr * xi + wki * xr
+ a(k, i) = a(k, i) - yr
+ a(k + 1, i) = a(k + 1, i) - yi
+ a(j, i) = a(j, i) + yr
+ a(j + 1, i) = a(j + 1, i) - yi
+ end do
+ end do
+ end
+!
+ subroutine rftfrow(n1max, n, n2, a, nc, c)
+ integer n1max, n, n2, nc, i, j, k, kk, ks
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1),
+ & wkr, wki, xr, xi, yr, yi
+ ks = 4 * nc / n
+ do i = 0, n2 - 1
+ kk = 0
+ do k = n / 2 - 2, 2, -2
+ j = n - k
+ kk = kk + ks
+ wkr = 0.5d0 - c(kk)
+ wki = c(nc - kk)
+ xr = a(k, i) - a(j, i)
+ xi = a(k + 1, i) + a(j + 1, i)
+ yr = wkr * xr + wki * xi
+ yi = wkr * xi - wki * xr
+ a(k, i) = a(k, i) - yr
+ a(k + 1, i) = a(k + 1, i) - yi
+ a(j, i) = a(j, i) + yr
+ a(j + 1, i) = a(j + 1, i) - yi
+ end do
+ end do
+ end
+!
+ subroutine dctbsub(n1max, n1, n2, a, nc, c)
+ integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2,
+ & k1, k2
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1),
+ & w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+ ks1 = nc / n1
+ ks2 = nc / n2
+ n2h = n2 / 2
+ kk2 = ks2
+ do k2 = 1, n2h - 1
+ j2 = n2 - k2
+ w2r = 2 * c(kk2)
+ w2i = 2 * c(nc - kk2)
+ kk2 = kk2 + ks2
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ x0r = w2r * c(kk1)
+ x0i = w2i * c(kk1)
+ x1r = w2r * c(nc - kk1)
+ x1i = w2i * c(nc - kk1)
+ wkr = x0r - x1i
+ wki = x0i + x1r
+ wji = x0r + x1i
+ wjr = x0i - x1r
+ kk1 = kk1 + ks1
+ x0r = wkr * a(k1, k2) - wki * a(k1 + 1, k2)
+ x0i = wkr * a(k1 + 1, k2) + wki * a(k1, k2)
+ x1r = wjr * a(k1, j2) - wji * a(k1 + 1, j2)
+ x1i = wjr * a(k1 + 1, j2) + wji * a(k1, j2)
+ a(k1, k2) = x0r + x1i
+ a(k1 + 1, k2) = x0i - x1r
+ a(k1, j2) = x1r + x0i
+ a(k1 + 1, j2) = x1i - x0r
+ end do
+ wkr = w2r * 0.5d0
+ wki = w2i * 0.5d0
+ wjr = w2r * c(kk1)
+ wji = w2i * c(kk1)
+ x0r = a(0, k2) + a(0, j2)
+ x0i = a(1, k2) - a(1, j2)
+ x1r = a(0, k2) - a(0, j2)
+ x1i = a(1, k2) + a(1, j2)
+ a(0, k2) = wkr * x0r - wki * x0i
+ a(1, k2) = wkr * x0i + wki * x0r
+ a(0, j2) = -wjr * x1r + wji * x1i
+ a(1, j2) = wjr * x1i + wji * x1r
+ end do
+ w2r = 2 * c(kk2)
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ wkr = 2 * c(kk1)
+ wki = 2 * c(nc - kk1)
+ wjr = w2r * wkr
+ wji = w2r * wki
+ kk1 = kk1 + ks1
+ x0i = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
+ a(k1, 0) = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
+ a(k1 + 1, 0) = x0i
+ x0i = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
+ a(k1, n2h) = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
+ a(k1 + 1, n2h) = x0i
+ end do
+ a(1, 0) = a(1, 0) * w2r
+ a(0, n2h) = a(0, n2h) * w2r
+ a(1, n2h) = a(1, n2h) * 0.5d0
+ end
+!
+ subroutine dctfsub(n1max, n1, n2, a, nc, c)
+ integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2,
+ & k1, k2
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1),
+ & w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+ ks1 = nc / n1
+ ks2 = nc / n2
+ n2h = n2 / 2
+ kk2 = ks2
+ do k2 = 1, n2h - 1
+ j2 = n2 - k2
+ w2r = 2 * c(kk2)
+ w2i = 2 * c(nc - kk2)
+ kk2 = kk2 + ks2
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ x0r = w2r * c(kk1)
+ x0i = w2i * c(kk1)
+ x1r = w2r * c(nc - kk1)
+ x1i = w2i * c(nc - kk1)
+ wkr = x0r - x1i
+ wki = x0i + x1r
+ wji = x0r + x1i
+ wjr = x0i - x1r
+ kk1 = kk1 + ks1
+ x0r = a(k1, k2) - a(k1 + 1, j2)
+ x0i = a(k1, j2) + a(k1 + 1, k2)
+ x1r = a(k1, j2) - a(k1 + 1, k2)
+ x1i = a(k1, k2) + a(k1 + 1, j2)
+ a(k1, k2) = wkr * x0r + wki * x0i
+ a(k1 + 1, k2) = wkr * x0i - wki * x0r
+ a(k1, j2) = wjr * x1r + wji * x1i
+ a(k1 + 1, j2) = wjr * x1i - wji * x1r
+ end do
+ x0r = 2 * c(kk1)
+ wjr = x0r * w2r
+ wji = x0r * w2i
+ x0r = w2r * a(0, k2) + w2i * a(1, k2)
+ x0i = w2r * a(1, k2) - w2i * a(0, k2)
+ x1r = -wjr * a(0, j2) + wji * a(1, j2)
+ x1i = wjr * a(1, j2) + wji * a(0, j2)
+ a(0, k2) = x0r + x1r
+ a(1, k2) = x1i + x0i
+ a(0, j2) = x0r - x1r
+ a(1, j2) = x1i - x0i
+ end do
+ w2r = 2 * c(kk2)
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ wkr = 2 * c(kk1)
+ wki = 2 * c(nc - kk1)
+ wjr = w2r * wkr
+ wji = w2r * wki
+ kk1 = kk1 + ks1
+ x0i = wkr * a(k1 + 1, 0) - wki * a(k1, 0)
+ a(k1, 0) = wkr * a(k1, 0) + wki * a(k1 + 1, 0)
+ a(k1 + 1, 0) = x0i
+ x0i = wjr * a(k1 + 1, n2h) - wji * a(k1, n2h)
+ a(k1, n2h) = wjr * a(k1, n2h) + wji * a(k1 + 1, n2h)
+ a(k1 + 1, n2h) = x0i
+ end do
+ w2r = w2r * 2
+ a(0, 0) = a(0, 0) * 2
+ a(1, 0) = a(1, 0) * w2r
+ a(0, n2h) = a(0, n2h) * w2r
+ end
+!
+ subroutine dstbsub(n1max, n1, n2, a, nc, c)
+ integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2,
+ & k1, k2
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1),
+ & w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+ ks1 = nc / n1
+ ks2 = nc / n2
+ n2h = n2 / 2
+ kk2 = ks2
+ do k2 = 1, n2h - 1
+ j2 = n2 - k2
+ w2r = 2 * c(kk2)
+ w2i = 2 * c(nc - kk2)
+ kk2 = kk2 + ks2
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ x0r = w2r * c(kk1)
+ x0i = w2i * c(kk1)
+ x1r = w2r * c(nc - kk1)
+ x1i = w2i * c(nc - kk1)
+ wkr = x0r - x1i
+ wki = x0i + x1r
+ wji = x0r + x1i
+ wjr = x0i - x1r
+ kk1 = kk1 + ks1
+ x0r = wkr * a(k1, k2) - wki * a(k1 + 1, k2)
+ x0i = wkr * a(k1 + 1, k2) + wki * a(k1, k2)
+ x1r = wjr * a(k1, j2) - wji * a(k1 + 1, j2)
+ x1i = wjr * a(k1 + 1, j2) + wji * a(k1, j2)
+ a(k1, k2) = x1i - x0r
+ a(k1 + 1, k2) = x1r + x0i
+ a(k1, j2) = x0i - x1r
+ a(k1 + 1, j2) = x0r + x1i
+ end do
+ wkr = w2r * 0.5d0
+ wki = w2i * 0.5d0
+ wjr = w2r * c(kk1)
+ wji = w2i * c(kk1)
+ x0r = a(0, k2) + a(0, j2)
+ x0i = a(1, k2) - a(1, j2)
+ x1r = a(0, k2) - a(0, j2)
+ x1i = a(1, k2) + a(1, j2)
+ a(1, k2) = wkr * x0r - wki * x0i
+ a(0, k2) = wkr * x0i + wki * x0r
+ a(1, j2) = -wjr * x1r + wji * x1i
+ a(0, j2) = wjr * x1i + wji * x1r
+ end do
+ w2r = 2 * c(kk2)
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ wkr = 2 * c(kk1)
+ wki = 2 * c(nc - kk1)
+ wjr = w2r * wkr
+ wji = w2r * wki
+ kk1 = kk1 + ks1
+ x0i = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
+ a(k1 + 1, 0) = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
+ a(k1, 0) = x0i
+ x0i = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
+ a(k1 + 1, n2h) = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
+ a(k1, n2h) = x0i
+ end do
+ a(1, 0) = a(1, 0) * w2r
+ a(0, n2h) = a(0, n2h) * w2r
+ a(1, n2h) = a(1, n2h) * 0.5d0
+ end
+!
+ subroutine dstfsub(n1max, n1, n2, a, nc, c)
+ integer n1max, n1, n2, nc, kk1, kk2, ks1, ks2, n2h, j2,
+ & k1, k2
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), c(0 : nc - 1),
+ & w2r, w2i, wkr, wki, wjr, wji, x0r, x0i, x1r, x1i
+ ks1 = nc / n1
+ ks2 = nc / n2
+ n2h = n2 / 2
+ kk2 = ks2
+ do k2 = 1, n2h - 1
+ j2 = n2 - k2
+ w2r = 2 * c(kk2)
+ w2i = 2 * c(nc - kk2)
+ kk2 = kk2 + ks2
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ x0r = w2r * c(kk1)
+ x0i = w2i * c(kk1)
+ x1r = w2r * c(nc - kk1)
+ x1i = w2i * c(nc - kk1)
+ wkr = x0r - x1i
+ wki = x0i + x1r
+ wji = x0r + x1i
+ wjr = x0i - x1r
+ kk1 = kk1 + ks1
+ x0r = a(k1 + 1, j2) - a(k1, k2)
+ x0i = a(k1 + 1, k2) + a(k1, j2)
+ x1r = a(k1 + 1, k2) - a(k1, j2)
+ x1i = a(k1 + 1, j2) + a(k1, k2)
+ a(k1, k2) = wkr * x0r + wki * x0i
+ a(k1 + 1, k2) = wkr * x0i - wki * x0r
+ a(k1, j2) = wjr * x1r + wji * x1i
+ a(k1 + 1, j2) = wjr * x1i - wji * x1r
+ end do
+ x0r = 2 * c(kk1)
+ wjr = x0r * w2r
+ wji = x0r * w2i
+ x0r = w2r * a(1, k2) + w2i * a(0, k2)
+ x0i = w2r * a(0, k2) - w2i * a(1, k2)
+ x1r = -wjr * a(1, j2) + wji * a(0, j2)
+ x1i = wjr * a(0, j2) + wji * a(1, j2)
+ a(0, k2) = x0r + x1r
+ a(1, k2) = x1i + x0i
+ a(0, j2) = x0r - x1r
+ a(1, j2) = x1i - x0i
+ end do
+ w2r = 2 * c(kk2)
+ kk1 = ks1
+ do k1 = 2, n1 - 2, 2
+ wkr = 2 * c(kk1)
+ wki = 2 * c(nc - kk1)
+ wjr = w2r * wkr
+ wji = w2r * wki
+ kk1 = kk1 + ks1
+ x0i = wkr * a(k1, 0) - wki * a(k1 + 1, 0)
+ a(k1, 0) = wkr * a(k1 + 1, 0) + wki * a(k1, 0)
+ a(k1 + 1, 0) = x0i
+ x0i = wjr * a(k1, n2h) - wji * a(k1 + 1, n2h)
+ a(k1, n2h) = wjr * a(k1 + 1, n2h) + wji * a(k1, n2h)
+ a(k1 + 1, n2h) = x0i
+ end do
+ w2r = w2r * 2
+ a(0, 0) = a(0, 0) * 2
+ a(1, 0) = a(1, 0) * w2r
+ a(0, n2h) = a(0, n2h) * w2r
+ end
+!
diff --git a/src/fft2d/fft2d/fftsg.c b/src/fft2d/fft2d/fftsg.c
new file mode 100644
index 0000000..43d7534
--- /dev/null
+++ b/src/fft2d/fft2d/fftsg.c
@@ -0,0 +1,3314 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+ dimension :one
+ data length :power of 2
+ decimation :frequency
+ radix :split-radix
+ data :inplace
+ table :use
+functions
+ cdft: Complex Discrete Fourier Transform
+ rdft: Real Discrete Fourier Transform
+ ddct: Discrete Cosine Transform
+ ddst: Discrete Sine Transform
+ dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+ dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+ void cdft(int, int, double *, int *, double *);
+ void rdft(int, int, double *, int *, double *);
+ void ddct(int, int, double *, int *, double *);
+ void ddst(int, int, double *, int *, double *);
+ void dfct(int, double *, double *, int *, double *);
+ void dfst(int, double *, double *, int *, double *);
+macro definitions
+ USE_CDFT_PTHREADS : default=not defined
+ CDFT_THREADS_BEGIN_N : must be >= 512, default=8192
+ CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
+ USE_CDFT_WINTHREADS : default=not defined
+ CDFT_THREADS_BEGIN_N : must be >= 512, default=32768
+ CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+ [definition]
+ <case1>
+ X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+ <case2>
+ X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+ (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ cdft(2*n, 1, a, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ cdft(2*n, -1, a, ip, w);
+ [parameters]
+ 2*n :data length (int)
+ n >= 1, n = power of 2
+ a[0...2*n-1] :input/output data (double *)
+ input data
+ a[2*j] = Re(x[j]),
+ a[2*j+1] = Im(x[j]), 0<=j<n
+ output data
+ a[2*k] = Re(X[k]),
+ a[2*k+1] = Im(X[k]), 0<=k<n
+ ip[0...*] :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ strictly,
+ length of ip >=
+ 2+(1<<(int)(log(n+0.5)/log(2))/2).
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...n/2-1] :cos/sin table (double *)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ cdft(2*n, -1, a, ip, w);
+ is
+ cdft(2*n, 1, a, ip, w);
+ for (j = 0; j <= 2 * n - 1; j++) {
+ a[j] *= 1.0 / n;
+ }
+ .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+ [definition]
+ <case1> RDFT
+ R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+ I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+ <case2> IRDFT (excluding scale)
+ a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
+ sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
+ sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ rdft(n, 1, a, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ rdft(n, -1, a, ip, w);
+ [parameters]
+ n :data length (int)
+ n >= 2, n = power of 2
+ a[0...n-1] :input/output data (double *)
+ <case1>
+ output data
+ a[2*k] = R[k], 0<=k<n/2
+ a[2*k+1] = I[k], 0<k<n/2
+ a[1] = R[n/2]
+ <case2>
+ input data
+ a[2*j] = R[j], 0<=j<n/2
+ a[2*j+1] = I[j], 0<j<n/2
+ a[1] = R[n/2]
+ ip[0...*] :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n/2)
+ strictly,
+ length of ip >=
+ 2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...n/2-1] :cos/sin table (double *)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ rdft(n, 1, a, ip, w);
+ is
+ rdft(n, -1, a, ip, w);
+ for (j = 0; j <= n - 1; j++) {
+ a[j] *= 2.0 / n;
+ }
+ .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+ [definition]
+ <case1> IDCT (excluding scale)
+ C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+ <case2> DCT
+ C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddct(n, 1, a, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddct(n, -1, a, ip, w);
+ [parameters]
+ n :data length (int)
+ n >= 2, n = power of 2
+ a[0...n-1] :input/output data (double *)
+ output data
+ a[k] = C[k], 0<=k<n
+ ip[0...*] :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n/2)
+ strictly,
+ length of ip >=
+ 2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...n*5/4-1] :cos/sin table (double *)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddct(n, -1, a, ip, w);
+ is
+ a[0] *= 0.5;
+ ddct(n, 1, a, ip, w);
+ for (j = 0; j <= n - 1; j++) {
+ a[j] *= 2.0 / n;
+ }
+ .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+ [definition]
+ <case1> IDST (excluding scale)
+ S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+ <case2> DST
+ S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddst(n, 1, a, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddst(n, -1, a, ip, w);
+ [parameters]
+ n :data length (int)
+ n >= 2, n = power of 2
+ a[0...n-1] :input/output data (double *)
+ <case1>
+ input data
+ a[j] = A[j], 0<j<n
+ a[0] = A[n]
+ output data
+ a[k] = S[k], 0<=k<n
+ <case2>
+ output data
+ a[k] = S[k], 0<k<n
+ a[0] = S[n]
+ ip[0...*] :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n/2)
+ strictly,
+ length of ip >=
+ 2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...n*5/4-1] :cos/sin table (double *)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddst(n, -1, a, ip, w);
+ is
+ a[0] *= 0.5;
+ ddst(n, 1, a, ip, w);
+ for (j = 0; j <= n - 1; j++) {
+ a[j] *= 2.0 / n;
+ }
+ .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+ [definition]
+ C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+ [usage]
+ ip[0] = 0; // first time only
+ dfct(n, a, t, ip, w);
+ [parameters]
+ n :data length - 1 (int)
+ n >= 2, n = power of 2
+ a[0...n] :input/output data (double *)
+ output data
+ a[k] = C[k], 0<=k<=n
+ t[0...n/2] :work area (double *)
+ ip[0...*] :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n/4)
+ strictly,
+ length of ip >=
+ 2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...n*5/8-1] :cos/sin table (double *)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ a[0] *= 0.5;
+ a[n] *= 0.5;
+ dfct(n, a, t, ip, w);
+ is
+ a[0] *= 0.5;
+ a[n] *= 0.5;
+ dfct(n, a, t, ip, w);
+ for (j = 0; j <= n; j++) {
+ a[j] *= 2.0 / n;
+ }
+ .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+ [definition]
+ S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+ [usage]
+ ip[0] = 0; // first time only
+ dfst(n, a, t, ip, w);
+ [parameters]
+ n :data length + 1 (int)
+ n >= 2, n = power of 2
+ a[0...n-1] :input/output data (double *)
+ output data
+ a[k] = S[k], 0<k<n
+ (a[0] is used for work area)
+ t[0...n/2-1] :work area (double *)
+ ip[0...*] :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n/4)
+ strictly,
+ length of ip >=
+ 2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...n*5/8-1] :cos/sin table (double *)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ dfst(n, a, t, ip, w);
+ is
+ dfst(n, a, t, ip, w);
+ for (j = 1; j <= n - 1; j++) {
+ a[j] *= 2.0 / n;
+ }
+ .
+
+
+Appendix :
+ The cos/sin table is recalculated when the larger table required.
+ w[] and ip[] are compatible with all routines.
+*/
+
+
+void cdft(int n, int isgn, double *a, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void cftfsub(int n, double *a, int *ip, int nw, double *w);
+ void cftbsub(int n, double *a, int *ip, int nw, double *w);
+ int nw;
+
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ if (isgn >= 0) {
+ cftfsub(n, a, ip, nw, w);
+ } else {
+ cftbsub(n, a, ip, nw, w);
+ }
+}
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void cftfsub(int n, double *a, int *ip, int nw, double *w);
+ void cftbsub(int n, double *a, int *ip, int nw, double *w);
+ void rftfsub(int n, double *a, int nc, double *c);
+ void rftbsub(int n, double *a, int nc, double *c);
+ int nw, nc;
+ double xi;
+
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > (nc << 2)) {
+ nc = n >> 2;
+ makect(nc, ip, w + nw);
+ }
+ if (isgn >= 0) {
+ if (n > 4) {
+ cftfsub(n, a, ip, nw, w);
+ rftfsub(n, a, nc, w + nw);
+ } else if (n == 4) {
+ cftfsub(n, a, ip, nw, w);
+ }
+ xi = a[0] - a[1];
+ a[0] += a[1];
+ a[1] = xi;
+ } else {
+ a[1] = 0.5 * (a[0] - a[1]);
+ a[0] -= a[1];
+ if (n > 4) {
+ rftbsub(n, a, nc, w + nw);
+ cftbsub(n, a, ip, nw, w);
+ } else if (n == 4) {
+ cftbsub(n, a, ip, nw, w);
+ }
+ }
+}
+
+
+void ddct(int n, int isgn, double *a, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void cftfsub(int n, double *a, int *ip, int nw, double *w);
+ void cftbsub(int n, double *a, int *ip, int nw, double *w);
+ void rftfsub(int n, double *a, int nc, double *c);
+ void rftbsub(int n, double *a, int nc, double *c);
+ void dctsub(int n, double *a, int nc, double *c);
+ int j, nw, nc;
+ double xr;
+
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > nc) {
+ nc = n;
+ makect(nc, ip, w + nw);
+ }
+ if (isgn < 0) {
+ xr = a[n - 1];
+ for (j = n - 2; j >= 2; j -= 2) {
+ a[j + 1] = a[j] - a[j - 1];
+ a[j] += a[j - 1];
+ }
+ a[1] = a[0] - xr;
+ a[0] += xr;
+ if (n > 4) {
+ rftbsub(n, a, nc, w + nw);
+ cftbsub(n, a, ip, nw, w);
+ } else if (n == 4) {
+ cftbsub(n, a, ip, nw, w);
+ }
+ }
+ dctsub(n, a, nc, w + nw);
+ if (isgn >= 0) {
+ if (n > 4) {
+ cftfsub(n, a, ip, nw, w);
+ rftfsub(n, a, nc, w + nw);
+ } else if (n == 4) {
+ cftfsub(n, a, ip, nw, w);
+ }
+ xr = a[0] - a[1];
+ a[0] += a[1];
+ for (j = 2; j < n; j += 2) {
+ a[j - 1] = a[j] - a[j + 1];
+ a[j] += a[j + 1];
+ }
+ a[n - 1] = xr;
+ }
+}
+
+
+void ddst(int n, int isgn, double *a, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void cftfsub(int n, double *a, int *ip, int nw, double *w);
+ void cftbsub(int n, double *a, int *ip, int nw, double *w);
+ void rftfsub(int n, double *a, int nc, double *c);
+ void rftbsub(int n, double *a, int nc, double *c);
+ void dstsub(int n, double *a, int nc, double *c);
+ int j, nw, nc;
+ double xr;
+
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > nc) {
+ nc = n;
+ makect(nc, ip, w + nw);
+ }
+ if (isgn < 0) {
+ xr = a[n - 1];
+ for (j = n - 2; j >= 2; j -= 2) {
+ a[j + 1] = -a[j] - a[j - 1];
+ a[j] -= a[j - 1];
+ }
+ a[1] = a[0] + xr;
+ a[0] -= xr;
+ if (n > 4) {
+ rftbsub(n, a, nc, w + nw);
+ cftbsub(n, a, ip, nw, w);
+ } else if (n == 4) {
+ cftbsub(n, a, ip, nw, w);
+ }
+ }
+ dstsub(n, a, nc, w + nw);
+ if (isgn >= 0) {
+ if (n > 4) {
+ cftfsub(n, a, ip, nw, w);
+ rftfsub(n, a, nc, w + nw);
+ } else if (n == 4) {
+ cftfsub(n, a, ip, nw, w);
+ }
+ xr = a[0] - a[1];
+ a[0] += a[1];
+ for (j = 2; j < n; j += 2) {
+ a[j - 1] = -a[j] - a[j + 1];
+ a[j] -= a[j + 1];
+ }
+ a[n - 1] = -xr;
+ }
+}
+
+
+void dfct(int n, double *a, double *t, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void cftfsub(int n, double *a, int *ip, int nw, double *w);
+ void rftfsub(int n, double *a, int nc, double *c);
+ void dctsub(int n, double *a, int nc, double *c);
+ int j, k, l, m, mh, nw, nc;
+ double xr, xi, yr, yi;
+
+ nw = ip[0];
+ if (n > (nw << 3)) {
+ nw = n >> 3;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > (nc << 1)) {
+ nc = n >> 1;
+ makect(nc, ip, w + nw);
+ }
+ m = n >> 1;
+ yi = a[m];
+ xi = a[0] + a[n];
+ a[0] -= a[n];
+ t[0] = xi - yi;
+ t[m] = xi + yi;
+ if (n > 2) {
+ mh = m >> 1;
+ for (j = 1; j < mh; j++) {
+ k = m - j;
+ xr = a[j] - a[n - j];
+ xi = a[j] + a[n - j];
+ yr = a[k] - a[n - k];
+ yi = a[k] + a[n - k];
+ a[j] = xr;
+ a[k] = yr;
+ t[j] = xi - yi;
+ t[k] = xi + yi;
+ }
+ t[mh] = a[mh] + a[n - mh];
+ a[mh] -= a[n - mh];
+ dctsub(m, a, nc, w + nw);
+ if (m > 4) {
+ cftfsub(m, a, ip, nw, w);
+ rftfsub(m, a, nc, w + nw);
+ } else if (m == 4) {
+ cftfsub(m, a, ip, nw, w);
+ }
+ a[n - 1] = a[0] - a[1];
+ a[1] = a[0] + a[1];
+ for (j = m - 2; j >= 2; j -= 2) {
+ a[2 * j + 1] = a[j] + a[j + 1];
+ a[2 * j - 1] = a[j] - a[j + 1];
+ }
+ l = 2;
+ m = mh;
+ while (m >= 2) {
+ dctsub(m, t, nc, w + nw);
+ if (m > 4) {
+ cftfsub(m, t, ip, nw, w);
+ rftfsub(m, t, nc, w + nw);
+ } else if (m == 4) {
+ cftfsub(m, t, ip, nw, w);
+ }
+ a[n - l] = t[0] - t[1];
+ a[l] = t[0] + t[1];
+ k = 0;
+ for (j = 2; j < m; j += 2) {
+ k += l << 2;
+ a[k - l] = t[j] - t[j + 1];
+ a[k + l] = t[j] + t[j + 1];
+ }
+ l <<= 1;
+ mh = m >> 1;
+ for (j = 0; j < mh; j++) {
+ k = m - j;
+ t[j] = t[m + k] - t[m + j];
+ t[k] = t[m + k] + t[m + j];
+ }
+ t[mh] = t[m + mh];
+ m = mh;
+ }
+ a[l] = t[0];
+ a[n] = t[2] - t[1];
+ a[0] = t[2] + t[1];
+ } else {
+ a[1] = a[0];
+ a[2] = t[0];
+ a[0] = t[1];
+ }
+}
+
+
+void dfst(int n, double *a, double *t, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void cftfsub(int n, double *a, int *ip, int nw, double *w);
+ void rftfsub(int n, double *a, int nc, double *c);
+ void dstsub(int n, double *a, int nc, double *c);
+ int j, k, l, m, mh, nw, nc;
+ double xr, xi, yr, yi;
+
+ nw = ip[0];
+ if (n > (nw << 3)) {
+ nw = n >> 3;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > (nc << 1)) {
+ nc = n >> 1;
+ makect(nc, ip, w + nw);
+ }
+ if (n > 2) {
+ m = n >> 1;
+ mh = m >> 1;
+ for (j = 1; j < mh; j++) {
+ k = m - j;
+ xr = a[j] + a[n - j];
+ xi = a[j] - a[n - j];
+ yr = a[k] + a[n - k];
+ yi = a[k] - a[n - k];
+ a[j] = xr;
+ a[k] = yr;
+ t[j] = xi + yi;
+ t[k] = xi - yi;
+ }
+ t[0] = a[mh] - a[n - mh];
+ a[mh] += a[n - mh];
+ a[0] = a[m];
+ dstsub(m, a, nc, w + nw);
+ if (m > 4) {
+ cftfsub(m, a, ip, nw, w);
+ rftfsub(m, a, nc, w + nw);
+ } else if (m == 4) {
+ cftfsub(m, a, ip, nw, w);
+ }
+ a[n - 1] = a[1] - a[0];
+ a[1] = a[0] + a[1];
+ for (j = m - 2; j >= 2; j -= 2) {
+ a[2 * j + 1] = a[j] - a[j + 1];
+ a[2 * j - 1] = -a[j] - a[j + 1];
+ }
+ l = 2;
+ m = mh;
+ while (m >= 2) {
+ dstsub(m, t, nc, w + nw);
+ if (m > 4) {
+ cftfsub(m, t, ip, nw, w);
+ rftfsub(m, t, nc, w + nw);
+ } else if (m == 4) {
+ cftfsub(m, t, ip, nw, w);
+ }
+ a[n - l] = t[1] - t[0];
+ a[l] = t[0] + t[1];
+ k = 0;
+ for (j = 2; j < m; j += 2) {
+ k += l << 2;
+ a[k - l] = -t[j] - t[j + 1];
+ a[k + l] = t[j] - t[j + 1];
+ }
+ l <<= 1;
+ mh = m >> 1;
+ for (j = 1; j < mh; j++) {
+ k = m - j;
+ t[j] = t[m + k] + t[m + j];
+ t[k] = t[m + k] - t[m + j];
+ }
+ t[0] = t[m + mh];
+ m = mh;
+ }
+ a[l] = t[0];
+ }
+ a[0] = 0;
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w)
+{
+ void makeipt(int nw, int *ip);
+ int j, nwh, nw0, nw1;
+ double delta, wn4r, wk1r, wk1i, wk3r, wk3i;
+
+ ip[0] = nw;
+ ip[1] = 1;
+ if (nw > 2) {
+ nwh = nw >> 1;
+ delta = atan(1.0) / nwh;
+ wn4r = cos(delta * nwh);
+ w[0] = 1;
+ w[1] = wn4r;
+ if (nwh == 4) {
+ w[2] = cos(delta * 2);
+ w[3] = sin(delta * 2);
+ } else if (nwh > 4) {
+ makeipt(nw, ip);
+ w[2] = 0.5 / cos(delta * 2);
+ w[3] = 0.5 / cos(delta * 6);
+ for (j = 4; j < nwh; j += 4) {
+ w[j] = cos(delta * j);
+ w[j + 1] = sin(delta * j);
+ w[j + 2] = cos(3 * delta * j);
+ w[j + 3] = -sin(3 * delta * j);
+ }
+ }
+ nw0 = 0;
+ while (nwh > 2) {
+ nw1 = nw0 + nwh;
+ nwh >>= 1;
+ w[nw1] = 1;
+ w[nw1 + 1] = wn4r;
+ if (nwh == 4) {
+ wk1r = w[nw0 + 4];
+ wk1i = w[nw0 + 5];
+ w[nw1 + 2] = wk1r;
+ w[nw1 + 3] = wk1i;
+ } else if (nwh > 4) {
+ wk1r = w[nw0 + 4];
+ wk3r = w[nw0 + 6];
+ w[nw1 + 2] = 0.5 / wk1r;
+ w[nw1 + 3] = 0.5 / wk3r;
+ for (j = 4; j < nwh; j += 4) {
+ wk1r = w[nw0 + 2 * j];
+ wk1i = w[nw0 + 2 * j + 1];
+ wk3r = w[nw0 + 2 * j + 2];
+ wk3i = w[nw0 + 2 * j + 3];
+ w[nw1 + j] = wk1r;
+ w[nw1 + j + 1] = wk1i;
+ w[nw1 + j + 2] = wk3r;
+ w[nw1 + j + 3] = wk3i;
+ }
+ }
+ nw0 = nw1;
+ }
+ }
+}
+
+
+void makeipt(int nw, int *ip)
+{
+ int j, l, m, m2, p, q;
+
+ ip[2] = 0;
+ ip[3] = 16;
+ m = 2;
+ for (l = nw; l > 32; l >>= 2) {
+ m2 = m << 1;
+ q = m2 << 3;
+ for (j = m; j < m2; j++) {
+ p = ip[j] << 2;
+ ip[m + j] = p;
+ ip[m2 + j] = p + q;
+ }
+ m = m2;
+ }
+}
+
+
+void makect(int nc, int *ip, double *c)
+{
+ int j, nch;
+ double delta;
+
+ ip[1] = nc;
+ if (nc > 1) {
+ nch = nc >> 1;
+ delta = atan(1.0) / nch;
+ c[0] = cos(delta * nch);
+ c[nch] = 0.5 * c[0];
+ for (j = 1; j < nch; j++) {
+ c[j] = 0.5 * cos(delta * j);
+ c[nc - j] = 0.5 * sin(delta * j);
+ }
+ }
+}
+
+
+/* -------- child routines -------- */
+
+
+#ifdef USE_CDFT_PTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 8192
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t pthread_t
+#define cdft_thread_create(thp,func,argp) { \
+ if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
+ fprintf(stderr, "cdft thread error\n"); \
+ exit(1); \
+ } \
+}
+#define cdft_thread_wait(th) { \
+ if (pthread_join(th, NULL) != 0) { \
+ fprintf(stderr, "cdft thread error\n"); \
+ exit(1); \
+ } \
+}
+#endif /* USE_CDFT_PTHREADS */
+
+
+#ifdef USE_CDFT_WINTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 32768
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 524288
+#endif
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t HANDLE
+#define cdft_thread_create(thp,func,argp) { \
+ DWORD thid; \
+ *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
+ if (*(thp) == 0) { \
+ fprintf(stderr, "cdft thread error\n"); \
+ exit(1); \
+ } \
+}
+#define cdft_thread_wait(th) { \
+ WaitForSingleObject(th, INFINITE); \
+ CloseHandle(th); \
+}
+#endif /* USE_CDFT_WINTHREADS */
+
+
+void cftfsub(int n, double *a, int *ip, int nw, double *w)
+{
+ void bitrv2(int n, int *ip, double *a);
+ void bitrv216(double *a);
+ void bitrv208(double *a);
+ void cftf1st(int n, double *a, double *w);
+ void cftrec4(int n, double *a, int nw, double *w);
+ void cftleaf(int n, int isplt, double *a, int nw, double *w);
+ void cftfx41(int n, double *a, int nw, double *w);
+ void cftf161(double *a, double *w);
+ void cftf081(double *a, double *w);
+ void cftf040(double *a);
+ void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+ void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+
+ if (n > 8) {
+ if (n > 32) {
+ cftf1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+ if (n > CDFT_THREADS_BEGIN_N) {
+ cftrec4_th(n, a, nw, w);
+ } else
+#endif /* USE_CDFT_THREADS */
+ if (n > 512) {
+ cftrec4(n, a, nw, w);
+ } else if (n > 128) {
+ cftleaf(n, 1, a, nw, w);
+ } else {
+ cftfx41(n, a, nw, w);
+ }
+ bitrv2(n, ip, a);
+ } else if (n == 32) {
+ cftf161(a, &w[nw - 8]);
+ bitrv216(a);
+ } else {
+ cftf081(a, w);
+ bitrv208(a);
+ }
+ } else if (n == 8) {
+ cftf040(a);
+ } else if (n == 4) {
+ cftx020(a);
+ }
+}
+
+
+void cftbsub(int n, double *a, int *ip, int nw, double *w)
+{
+ void bitrv2conj(int n, int *ip, double *a);
+ void bitrv216neg(double *a);
+ void bitrv208neg(double *a);
+ void cftb1st(int n, double *a, double *w);
+ void cftrec4(int n, double *a, int nw, double *w);
+ void cftleaf(int n, int isplt, double *a, int nw, double *w);
+ void cftfx41(int n, double *a, int nw, double *w);
+ void cftf161(double *a, double *w);
+ void cftf081(double *a, double *w);
+ void cftb040(double *a);
+ void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+ void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+
+ if (n > 8) {
+ if (n > 32) {
+ cftb1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+ if (n > CDFT_THREADS_BEGIN_N) {
+ cftrec4_th(n, a, nw, w);
+ } else
+#endif /* USE_CDFT_THREADS */
+ if (n > 512) {
+ cftrec4(n, a, nw, w);
+ } else if (n > 128) {
+ cftleaf(n, 1, a, nw, w);
+ } else {
+ cftfx41(n, a, nw, w);
+ }
+ bitrv2conj(n, ip, a);
+ } else if (n == 32) {
+ cftf161(a, &w[nw - 8]);
+ bitrv216neg(a);
+ } else {
+ cftf081(a, w);
+ bitrv208neg(a);
+ }
+ } else if (n == 8) {
+ cftb040(a);
+ } else if (n == 4) {
+ cftx020(a);
+ }
+}
+
+
+void bitrv2(int n, int *ip, double *a)
+{
+ int j, j1, k, k1, l, m, nh, nm;
+ double xr, xi, yr, yi;
+
+ m = 1;
+ for (l = n >> 2; l > 8; l >>= 2) {
+ m <<= 1;
+ }
+ nh = n >> 1;
+ nm = 4 * m;
+ if (l == 8) {
+ for (k = 0; k < m; k++) {
+ for (j = 0; j < k; j++) {
+ j1 = 4 * j + 2 * ip[m + k];
+ k1 = 4 * k + 2 * ip[m + j];
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nh;
+ k1 += 2;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += 2;
+ k1 += nh;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nh;
+ k1 -= 2;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ k1 = 4 * k + 2 * ip[m + k];
+ j1 = k1 + 2;
+ k1 += nh;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= 2;
+ k1 -= nh;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nh + 2;
+ k1 += nh + 2;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nh - nm;
+ k1 += 2 * nm - 2;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ } else {
+ for (k = 0; k < m; k++) {
+ for (j = 0; j < k; j++) {
+ j1 = 4 * j + ip[m + k];
+ k1 = 4 * k + ip[m + j];
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nh;
+ k1 += 2;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += 2;
+ k1 += nh;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nh;
+ k1 -= 2;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ k1 = 4 * k + ip[m + k];
+ j1 = k1 + 2;
+ k1 += nh;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = a[j1 + 1];
+ yr = a[k1];
+ yi = a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ }
+}
+
+
+void bitrv2conj(int n, int *ip, double *a)
+{
+ int j, j1, k, k1, l, m, nh, nm;
+ double xr, xi, yr, yi;
+
+ m = 1;
+ for (l = n >> 2; l > 8; l >>= 2) {
+ m <<= 1;
+ }
+ nh = n >> 1;
+ nm = 4 * m;
+ if (l == 8) {
+ for (k = 0; k < m; k++) {
+ for (j = 0; j < k; j++) {
+ j1 = 4 * j + 2 * ip[m + k];
+ k1 = 4 * k + 2 * ip[m + j];
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nh;
+ k1 += 2;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += 2;
+ k1 += nh;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nh;
+ k1 -= 2;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ k1 = 4 * k + 2 * ip[m + k];
+ j1 = k1 + 2;
+ k1 += nh;
+ a[j1 - 1] = -a[j1 - 1];
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ a[k1 + 3] = -a[k1 + 3];
+ j1 += nm;
+ k1 += 2 * nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= 2;
+ k1 -= nh;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nh + 2;
+ k1 += nh + 2;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nh - nm;
+ k1 += 2 * nm - 2;
+ a[j1 - 1] = -a[j1 - 1];
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ a[k1 + 3] = -a[k1 + 3];
+ }
+ } else {
+ for (k = 0; k < m; k++) {
+ for (j = 0; j < k; j++) {
+ j1 = 4 * j + ip[m + k];
+ k1 = 4 * k + ip[m + j];
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nh;
+ k1 += 2;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += 2;
+ k1 += nh;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 += nm;
+ k1 += nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nh;
+ k1 -= 2;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ j1 -= nm;
+ k1 -= nm;
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ }
+ k1 = 4 * k + ip[m + k];
+ j1 = k1 + 2;
+ k1 += nh;
+ a[j1 - 1] = -a[j1 - 1];
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ a[k1 + 3] = -a[k1 + 3];
+ j1 += nm;
+ k1 += nm;
+ a[j1 - 1] = -a[j1 - 1];
+ xr = a[j1];
+ xi = -a[j1 + 1];
+ yr = a[k1];
+ yi = -a[k1 + 1];
+ a[j1] = yr;
+ a[j1 + 1] = yi;
+ a[k1] = xr;
+ a[k1 + 1] = xi;
+ a[k1 + 3] = -a[k1 + 3];
+ }
+ }
+}
+
+
+void bitrv216(double *a)
+{
+ double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+ x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i,
+ x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
+
+ x1r = a[2];
+ x1i = a[3];
+ x2r = a[4];
+ x2i = a[5];
+ x3r = a[6];
+ x3i = a[7];
+ x4r = a[8];
+ x4i = a[9];
+ x5r = a[10];
+ x5i = a[11];
+ x7r = a[14];
+ x7i = a[15];
+ x8r = a[16];
+ x8i = a[17];
+ x10r = a[20];
+ x10i = a[21];
+ x11r = a[22];
+ x11i = a[23];
+ x12r = a[24];
+ x12i = a[25];
+ x13r = a[26];
+ x13i = a[27];
+ x14r = a[28];
+ x14i = a[29];
+ a[2] = x8r;
+ a[3] = x8i;
+ a[4] = x4r;
+ a[5] = x4i;
+ a[6] = x12r;
+ a[7] = x12i;
+ a[8] = x2r;
+ a[9] = x2i;
+ a[10] = x10r;
+ a[11] = x10i;
+ a[14] = x14r;
+ a[15] = x14i;
+ a[16] = x1r;
+ a[17] = x1i;
+ a[20] = x5r;
+ a[21] = x5i;
+ a[22] = x13r;
+ a[23] = x13i;
+ a[24] = x3r;
+ a[25] = x3i;
+ a[26] = x11r;
+ a[27] = x11i;
+ a[28] = x7r;
+ a[29] = x7i;
+}
+
+
+void bitrv216neg(double *a)
+{
+ double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+ x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i,
+ x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i,
+ x13r, x13i, x14r, x14i, x15r, x15i;
+
+ x1r = a[2];
+ x1i = a[3];
+ x2r = a[4];
+ x2i = a[5];
+ x3r = a[6];
+ x3i = a[7];
+ x4r = a[8];
+ x4i = a[9];
+ x5r = a[10];
+ x5i = a[11];
+ x6r = a[12];
+ x6i = a[13];
+ x7r = a[14];
+ x7i = a[15];
+ x8r = a[16];
+ x8i = a[17];
+ x9r = a[18];
+ x9i = a[19];
+ x10r = a[20];
+ x10i = a[21];
+ x11r = a[22];
+ x11i = a[23];
+ x12r = a[24];
+ x12i = a[25];
+ x13r = a[26];
+ x13i = a[27];
+ x14r = a[28];
+ x14i = a[29];
+ x15r = a[30];
+ x15i = a[31];
+ a[2] = x15r;
+ a[3] = x15i;
+ a[4] = x7r;
+ a[5] = x7i;
+ a[6] = x11r;
+ a[7] = x11i;
+ a[8] = x3r;
+ a[9] = x3i;
+ a[10] = x13r;
+ a[11] = x13i;
+ a[12] = x5r;
+ a[13] = x5i;
+ a[14] = x9r;
+ a[15] = x9i;
+ a[16] = x1r;
+ a[17] = x1i;
+ a[18] = x14r;
+ a[19] = x14i;
+ a[20] = x6r;
+ a[21] = x6i;
+ a[22] = x10r;
+ a[23] = x10i;
+ a[24] = x2r;
+ a[25] = x2i;
+ a[26] = x12r;
+ a[27] = x12i;
+ a[28] = x4r;
+ a[29] = x4i;
+ a[30] = x8r;
+ a[31] = x8i;
+}
+
+
+void bitrv208(double *a)
+{
+ double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
+
+ x1r = a[2];
+ x1i = a[3];
+ x3r = a[6];
+ x3i = a[7];
+ x4r = a[8];
+ x4i = a[9];
+ x6r = a[12];
+ x6i = a[13];
+ a[2] = x4r;
+ a[3] = x4i;
+ a[6] = x6r;
+ a[7] = x6i;
+ a[8] = x1r;
+ a[9] = x1i;
+ a[12] = x3r;
+ a[13] = x3i;
+}
+
+
+void bitrv208neg(double *a)
+{
+ double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+ x5r, x5i, x6r, x6i, x7r, x7i;
+
+ x1r = a[2];
+ x1i = a[3];
+ x2r = a[4];
+ x2i = a[5];
+ x3r = a[6];
+ x3i = a[7];
+ x4r = a[8];
+ x4i = a[9];
+ x5r = a[10];
+ x5i = a[11];
+ x6r = a[12];
+ x6i = a[13];
+ x7r = a[14];
+ x7i = a[15];
+ a[2] = x7r;
+ a[3] = x7i;
+ a[4] = x3r;
+ a[5] = x3i;
+ a[6] = x5r;
+ a[7] = x5i;
+ a[8] = x1r;
+ a[9] = x1i;
+ a[10] = x6r;
+ a[11] = x6i;
+ a[12] = x2r;
+ a[13] = x2i;
+ a[14] = x4r;
+ a[15] = x4i;
+}
+
+
+void cftf1st(int n, double *a, double *w)
+{
+ int j, j0, j1, j2, j3, k, m, mh;
+ double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,
+ wd1r, wd1i, wd3r, wd3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+ y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+
+ mh = n >> 3;
+ m = 2 * mh;
+ j1 = m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[0] + a[j2];
+ x0i = a[1] + a[j2 + 1];
+ x1r = a[0] - a[j2];
+ x1i = a[1] - a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[0] = x0r + x2r;
+ a[1] = x0i + x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ a[j2] = x1r - x3i;
+ a[j2 + 1] = x1i + x3r;
+ a[j3] = x1r + x3i;
+ a[j3 + 1] = x1i - x3r;
+ wn4r = w[1];
+ csc1 = w[2];
+ csc3 = w[3];
+ wd1r = 1;
+ wd1i = 0;
+ wd3r = 1;
+ wd3i = 0;
+ k = 0;
+ for (j = 2; j < mh - 2; j += 4) {
+ k += 4;
+ wk1r = csc1 * (wd1r + w[k]);
+ wk1i = csc1 * (wd1i + w[k + 1]);
+ wk3r = csc3 * (wd3r + w[k + 2]);
+ wk3i = csc3 * (wd3i + w[k + 3]);
+ wd1r = w[k];
+ wd1i = w[k + 1];
+ wd3r = w[k + 2];
+ wd3i = w[k + 3];
+ j1 = j + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j] + a[j2];
+ x0i = a[j + 1] + a[j2 + 1];
+ x1r = a[j] - a[j2];
+ x1i = a[j + 1] - a[j2 + 1];
+ y0r = a[j + 2] + a[j2 + 2];
+ y0i = a[j + 3] + a[j2 + 3];
+ y1r = a[j + 2] - a[j2 + 2];
+ y1i = a[j + 3] - a[j2 + 3];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ y2r = a[j1 + 2] + a[j3 + 2];
+ y2i = a[j1 + 3] + a[j3 + 3];
+ y3r = a[j1 + 2] - a[j3 + 2];
+ y3i = a[j1 + 3] - a[j3 + 3];
+ a[j] = x0r + x2r;
+ a[j + 1] = x0i + x2i;
+ a[j + 2] = y0r + y2r;
+ a[j + 3] = y0i + y2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ a[j1 + 2] = y0r - y2r;
+ a[j1 + 3] = y0i - y2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2] = wk1r * x0r - wk1i * x0i;
+ a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+ x0r = y1r - y3i;
+ x0i = y1i + y3r;
+ a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+ a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3] = wk3r * x0r + wk3i * x0i;
+ a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+ x0r = y1r + y3i;
+ x0i = y1i - y3r;
+ a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+ a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+ j0 = m - j;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0] + a[j2];
+ x0i = a[j0 + 1] + a[j2 + 1];
+ x1r = a[j0] - a[j2];
+ x1i = a[j0 + 1] - a[j2 + 1];
+ y0r = a[j0 - 2] + a[j2 - 2];
+ y0i = a[j0 - 1] + a[j2 - 1];
+ y1r = a[j0 - 2] - a[j2 - 2];
+ y1i = a[j0 - 1] - a[j2 - 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ y2r = a[j1 - 2] + a[j3 - 2];
+ y2i = a[j1 - 1] + a[j3 - 1];
+ y3r = a[j1 - 2] - a[j3 - 2];
+ y3i = a[j1 - 1] - a[j3 - 1];
+ a[j0] = x0r + x2r;
+ a[j0 + 1] = x0i + x2i;
+ a[j0 - 2] = y0r + y2r;
+ a[j0 - 1] = y0i + y2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ a[j1 - 2] = y0r - y2r;
+ a[j1 - 1] = y0i - y2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2] = wk1i * x0r - wk1r * x0i;
+ a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+ x0r = y1r - y3i;
+ x0i = y1i + y3r;
+ a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+ a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3] = wk3i * x0r + wk3r * x0i;
+ a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+ x0r = y1r + y3i;
+ x0i = y1i - y3r;
+ a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+ a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+ }
+ wk1r = csc1 * (wd1r + wn4r);
+ wk1i = csc1 * (wd1i + wn4r);
+ wk3r = csc3 * (wd3r - wn4r);
+ wk3i = csc3 * (wd3i - wn4r);
+ j0 = mh;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0 - 2] + a[j2 - 2];
+ x0i = a[j0 - 1] + a[j2 - 1];
+ x1r = a[j0 - 2] - a[j2 - 2];
+ x1i = a[j0 - 1] - a[j2 - 1];
+ x2r = a[j1 - 2] + a[j3 - 2];
+ x2i = a[j1 - 1] + a[j3 - 1];
+ x3r = a[j1 - 2] - a[j3 - 2];
+ x3i = a[j1 - 1] - a[j3 - 1];
+ a[j0 - 2] = x0r + x2r;
+ a[j0 - 1] = x0i + x2i;
+ a[j1 - 2] = x0r - x2r;
+ a[j1 - 1] = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+ a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+ a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+ x0r = a[j0] + a[j2];
+ x0i = a[j0 + 1] + a[j2 + 1];
+ x1r = a[j0] - a[j2];
+ x1i = a[j0 + 1] - a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[j0] = x0r + x2r;
+ a[j0 + 1] = x0i + x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2] = wn4r * (x0r - x0i);
+ a[j2 + 1] = wn4r * (x0i + x0r);
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3] = -wn4r * (x0r + x0i);
+ a[j3 + 1] = -wn4r * (x0i - x0r);
+ x0r = a[j0 + 2] + a[j2 + 2];
+ x0i = a[j0 + 3] + a[j2 + 3];
+ x1r = a[j0 + 2] - a[j2 + 2];
+ x1i = a[j0 + 3] - a[j2 + 3];
+ x2r = a[j1 + 2] + a[j3 + 2];
+ x2i = a[j1 + 3] + a[j3 + 3];
+ x3r = a[j1 + 2] - a[j3 + 2];
+ x3i = a[j1 + 3] - a[j3 + 3];
+ a[j0 + 2] = x0r + x2r;
+ a[j0 + 3] = x0i + x2i;
+ a[j1 + 2] = x0r - x2r;
+ a[j1 + 3] = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+ a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+ a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+void cftb1st(int n, double *a, double *w)
+{
+ int j, j0, j1, j2, j3, k, m, mh;
+ double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,
+ wd1r, wd1i, wd3r, wd3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+ y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+
+ mh = n >> 3;
+ m = 2 * mh;
+ j1 = m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[0] + a[j2];
+ x0i = -a[1] - a[j2 + 1];
+ x1r = a[0] - a[j2];
+ x1i = -a[1] + a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[0] = x0r + x2r;
+ a[1] = x0i - x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i + x2i;
+ a[j2] = x1r + x3i;
+ a[j2 + 1] = x1i + x3r;
+ a[j3] = x1r - x3i;
+ a[j3 + 1] = x1i - x3r;
+ wn4r = w[1];
+ csc1 = w[2];
+ csc3 = w[3];
+ wd1r = 1;
+ wd1i = 0;
+ wd3r = 1;
+ wd3i = 0;
+ k = 0;
+ for (j = 2; j < mh - 2; j += 4) {
+ k += 4;
+ wk1r = csc1 * (wd1r + w[k]);
+ wk1i = csc1 * (wd1i + w[k + 1]);
+ wk3r = csc3 * (wd3r + w[k + 2]);
+ wk3i = csc3 * (wd3i + w[k + 3]);
+ wd1r = w[k];
+ wd1i = w[k + 1];
+ wd3r = w[k + 2];
+ wd3i = w[k + 3];
+ j1 = j + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j] + a[j2];
+ x0i = -a[j + 1] - a[j2 + 1];
+ x1r = a[j] - a[j2];
+ x1i = -a[j + 1] + a[j2 + 1];
+ y0r = a[j + 2] + a[j2 + 2];
+ y0i = -a[j + 3] - a[j2 + 3];
+ y1r = a[j + 2] - a[j2 + 2];
+ y1i = -a[j + 3] + a[j2 + 3];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ y2r = a[j1 + 2] + a[j3 + 2];
+ y2i = a[j1 + 3] + a[j3 + 3];
+ y3r = a[j1 + 2] - a[j3 + 2];
+ y3i = a[j1 + 3] - a[j3 + 3];
+ a[j] = x0r + x2r;
+ a[j + 1] = x0i - x2i;
+ a[j + 2] = y0r + y2r;
+ a[j + 3] = y0i - y2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i + x2i;
+ a[j1 + 2] = y0r - y2r;
+ a[j1 + 3] = y0i + y2i;
+ x0r = x1r + x3i;
+ x0i = x1i + x3r;
+ a[j2] = wk1r * x0r - wk1i * x0i;
+ a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+ x0r = y1r + y3i;
+ x0i = y1i + y3r;
+ a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+ a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i - x3r;
+ a[j3] = wk3r * x0r + wk3i * x0i;
+ a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+ x0r = y1r - y3i;
+ x0i = y1i - y3r;
+ a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+ a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+ j0 = m - j;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0] + a[j2];
+ x0i = -a[j0 + 1] - a[j2 + 1];
+ x1r = a[j0] - a[j2];
+ x1i = -a[j0 + 1] + a[j2 + 1];
+ y0r = a[j0 - 2] + a[j2 - 2];
+ y0i = -a[j0 - 1] - a[j2 - 1];
+ y1r = a[j0 - 2] - a[j2 - 2];
+ y1i = -a[j0 - 1] + a[j2 - 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ y2r = a[j1 - 2] + a[j3 - 2];
+ y2i = a[j1 - 1] + a[j3 - 1];
+ y3r = a[j1 - 2] - a[j3 - 2];
+ y3i = a[j1 - 1] - a[j3 - 1];
+ a[j0] = x0r + x2r;
+ a[j0 + 1] = x0i - x2i;
+ a[j0 - 2] = y0r + y2r;
+ a[j0 - 1] = y0i - y2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i + x2i;
+ a[j1 - 2] = y0r - y2r;
+ a[j1 - 1] = y0i + y2i;
+ x0r = x1r + x3i;
+ x0i = x1i + x3r;
+ a[j2] = wk1i * x0r - wk1r * x0i;
+ a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+ x0r = y1r + y3i;
+ x0i = y1i + y3r;
+ a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+ a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i - x3r;
+ a[j3] = wk3i * x0r + wk3r * x0i;
+ a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+ x0r = y1r - y3i;
+ x0i = y1i - y3r;
+ a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+ a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+ }
+ wk1r = csc1 * (wd1r + wn4r);
+ wk1i = csc1 * (wd1i + wn4r);
+ wk3r = csc3 * (wd3r - wn4r);
+ wk3i = csc3 * (wd3i - wn4r);
+ j0 = mh;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0 - 2] + a[j2 - 2];
+ x0i = -a[j0 - 1] - a[j2 - 1];
+ x1r = a[j0 - 2] - a[j2 - 2];
+ x1i = -a[j0 - 1] + a[j2 - 1];
+ x2r = a[j1 - 2] + a[j3 - 2];
+ x2i = a[j1 - 1] + a[j3 - 1];
+ x3r = a[j1 - 2] - a[j3 - 2];
+ x3i = a[j1 - 1] - a[j3 - 1];
+ a[j0 - 2] = x0r + x2r;
+ a[j0 - 1] = x0i - x2i;
+ a[j1 - 2] = x0r - x2r;
+ a[j1 - 1] = x0i + x2i;
+ x0r = x1r + x3i;
+ x0i = x1i + x3r;
+ a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+ a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i - x3r;
+ a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+ a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+ x0r = a[j0] + a[j2];
+ x0i = -a[j0 + 1] - a[j2 + 1];
+ x1r = a[j0] - a[j2];
+ x1i = -a[j0 + 1] + a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[j0] = x0r + x2r;
+ a[j0 + 1] = x0i - x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i + x2i;
+ x0r = x1r + x3i;
+ x0i = x1i + x3r;
+ a[j2] = wn4r * (x0r - x0i);
+ a[j2 + 1] = wn4r * (x0i + x0r);
+ x0r = x1r - x3i;
+ x0i = x1i - x3r;
+ a[j3] = -wn4r * (x0r + x0i);
+ a[j3 + 1] = -wn4r * (x0i - x0r);
+ x0r = a[j0 + 2] + a[j2 + 2];
+ x0i = -a[j0 + 3] - a[j2 + 3];
+ x1r = a[j0 + 2] - a[j2 + 2];
+ x1i = -a[j0 + 3] + a[j2 + 3];
+ x2r = a[j1 + 2] + a[j3 + 2];
+ x2i = a[j1 + 3] + a[j3 + 3];
+ x3r = a[j1 + 2] - a[j3 + 2];
+ x3i = a[j1 + 3] - a[j3 + 3];
+ a[j0 + 2] = x0r + x2r;
+ a[j0 + 3] = x0i - x2i;
+ a[j1 + 2] = x0r - x2r;
+ a[j1 + 3] = x0i + x2i;
+ x0r = x1r + x3i;
+ x0i = x1i + x3r;
+ a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+ a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+ x0r = x1r - x3i;
+ x0i = x1i - x3r;
+ a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+ a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+#ifdef USE_CDFT_THREADS
+struct cdft_arg_st {
+ int n0;
+ int n;
+ double *a;
+ int nw;
+ double *w;
+};
+typedef struct cdft_arg_st cdft_arg_t;
+
+
+void cftrec4_th(int n, double *a, int nw, double *w)
+{
+ void *cftrec1_th(void *p);
+ void *cftrec2_th(void *p);
+ int i, idiv4, m, nthread;
+ cdft_thread_t th[4];
+ cdft_arg_t ag[4];
+
+ nthread = 2;
+ idiv4 = 0;
+ m = n >> 1;
+ if (n > CDFT_4THREADS_BEGIN_N) {
+ nthread = 4;
+ idiv4 = 1;
+ m >>= 1;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].n0 = n;
+ ag[i].n = m;
+ ag[i].a = &a[i * m];
+ ag[i].nw = nw;
+ ag[i].w = w;
+ if (i != idiv4) {
+ cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
+ } else {
+ cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
+ }
+ }
+ for (i = 0; i < nthread; i++) {
+ cdft_thread_wait(th[i]);
+ }
+}
+
+
+void *cftrec1_th(void *p)
+{
+ int cfttree(int n, int j, int k, double *a, int nw, double *w);
+ void cftleaf(int n, int isplt, double *a, int nw, double *w);
+ void cftmdl1(int n, double *a, double *w);
+ int isplt, j, k, m, n, n0, nw;
+ double *a, *w;
+
+ n0 = ((cdft_arg_t *) p)->n0;
+ n = ((cdft_arg_t *) p)->n;
+ a = ((cdft_arg_t *) p)->a;
+ nw = ((cdft_arg_t *) p)->nw;
+ w = ((cdft_arg_t *) p)->w;
+ m = n0;
+ while (m > 512) {
+ m >>= 2;
+ cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+ }
+ cftleaf(m, 1, &a[n - m], nw, w);
+ k = 0;
+ for (j = n - m; j > 0; j -= m) {
+ k++;
+ isplt = cfttree(m, j, k, a, nw, w);
+ cftleaf(m, isplt, &a[j - m], nw, w);
+ }
+ return (void *) 0;
+}
+
+
+void *cftrec2_th(void *p)
+{
+ int cfttree(int n, int j, int k, double *a, int nw, double *w);
+ void cftleaf(int n, int isplt, double *a, int nw, double *w);
+ void cftmdl2(int n, double *a, double *w);
+ int isplt, j, k, m, n, n0, nw;
+ double *a, *w;
+
+ n0 = ((cdft_arg_t *) p)->n0;
+ n = ((cdft_arg_t *) p)->n;
+ a = ((cdft_arg_t *) p)->a;
+ nw = ((cdft_arg_t *) p)->nw;
+ w = ((cdft_arg_t *) p)->w;
+ k = 1;
+ m = n0;
+ while (m > 512) {
+ m >>= 2;
+ k <<= 2;
+ cftmdl2(m, &a[n - m], &w[nw - m]);
+ }
+ cftleaf(m, 0, &a[n - m], nw, w);
+ k >>= 1;
+ for (j = n - m; j > 0; j -= m) {
+ k++;
+ isplt = cfttree(m, j, k, a, nw, w);
+ cftleaf(m, isplt, &a[j - m], nw, w);
+ }
+ return (void *) 0;
+}
+#endif /* USE_CDFT_THREADS */
+
+
+void cftrec4(int n, double *a, int nw, double *w)
+{
+ int cfttree(int n, int j, int k, double *a, int nw, double *w);
+ void cftleaf(int n, int isplt, double *a, int nw, double *w);
+ void cftmdl1(int n, double *a, double *w);
+ int isplt, j, k, m;
+
+ m = n;
+ while (m > 512) {
+ m >>= 2;
+ cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+ }
+ cftleaf(m, 1, &a[n - m], nw, w);
+ k = 0;
+ for (j = n - m; j > 0; j -= m) {
+ k++;
+ isplt = cfttree(m, j, k, a, nw, w);
+ cftleaf(m, isplt, &a[j - m], nw, w);
+ }
+}
+
+
+int cfttree(int n, int j, int k, double *a, int nw, double *w)
+{
+ void cftmdl1(int n, double *a, double *w);
+ void cftmdl2(int n, double *a, double *w);
+ int i, isplt, m;
+
+ if ((k & 3) != 0) {
+ isplt = k & 1;
+ if (isplt != 0) {
+ cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]);
+ } else {
+ cftmdl2(n, &a[j - n], &w[nw - n]);
+ }
+ } else {
+ m = n;
+ for (i = k; (i & 3) == 0; i >>= 2) {
+ m <<= 2;
+ }
+ isplt = i & 1;
+ if (isplt != 0) {
+ while (m > 128) {
+ cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]);
+ m >>= 2;
+ }
+ } else {
+ while (m > 128) {
+ cftmdl2(m, &a[j - m], &w[nw - m]);
+ m >>= 2;
+ }
+ }
+ }
+ return isplt;
+}
+
+
+void cftleaf(int n, int isplt, double *a, int nw, double *w)
+{
+ void cftmdl1(int n, double *a, double *w);
+ void cftmdl2(int n, double *a, double *w);
+ void cftf161(double *a, double *w);
+ void cftf162(double *a, double *w);
+ void cftf081(double *a, double *w);
+ void cftf082(double *a, double *w);
+
+ if (n == 512) {
+ cftmdl1(128, a, &w[nw - 64]);
+ cftf161(a, &w[nw - 8]);
+ cftf162(&a[32], &w[nw - 32]);
+ cftf161(&a[64], &w[nw - 8]);
+ cftf161(&a[96], &w[nw - 8]);
+ cftmdl2(128, &a[128], &w[nw - 128]);
+ cftf161(&a[128], &w[nw - 8]);
+ cftf162(&a[160], &w[nw - 32]);
+ cftf161(&a[192], &w[nw - 8]);
+ cftf162(&a[224], &w[nw - 32]);
+ cftmdl1(128, &a[256], &w[nw - 64]);
+ cftf161(&a[256], &w[nw - 8]);
+ cftf162(&a[288], &w[nw - 32]);
+ cftf161(&a[320], &w[nw - 8]);
+ cftf161(&a[352], &w[nw - 8]);
+ if (isplt != 0) {
+ cftmdl1(128, &a[384], &w[nw - 64]);
+ cftf161(&a[480], &w[nw - 8]);
+ } else {
+ cftmdl2(128, &a[384], &w[nw - 128]);
+ cftf162(&a[480], &w[nw - 32]);
+ }
+ cftf161(&a[384], &w[nw - 8]);
+ cftf162(&a[416], &w[nw - 32]);
+ cftf161(&a[448], &w[nw - 8]);
+ } else {
+ cftmdl1(64, a, &w[nw - 32]);
+ cftf081(a, &w[nw - 8]);
+ cftf082(&a[16], &w[nw - 8]);
+ cftf081(&a[32], &w[nw - 8]);
+ cftf081(&a[48], &w[nw - 8]);
+ cftmdl2(64, &a[64], &w[nw - 64]);
+ cftf081(&a[64], &w[nw - 8]);
+ cftf082(&a[80], &w[nw - 8]);
+ cftf081(&a[96], &w[nw - 8]);
+ cftf082(&a[112], &w[nw - 8]);
+ cftmdl1(64, &a[128], &w[nw - 32]);
+ cftf081(&a[128], &w[nw - 8]);
+ cftf082(&a[144], &w[nw - 8]);
+ cftf081(&a[160], &w[nw - 8]);
+ cftf081(&a[176], &w[nw - 8]);
+ if (isplt != 0) {
+ cftmdl1(64, &a[192], &w[nw - 32]);
+ cftf081(&a[240], &w[nw - 8]);
+ } else {
+ cftmdl2(64, &a[192], &w[nw - 64]);
+ cftf082(&a[240], &w[nw - 8]);
+ }
+ cftf081(&a[192], &w[nw - 8]);
+ cftf082(&a[208], &w[nw - 8]);
+ cftf081(&a[224], &w[nw - 8]);
+ }
+}
+
+
+void cftmdl1(int n, double *a, double *w)
+{
+ int j, j0, j1, j2, j3, k, m, mh;
+ double wn4r, wk1r, wk1i, wk3r, wk3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+ mh = n >> 3;
+ m = 2 * mh;
+ j1 = m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[0] + a[j2];
+ x0i = a[1] + a[j2 + 1];
+ x1r = a[0] - a[j2];
+ x1i = a[1] - a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[0] = x0r + x2r;
+ a[1] = x0i + x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ a[j2] = x1r - x3i;
+ a[j2 + 1] = x1i + x3r;
+ a[j3] = x1r + x3i;
+ a[j3 + 1] = x1i - x3r;
+ wn4r = w[1];
+ k = 0;
+ for (j = 2; j < mh; j += 2) {
+ k += 4;
+ wk1r = w[k];
+ wk1i = w[k + 1];
+ wk3r = w[k + 2];
+ wk3i = w[k + 3];
+ j1 = j + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j] + a[j2];
+ x0i = a[j + 1] + a[j2 + 1];
+ x1r = a[j] - a[j2];
+ x1i = a[j + 1] - a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[j] = x0r + x2r;
+ a[j + 1] = x0i + x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2] = wk1r * x0r - wk1i * x0i;
+ a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3] = wk3r * x0r + wk3i * x0i;
+ a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+ j0 = m - j;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0] + a[j2];
+ x0i = a[j0 + 1] + a[j2 + 1];
+ x1r = a[j0] - a[j2];
+ x1i = a[j0 + 1] - a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[j0] = x0r + x2r;
+ a[j0 + 1] = x0i + x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2] = wk1i * x0r - wk1r * x0i;
+ a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3] = wk3i * x0r + wk3r * x0i;
+ a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+ }
+ j0 = mh;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0] + a[j2];
+ x0i = a[j0 + 1] + a[j2 + 1];
+ x1r = a[j0] - a[j2];
+ x1i = a[j0 + 1] - a[j2 + 1];
+ x2r = a[j1] + a[j3];
+ x2i = a[j1 + 1] + a[j3 + 1];
+ x3r = a[j1] - a[j3];
+ x3i = a[j1 + 1] - a[j3 + 1];
+ a[j0] = x0r + x2r;
+ a[j0 + 1] = x0i + x2i;
+ a[j1] = x0r - x2r;
+ a[j1 + 1] = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ a[j2] = wn4r * (x0r - x0i);
+ a[j2 + 1] = wn4r * (x0i + x0r);
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ a[j3] = -wn4r * (x0r + x0i);
+ a[j3 + 1] = -wn4r * (x0i - x0r);
+}
+
+
+void cftmdl2(int n, double *a, double *w)
+{
+ int j, j0, j1, j2, j3, k, kr, m, mh;
+ double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
+
+ mh = n >> 3;
+ m = 2 * mh;
+ wn4r = w[1];
+ j1 = m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[0] - a[j2 + 1];
+ x0i = a[1] + a[j2];
+ x1r = a[0] + a[j2 + 1];
+ x1i = a[1] - a[j2];
+ x2r = a[j1] - a[j3 + 1];
+ x2i = a[j1 + 1] + a[j3];
+ x3r = a[j1] + a[j3 + 1];
+ x3i = a[j1 + 1] - a[j3];
+ y0r = wn4r * (x2r - x2i);
+ y0i = wn4r * (x2i + x2r);
+ a[0] = x0r + y0r;
+ a[1] = x0i + y0i;
+ a[j1] = x0r - y0r;
+ a[j1 + 1] = x0i - y0i;
+ y0r = wn4r * (x3r - x3i);
+ y0i = wn4r * (x3i + x3r);
+ a[j2] = x1r - y0i;
+ a[j2 + 1] = x1i + y0r;
+ a[j3] = x1r + y0i;
+ a[j3 + 1] = x1i - y0r;
+ k = 0;
+ kr = 2 * m;
+ for (j = 2; j < mh; j += 2) {
+ k += 4;
+ wk1r = w[k];
+ wk1i = w[k + 1];
+ wk3r = w[k + 2];
+ wk3i = w[k + 3];
+ kr -= 4;
+ wd1i = w[kr];
+ wd1r = w[kr + 1];
+ wd3i = w[kr + 2];
+ wd3r = w[kr + 3];
+ j1 = j + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j] - a[j2 + 1];
+ x0i = a[j + 1] + a[j2];
+ x1r = a[j] + a[j2 + 1];
+ x1i = a[j + 1] - a[j2];
+ x2r = a[j1] - a[j3 + 1];
+ x2i = a[j1 + 1] + a[j3];
+ x3r = a[j1] + a[j3 + 1];
+ x3i = a[j1 + 1] - a[j3];
+ y0r = wk1r * x0r - wk1i * x0i;
+ y0i = wk1r * x0i + wk1i * x0r;
+ y2r = wd1r * x2r - wd1i * x2i;
+ y2i = wd1r * x2i + wd1i * x2r;
+ a[j] = y0r + y2r;
+ a[j + 1] = y0i + y2i;
+ a[j1] = y0r - y2r;
+ a[j1 + 1] = y0i - y2i;
+ y0r = wk3r * x1r + wk3i * x1i;
+ y0i = wk3r * x1i - wk3i * x1r;
+ y2r = wd3r * x3r + wd3i * x3i;
+ y2i = wd3r * x3i - wd3i * x3r;
+ a[j2] = y0r + y2r;
+ a[j2 + 1] = y0i + y2i;
+ a[j3] = y0r - y2r;
+ a[j3 + 1] = y0i - y2i;
+ j0 = m - j;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0] - a[j2 + 1];
+ x0i = a[j0 + 1] + a[j2];
+ x1r = a[j0] + a[j2 + 1];
+ x1i = a[j0 + 1] - a[j2];
+ x2r = a[j1] - a[j3 + 1];
+ x2i = a[j1 + 1] + a[j3];
+ x3r = a[j1] + a[j3 + 1];
+ x3i = a[j1 + 1] - a[j3];
+ y0r = wd1i * x0r - wd1r * x0i;
+ y0i = wd1i * x0i + wd1r * x0r;
+ y2r = wk1i * x2r - wk1r * x2i;
+ y2i = wk1i * x2i + wk1r * x2r;
+ a[j0] = y0r + y2r;
+ a[j0 + 1] = y0i + y2i;
+ a[j1] = y0r - y2r;
+ a[j1 + 1] = y0i - y2i;
+ y0r = wd3i * x1r + wd3r * x1i;
+ y0i = wd3i * x1i - wd3r * x1r;
+ y2r = wk3i * x3r + wk3r * x3i;
+ y2i = wk3i * x3i - wk3r * x3r;
+ a[j2] = y0r + y2r;
+ a[j2 + 1] = y0i + y2i;
+ a[j3] = y0r - y2r;
+ a[j3 + 1] = y0i - y2i;
+ }
+ wk1r = w[m];
+ wk1i = w[m + 1];
+ j0 = mh;
+ j1 = j0 + m;
+ j2 = j1 + m;
+ j3 = j2 + m;
+ x0r = a[j0] - a[j2 + 1];
+ x0i = a[j0 + 1] + a[j2];
+ x1r = a[j0] + a[j2 + 1];
+ x1i = a[j0 + 1] - a[j2];
+ x2r = a[j1] - a[j3 + 1];
+ x2i = a[j1 + 1] + a[j3];
+ x3r = a[j1] + a[j3 + 1];
+ x3i = a[j1 + 1] - a[j3];
+ y0r = wk1r * x0r - wk1i * x0i;
+ y0i = wk1r * x0i + wk1i * x0r;
+ y2r = wk1i * x2r - wk1r * x2i;
+ y2i = wk1i * x2i + wk1r * x2r;
+ a[j0] = y0r + y2r;
+ a[j0 + 1] = y0i + y2i;
+ a[j1] = y0r - y2r;
+ a[j1 + 1] = y0i - y2i;
+ y0r = wk1i * x1r - wk1r * x1i;
+ y0i = wk1i * x1i + wk1r * x1r;
+ y2r = wk1r * x3r - wk1i * x3i;
+ y2i = wk1r * x3i + wk1i * x3r;
+ a[j2] = y0r - y2r;
+ a[j2 + 1] = y0i - y2i;
+ a[j3] = y0r + y2r;
+ a[j3 + 1] = y0i + y2i;
+}
+
+
+void cftfx41(int n, double *a, int nw, double *w)
+{
+ void cftf161(double *a, double *w);
+ void cftf162(double *a, double *w);
+ void cftf081(double *a, double *w);
+ void cftf082(double *a, double *w);
+
+ if (n == 128) {
+ cftf161(a, &w[nw - 8]);
+ cftf162(&a[32], &w[nw - 32]);
+ cftf161(&a[64], &w[nw - 8]);
+ cftf161(&a[96], &w[nw - 8]);
+ } else {
+ cftf081(a, &w[nw - 8]);
+ cftf082(&a[16], &w[nw - 8]);
+ cftf081(&a[32], &w[nw - 8]);
+ cftf081(&a[48], &w[nw - 8]);
+ }
+}
+
+
+void cftf161(double *a, double *w)
+{
+ double wn4r, wk1r, wk1i,
+ x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+ y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+ y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,
+ y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,
+ y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+
+ wn4r = w[1];
+ wk1r = w[2];
+ wk1i = w[3];
+ x0r = a[0] + a[16];
+ x0i = a[1] + a[17];
+ x1r = a[0] - a[16];
+ x1i = a[1] - a[17];
+ x2r = a[8] + a[24];
+ x2i = a[9] + a[25];
+ x3r = a[8] - a[24];
+ x3i = a[9] - a[25];
+ y0r = x0r + x2r;
+ y0i = x0i + x2i;
+ y4r = x0r - x2r;
+ y4i = x0i - x2i;
+ y8r = x1r - x3i;
+ y8i = x1i + x3r;
+ y12r = x1r + x3i;
+ y12i = x1i - x3r;
+ x0r = a[2] + a[18];
+ x0i = a[3] + a[19];
+ x1r = a[2] - a[18];
+ x1i = a[3] - a[19];
+ x2r = a[10] + a[26];
+ x2i = a[11] + a[27];
+ x3r = a[10] - a[26];
+ x3i = a[11] - a[27];
+ y1r = x0r + x2r;
+ y1i = x0i + x2i;
+ y5r = x0r - x2r;
+ y5i = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ y9r = wk1r * x0r - wk1i * x0i;
+ y9i = wk1r * x0i + wk1i * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ y13r = wk1i * x0r - wk1r * x0i;
+ y13i = wk1i * x0i + wk1r * x0r;
+ x0r = a[4] + a[20];
+ x0i = a[5] + a[21];
+ x1r = a[4] - a[20];
+ x1i = a[5] - a[21];
+ x2r = a[12] + a[28];
+ x2i = a[13] + a[29];
+ x3r = a[12] - a[28];
+ x3i = a[13] - a[29];
+ y2r = x0r + x2r;
+ y2i = x0i + x2i;
+ y6r = x0r - x2r;
+ y6i = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ y10r = wn4r * (x0r - x0i);
+ y10i = wn4r * (x0i + x0r);
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ y14r = wn4r * (x0r + x0i);
+ y14i = wn4r * (x0i - x0r);
+ x0r = a[6] + a[22];
+ x0i = a[7] + a[23];
+ x1r = a[6] - a[22];
+ x1i = a[7] - a[23];
+ x2r = a[14] + a[30];
+ x2i = a[15] + a[31];
+ x3r = a[14] - a[30];
+ x3i = a[15] - a[31];
+ y3r = x0r + x2r;
+ y3i = x0i + x2i;
+ y7r = x0r - x2r;
+ y7i = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ y11r = wk1i * x0r - wk1r * x0i;
+ y11i = wk1i * x0i + wk1r * x0r;
+ x0r = x1r + x3i;
+ x0i = x1i - x3r;
+ y15r = wk1r * x0r - wk1i * x0i;
+ y15i = wk1r * x0i + wk1i * x0r;
+ x0r = y12r - y14r;
+ x0i = y12i - y14i;
+ x1r = y12r + y14r;
+ x1i = y12i + y14i;
+ x2r = y13r - y15r;
+ x2i = y13i - y15i;
+ x3r = y13r + y15r;
+ x3i = y13i + y15i;
+ a[24] = x0r + x2r;
+ a[25] = x0i + x2i;
+ a[26] = x0r - x2r;
+ a[27] = x0i - x2i;
+ a[28] = x1r - x3i;
+ a[29] = x1i + x3r;
+ a[30] = x1r + x3i;
+ a[31] = x1i - x3r;
+ x0r = y8r + y10r;
+ x0i = y8i + y10i;
+ x1r = y8r - y10r;
+ x1i = y8i - y10i;
+ x2r = y9r + y11r;
+ x2i = y9i + y11i;
+ x3r = y9r - y11r;
+ x3i = y9i - y11i;
+ a[16] = x0r + x2r;
+ a[17] = x0i + x2i;
+ a[18] = x0r - x2r;
+ a[19] = x0i - x2i;
+ a[20] = x1r - x3i;
+ a[21] = x1i + x3r;
+ a[22] = x1r + x3i;
+ a[23] = x1i - x3r;
+ x0r = y5r - y7i;
+ x0i = y5i + y7r;
+ x2r = wn4r * (x0r - x0i);
+ x2i = wn4r * (x0i + x0r);
+ x0r = y5r + y7i;
+ x0i = y5i - y7r;
+ x3r = wn4r * (x0r - x0i);
+ x3i = wn4r * (x0i + x0r);
+ x0r = y4r - y6i;
+ x0i = y4i + y6r;
+ x1r = y4r + y6i;
+ x1i = y4i - y6r;
+ a[8] = x0r + x2r;
+ a[9] = x0i + x2i;
+ a[10] = x0r - x2r;
+ a[11] = x0i - x2i;
+ a[12] = x1r - x3i;
+ a[13] = x1i + x3r;
+ a[14] = x1r + x3i;
+ a[15] = x1i - x3r;
+ x0r = y0r + y2r;
+ x0i = y0i + y2i;
+ x1r = y0r - y2r;
+ x1i = y0i - y2i;
+ x2r = y1r + y3r;
+ x2i = y1i + y3i;
+ x3r = y1r - y3r;
+ x3i = y1i - y3i;
+ a[0] = x0r + x2r;
+ a[1] = x0i + x2i;
+ a[2] = x0r - x2r;
+ a[3] = x0i - x2i;
+ a[4] = x1r - x3i;
+ a[5] = x1i + x3r;
+ a[6] = x1r + x3i;
+ a[7] = x1i - x3r;
+}
+
+
+void cftf162(double *a, double *w)
+{
+ double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i,
+ x0r, x0i, x1r, x1i, x2r, x2i,
+ y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+ y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,
+ y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,
+ y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+
+ wn4r = w[1];
+ wk1r = w[4];
+ wk1i = w[5];
+ wk3r = w[6];
+ wk3i = -w[7];
+ wk2r = w[8];
+ wk2i = w[9];
+ x1r = a[0] - a[17];
+ x1i = a[1] + a[16];
+ x0r = a[8] - a[25];
+ x0i = a[9] + a[24];
+ x2r = wn4r * (x0r - x0i);
+ x2i = wn4r * (x0i + x0r);
+ y0r = x1r + x2r;
+ y0i = x1i + x2i;
+ y4r = x1r - x2r;
+ y4i = x1i - x2i;
+ x1r = a[0] + a[17];
+ x1i = a[1] - a[16];
+ x0r = a[8] + a[25];
+ x0i = a[9] - a[24];
+ x2r = wn4r * (x0r - x0i);
+ x2i = wn4r * (x0i + x0r);
+ y8r = x1r - x2i;
+ y8i = x1i + x2r;
+ y12r = x1r + x2i;
+ y12i = x1i - x2r;
+ x0r = a[2] - a[19];
+ x0i = a[3] + a[18];
+ x1r = wk1r * x0r - wk1i * x0i;
+ x1i = wk1r * x0i + wk1i * x0r;
+ x0r = a[10] - a[27];
+ x0i = a[11] + a[26];
+ x2r = wk3i * x0r - wk3r * x0i;
+ x2i = wk3i * x0i + wk3r * x0r;
+ y1r = x1r + x2r;
+ y1i = x1i + x2i;
+ y5r = x1r - x2r;
+ y5i = x1i - x2i;
+ x0r = a[2] + a[19];
+ x0i = a[3] - a[18];
+ x1r = wk3r * x0r - wk3i * x0i;
+ x1i = wk3r * x0i + wk3i * x0r;
+ x0r = a[10] + a[27];
+ x0i = a[11] - a[26];
+ x2r = wk1r * x0r + wk1i * x0i;
+ x2i = wk1r * x0i - wk1i * x0r;
+ y9r = x1r - x2r;
+ y9i = x1i - x2i;
+ y13r = x1r + x2r;
+ y13i = x1i + x2i;
+ x0r = a[4] - a[21];
+ x0i = a[5] + a[20];
+ x1r = wk2r * x0r - wk2i * x0i;
+ x1i = wk2r * x0i + wk2i * x0r;
+ x0r = a[12] - a[29];
+ x0i = a[13] + a[28];
+ x2r = wk2i * x0r - wk2r * x0i;
+ x2i = wk2i * x0i + wk2r * x0r;
+ y2r = x1r + x2r;
+ y2i = x1i + x2i;
+ y6r = x1r - x2r;
+ y6i = x1i - x2i;
+ x0r = a[4] + a[21];
+ x0i = a[5] - a[20];
+ x1r = wk2i * x0r - wk2r * x0i;
+ x1i = wk2i * x0i + wk2r * x0r;
+ x0r = a[12] + a[29];
+ x0i = a[13] - a[28];
+ x2r = wk2r * x0r - wk2i * x0i;
+ x2i = wk2r * x0i + wk2i * x0r;
+ y10r = x1r - x2r;
+ y10i = x1i - x2i;
+ y14r = x1r + x2r;
+ y14i = x1i + x2i;
+ x0r = a[6] - a[23];
+ x0i = a[7] + a[22];
+ x1r = wk3r * x0r - wk3i * x0i;
+ x1i = wk3r * x0i + wk3i * x0r;
+ x0r = a[14] - a[31];
+ x0i = a[15] + a[30];
+ x2r = wk1i * x0r - wk1r * x0i;
+ x2i = wk1i * x0i + wk1r * x0r;
+ y3r = x1r + x2r;
+ y3i = x1i + x2i;
+ y7r = x1r - x2r;
+ y7i = x1i - x2i;
+ x0r = a[6] + a[23];
+ x0i = a[7] - a[22];
+ x1r = wk1i * x0r + wk1r * x0i;
+ x1i = wk1i * x0i - wk1r * x0r;
+ x0r = a[14] + a[31];
+ x0i = a[15] - a[30];
+ x2r = wk3i * x0r - wk3r * x0i;
+ x2i = wk3i * x0i + wk3r * x0r;
+ y11r = x1r + x2r;
+ y11i = x1i + x2i;
+ y15r = x1r - x2r;
+ y15i = x1i - x2i;
+ x1r = y0r + y2r;
+ x1i = y0i + y2i;
+ x2r = y1r + y3r;
+ x2i = y1i + y3i;
+ a[0] = x1r + x2r;
+ a[1] = x1i + x2i;
+ a[2] = x1r - x2r;
+ a[3] = x1i - x2i;
+ x1r = y0r - y2r;
+ x1i = y0i - y2i;
+ x2r = y1r - y3r;
+ x2i = y1i - y3i;
+ a[4] = x1r - x2i;
+ a[5] = x1i + x2r;
+ a[6] = x1r + x2i;
+ a[7] = x1i - x2r;
+ x1r = y4r - y6i;
+ x1i = y4i + y6r;
+ x0r = y5r - y7i;
+ x0i = y5i + y7r;
+ x2r = wn4r * (x0r - x0i);
+ x2i = wn4r * (x0i + x0r);
+ a[8] = x1r + x2r;
+ a[9] = x1i + x2i;
+ a[10] = x1r - x2r;
+ a[11] = x1i - x2i;
+ x1r = y4r + y6i;
+ x1i = y4i - y6r;
+ x0r = y5r + y7i;
+ x0i = y5i - y7r;
+ x2r = wn4r * (x0r - x0i);
+ x2i = wn4r * (x0i + x0r);
+ a[12] = x1r - x2i;
+ a[13] = x1i + x2r;
+ a[14] = x1r + x2i;
+ a[15] = x1i - x2r;
+ x1r = y8r + y10r;
+ x1i = y8i + y10i;
+ x2r = y9r - y11r;
+ x2i = y9i - y11i;
+ a[16] = x1r + x2r;
+ a[17] = x1i + x2i;
+ a[18] = x1r - x2r;
+ a[19] = x1i - x2i;
+ x1r = y8r - y10r;
+ x1i = y8i - y10i;
+ x2r = y9r + y11r;
+ x2i = y9i + y11i;
+ a[20] = x1r - x2i;
+ a[21] = x1i + x2r;
+ a[22] = x1r + x2i;
+ a[23] = x1i - x2r;
+ x1r = y12r - y14i;
+ x1i = y12i + y14r;
+ x0r = y13r + y15i;
+ x0i = y13i - y15r;
+ x2r = wn4r * (x0r - x0i);
+ x2i = wn4r * (x0i + x0r);
+ a[24] = x1r + x2r;
+ a[25] = x1i + x2i;
+ a[26] = x1r - x2r;
+ a[27] = x1i - x2i;
+ x1r = y12r + y14i;
+ x1i = y12i - y14r;
+ x0r = y13r - y15i;
+ x0i = y13i + y15r;
+ x2r = wn4r * (x0r - x0i);
+ x2i = wn4r * (x0i + x0r);
+ a[28] = x1r - x2i;
+ a[29] = x1i + x2r;
+ a[30] = x1r + x2i;
+ a[31] = x1i - x2r;
+}
+
+
+void cftf081(double *a, double *w)
+{
+ double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+ y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+ y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+
+ wn4r = w[1];
+ x0r = a[0] + a[8];
+ x0i = a[1] + a[9];
+ x1r = a[0] - a[8];
+ x1i = a[1] - a[9];
+ x2r = a[4] + a[12];
+ x2i = a[5] + a[13];
+ x3r = a[4] - a[12];
+ x3i = a[5] - a[13];
+ y0r = x0r + x2r;
+ y0i = x0i + x2i;
+ y2r = x0r - x2r;
+ y2i = x0i - x2i;
+ y1r = x1r - x3i;
+ y1i = x1i + x3r;
+ y3r = x1r + x3i;
+ y3i = x1i - x3r;
+ x0r = a[2] + a[10];
+ x0i = a[3] + a[11];
+ x1r = a[2] - a[10];
+ x1i = a[3] - a[11];
+ x2r = a[6] + a[14];
+ x2i = a[7] + a[15];
+ x3r = a[6] - a[14];
+ x3i = a[7] - a[15];
+ y4r = x0r + x2r;
+ y4i = x0i + x2i;
+ y6r = x0r - x2r;
+ y6i = x0i - x2i;
+ x0r = x1r - x3i;
+ x0i = x1i + x3r;
+ x2r = x1r + x3i;
+ x2i = x1i - x3r;
+ y5r = wn4r * (x0r - x0i);
+ y5i = wn4r * (x0r + x0i);
+ y7r = wn4r * (x2r - x2i);
+ y7i = wn4r * (x2r + x2i);
+ a[8] = y1r + y5r;
+ a[9] = y1i + y5i;
+ a[10] = y1r - y5r;
+ a[11] = y1i - y5i;
+ a[12] = y3r - y7i;
+ a[13] = y3i + y7r;
+ a[14] = y3r + y7i;
+ a[15] = y3i - y7r;
+ a[0] = y0r + y4r;
+ a[1] = y0i + y4i;
+ a[2] = y0r - y4r;
+ a[3] = y0i - y4i;
+ a[4] = y2r - y6i;
+ a[5] = y2i + y6r;
+ a[6] = y2r + y6i;
+ a[7] = y2i - y6r;
+}
+
+
+void cftf082(double *a, double *w)
+{
+ double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i,
+ y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,
+ y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+
+ wn4r = w[1];
+ wk1r = w[2];
+ wk1i = w[3];
+ y0r = a[0] - a[9];
+ y0i = a[1] + a[8];
+ y1r = a[0] + a[9];
+ y1i = a[1] - a[8];
+ x0r = a[4] - a[13];
+ x0i = a[5] + a[12];
+ y2r = wn4r * (x0r - x0i);
+ y2i = wn4r * (x0i + x0r);
+ x0r = a[4] + a[13];
+ x0i = a[5] - a[12];
+ y3r = wn4r * (x0r - x0i);
+ y3i = wn4r * (x0i + x0r);
+ x0r = a[2] - a[11];
+ x0i = a[3] + a[10];
+ y4r = wk1r * x0r - wk1i * x0i;
+ y4i = wk1r * x0i + wk1i * x0r;
+ x0r = a[2] + a[11];
+ x0i = a[3] - a[10];
+ y5r = wk1i * x0r - wk1r * x0i;
+ y5i = wk1i * x0i + wk1r * x0r;
+ x0r = a[6] - a[15];
+ x0i = a[7] + a[14];
+ y6r = wk1i * x0r - wk1r * x0i;
+ y6i = wk1i * x0i + wk1r * x0r;
+ x0r = a[6] + a[15];
+ x0i = a[7] - a[14];
+ y7r = wk1r * x0r - wk1i * x0i;
+ y7i = wk1r * x0i + wk1i * x0r;
+ x0r = y0r + y2r;
+ x0i = y0i + y2i;
+ x1r = y4r + y6r;
+ x1i = y4i + y6i;
+ a[0] = x0r + x1r;
+ a[1] = x0i + x1i;
+ a[2] = x0r - x1r;
+ a[3] = x0i - x1i;
+ x0r = y0r - y2r;
+ x0i = y0i - y2i;
+ x1r = y4r - y6r;
+ x1i = y4i - y6i;
+ a[4] = x0r - x1i;
+ a[5] = x0i + x1r;
+ a[6] = x0r + x1i;
+ a[7] = x0i - x1r;
+ x0r = y1r - y3i;
+ x0i = y1i + y3r;
+ x1r = y5r - y7r;
+ x1i = y5i - y7i;
+ a[8] = x0r + x1r;
+ a[9] = x0i + x1i;
+ a[10] = x0r - x1r;
+ a[11] = x0i - x1i;
+ x0r = y1r + y3i;
+ x0i = y1i - y3r;
+ x1r = y5r + y7r;
+ x1i = y5i + y7i;
+ a[12] = x0r - x1i;
+ a[13] = x0i + x1r;
+ a[14] = x0r + x1i;
+ a[15] = x0i - x1r;
+}
+
+
+void cftf040(double *a)
+{
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+ x0r = a[0] + a[4];
+ x0i = a[1] + a[5];
+ x1r = a[0] - a[4];
+ x1i = a[1] - a[5];
+ x2r = a[2] + a[6];
+ x2i = a[3] + a[7];
+ x3r = a[2] - a[6];
+ x3i = a[3] - a[7];
+ a[0] = x0r + x2r;
+ a[1] = x0i + x2i;
+ a[2] = x1r - x3i;
+ a[3] = x1i + x3r;
+ a[4] = x0r - x2r;
+ a[5] = x0i - x2i;
+ a[6] = x1r + x3i;
+ a[7] = x1i - x3r;
+}
+
+
+void cftb040(double *a)
+{
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+ x0r = a[0] + a[4];
+ x0i = a[1] + a[5];
+ x1r = a[0] - a[4];
+ x1i = a[1] - a[5];
+ x2r = a[2] + a[6];
+ x2i = a[3] + a[7];
+ x3r = a[2] - a[6];
+ x3i = a[3] - a[7];
+ a[0] = x0r + x2r;
+ a[1] = x0i + x2i;
+ a[2] = x1r + x3i;
+ a[3] = x1i - x3r;
+ a[4] = x0r - x2r;
+ a[5] = x0i - x2i;
+ a[6] = x1r - x3i;
+ a[7] = x1i + x3r;
+}
+
+
+void cftx020(double *a)
+{
+ double x0r, x0i;
+
+ x0r = a[0] - a[2];
+ x0i = a[1] - a[3];
+ a[0] += a[2];
+ a[1] += a[3];
+ a[2] = x0r;
+ a[3] = x0i;
+}
+
+
+void rftfsub(int n, double *a, int nc, double *c)
+{
+ int j, k, kk, ks, m;
+ double wkr, wki, xr, xi, yr, yi;
+
+ m = n >> 1;
+ ks = 2 * nc / m;
+ kk = 0;
+ for (j = 2; j < m; j += 2) {
+ k = n - j;
+ kk += ks;
+ wkr = 0.5 - c[nc - kk];
+ wki = c[kk];
+ xr = a[j] - a[k];
+ xi = a[j + 1] + a[k + 1];
+ yr = wkr * xr - wki * xi;
+ yi = wkr * xi + wki * xr;
+ a[j] -= yr;
+ a[j + 1] -= yi;
+ a[k] += yr;
+ a[k + 1] -= yi;
+ }
+}
+
+
+void rftbsub(int n, double *a, int nc, double *c)
+{
+ int j, k, kk, ks, m;
+ double wkr, wki, xr, xi, yr, yi;
+
+ m = n >> 1;
+ ks = 2 * nc / m;
+ kk = 0;
+ for (j = 2; j < m; j += 2) {
+ k = n - j;
+ kk += ks;
+ wkr = 0.5 - c[nc - kk];
+ wki = c[kk];
+ xr = a[j] - a[k];
+ xi = a[j + 1] + a[k + 1];
+ yr = wkr * xr + wki * xi;
+ yi = wkr * xi - wki * xr;
+ a[j] -= yr;
+ a[j + 1] -= yi;
+ a[k] += yr;
+ a[k + 1] -= yi;
+ }
+}
+
+
+void dctsub(int n, double *a, int nc, double *c)
+{
+ int j, k, kk, ks, m;
+ double wkr, wki, xr;
+
+ m = n >> 1;
+ ks = nc / n;
+ kk = 0;
+ for (j = 1; j < m; j++) {
+ k = n - j;
+ kk += ks;
+ wkr = c[kk] - c[nc - kk];
+ wki = c[kk] + c[nc - kk];
+ xr = wki * a[j] - wkr * a[k];
+ a[j] = wkr * a[j] + wki * a[k];
+ a[k] = xr;
+ }
+ a[m] *= c[0];
+}
+
+
+void dstsub(int n, double *a, int nc, double *c)
+{
+ int j, k, kk, ks, m;
+ double wkr, wki, xr;
+
+ m = n >> 1;
+ ks = nc / n;
+ kk = 0;
+ for (j = 1; j < m; j++) {
+ k = n - j;
+ kk += ks;
+ wkr = c[kk] - c[nc - kk];
+ wki = c[kk] + c[nc - kk];
+ xr = wki * a[k] - wkr * a[j];
+ a[k] = wkr * a[k] + wki * a[j];
+ a[j] = xr;
+ }
+ a[m] *= c[0];
+}
+
diff --git a/src/fft2d/fft2d/fftsg.f b/src/fft2d/fft2d/fftsg.f
new file mode 100644
index 0000000..c676066
--- /dev/null
+++ b/src/fft2d/fft2d/fftsg.f
@@ -0,0 +1,2967 @@
+! Fast Fourier/Cosine/Sine Transform
+! dimension :one
+! data length :power of 2
+! decimation :frequency
+! radix :split-radix
+! data :inplace
+! table :use
+! subroutines
+! cdft: Complex Discrete Fourier Transform
+! rdft: Real Discrete Fourier Transform
+! ddct: Discrete Cosine Transform
+! ddst: Discrete Sine Transform
+! dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+! dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+! [definition]
+! <case1>
+! X(k) = sum_j=0^n-1 x(j)*exp(2*pi*i*j*k/n), 0<=k<n
+! <case2>
+! X(k) = sum_j=0^n-1 x(j)*exp(-2*pi*i*j*k/n), 0<=k<n
+! (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call cdft(2*n, 1, a, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call cdft(2*n, -1, a, ip, w)
+! [parameters]
+! 2*n :data length (integer)
+! n >= 1, n = power of 2
+! a(0:2*n-1) :input/output data (real*8)
+! input data
+! a(2*j) = Re(x(j)),
+! a(2*j+1) = Im(x(j)), 0<=j<n
+! output data
+! a(2*k) = Re(X(k)),
+! a(2*k+1) = Im(X(k)), 0<=k<n
+! ip(0:*) :work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! strictly,
+! length of ip >=
+! 2+2**(int(log(n+0.5)/log(2.0))/2).
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:n/2-1) :cos/sin table (real*8)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call cdft(2*n, -1, a, ip, w)
+! is
+! call cdft(2*n, 1, a, ip, w)
+! do j = 0, 2 * n - 1
+! a(j) = a(j) / n
+! end do
+! .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+! [definition]
+! <case1> RDFT
+! R(k) = sum_j=0^n-1 a(j)*cos(2*pi*j*k/n), 0<=k<=n/2
+! I(k) = sum_j=0^n-1 a(j)*sin(2*pi*j*k/n), 0<k<n/2
+! <case2> IRDFT (excluding scale)
+! a(k) = (R(0) + R(n/2)*cos(pi*k))/2 +
+! sum_j=1^n/2-1 R(j)*cos(2*pi*j*k/n) +
+! sum_j=1^n/2-1 I(j)*sin(2*pi*j*k/n), 0<=k<n
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call rdft(n, 1, a, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call rdft(n, -1, a, ip, w)
+! [parameters]
+! n :data length (integer)
+! n >= 2, n = power of 2
+! a(0:n-1) :input/output data (real*8)
+! <case1>
+! output data
+! a(2*k) = R(k), 0<=k<n/2
+! a(2*k+1) = I(k), 0<k<n/2
+! a(1) = R(n/2)
+! <case2>
+! input data
+! a(2*j) = R(j), 0<=j<n/2
+! a(2*j+1) = I(j), 0<j<n/2
+! a(1) = R(n/2)
+! ip(0:*) :work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n/2)
+! strictly,
+! length of ip >=
+! 2+2**(int(log(n/2+0.5)/log(2.0))/2).
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:n/2-1) :cos/sin table (real*8)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call rdft(n, 1, a, ip, w)
+! is
+! call rdft(n, -1, a, ip, w)
+! do j = 0, n - 1
+! a(j) = a(j) * 2 / n
+! end do
+! .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+! [definition]
+! <case1> IDCT (excluding scale)
+! C(k) = sum_j=0^n-1 a(j)*cos(pi*j*(k+1/2)/n), 0<=k<n
+! <case2> DCT
+! C(k) = sum_j=0^n-1 a(j)*cos(pi*(j+1/2)*k/n), 0<=k<n
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddct(n, 1, a, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddct(n, -1, a, ip, w)
+! [parameters]
+! n :data length (integer)
+! n >= 2, n = power of 2
+! a(0:n-1) :input/output data (real*8)
+! output data
+! a(k) = C(k), 0<=k<n
+! ip(0:*) :work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n/2)
+! strictly,
+! length of ip >=
+! 2+2**(int(log(n/2+0.5)/log(2.0))/2).
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:n*5/4-1) :cos/sin table (real*8)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddct(n, -1, a, ip, w)
+! is
+! a(0) = a(0) / 2
+! call ddct(n, 1, a, ip, w)
+! do j = 0, n - 1
+! a(j) = a(j) * 2 / n
+! end do
+! .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+! [definition]
+! <case1> IDST (excluding scale)
+! S(k) = sum_j=1^n A(j)*sin(pi*j*(k+1/2)/n), 0<=k<n
+! <case2> DST
+! S(k) = sum_j=0^n-1 a(j)*sin(pi*(j+1/2)*k/n), 0<k<=n
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddst(n, 1, a, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddst(n, -1, a, ip, w)
+! [parameters]
+! n :data length (integer)
+! n >= 2, n = power of 2
+! a(0:n-1) :input/output data (real*8)
+! <case1>
+! input data
+! a(j) = A(j), 0<j<n
+! a(0) = A(n)
+! output data
+! a(k) = S(k), 0<=k<n
+! <case2>
+! output data
+! a(k) = S(k), 0<k<n
+! a(0) = S(n)
+! ip(0:*) :work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n/2)
+! strictly,
+! length of ip >=
+! 2+2**(int(log(n/2+0.5)/log(2.0))/2).
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:n*5/4-1) :cos/sin table (real*8)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddst(n, -1, a, ip, w)
+! is
+! a(0) = a(0) / 2
+! call ddst(n, 1, a, ip, w)
+! do j = 0, n - 1
+! a(j) = a(j) * 2 / n
+! end do
+! .
+!
+!
+! -------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+! [definition]
+! C(k) = sum_j=0^n a(j)*cos(pi*j*k/n), 0<=k<=n
+! [usage]
+! ip(0) = 0 ! first time only
+! call dfct(n, a, t, ip, w)
+! [parameters]
+! n :data length - 1 (integer)
+! n >= 2, n = power of 2
+! a(0:n) :input/output data (real*8)
+! output data
+! a(k) = C(k), 0<=k<=n
+! t(0:n/2) :work area (real*8)
+! ip(0:*) :work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n/4)
+! strictly,
+! length of ip >=
+! 2+2**(int(log(n/4+0.5)/log(2.0))/2).
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:n*5/8-1) :cos/sin table (real*8)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! a(0) = a(0) / 2
+! a(n) = a(n) / 2
+! call dfct(n, a, t, ip, w)
+! is
+! a(0) = a(0) / 2
+! a(n) = a(n) / 2
+! call dfct(n, a, t, ip, w)
+! do j = 0, n
+! a(j) = a(j) * 2 / n
+! end do
+! .
+!
+!
+! -------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+! [definition]
+! S(k) = sum_j=1^n-1 a(j)*sin(pi*j*k/n), 0<k<n
+! [usage]
+! ip(0) = 0 ! first time only
+! call dfst(n, a, t, ip, w)
+! [parameters]
+! n :data length + 1 (integer)
+! n >= 2, n = power of 2
+! a(0:n-1) :input/output data (real*8)
+! output data
+! a(k) = S(k), 0<k<n
+! (a(0) is used for work area)
+! t(0:n/2-1) :work area (real*8)
+! ip(0:*) :work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n/4)
+! strictly,
+! length of ip >=
+! 2+2**(int(log(n/4+0.5)/log(2.0))/2).
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:n*5/8-1) :cos/sin table (real*8)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call dfst(n, a, t, ip, w)
+! is
+! call dfst(n, a, t, ip, w)
+! do j = 1, n - 1
+! a(j) = a(j) * 2 / n
+! end do
+! .
+!
+!
+! Appendix :
+! The cos/sin table is recalculated when the larger table required.
+! w() and ip() are compatible with all routines.
+!
+!
+ subroutine cdft(n, isgn, a, ip, w)
+ integer n, isgn, ip(0 : *), nw
+ real*8 a(0 : n - 1), w(0 : *)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ if (isgn .ge. 0) then
+ call cftfsub(n, a, ip, nw, w)
+ else
+ call cftbsub(n, a, ip, nw, w)
+ end if
+ end
+!
+ subroutine rdft(n, isgn, a, ip, w)
+ integer n, isgn, ip(0 : *), nw, nc
+ real*8 a(0 : n - 1), w(0 : *), xi
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. 4 * nc) then
+ nc = n / 4
+ call makect(nc, ip, w(nw))
+ end if
+ if (isgn .ge. 0) then
+ if (n .gt. 4) then
+ call cftfsub(n, a, ip, nw, w)
+ call rftfsub(n, a, nc, w(nw))
+ else if (n .eq. 4) then
+ call cftfsub(n, a, ip, nw, w)
+ end if
+ xi = a(0) - a(1)
+ a(0) = a(0) + a(1)
+ a(1) = xi
+ else
+ a(1) = 0.5d0 * (a(0) - a(1))
+ a(0) = a(0) - a(1)
+ if (n .gt. 4) then
+ call rftbsub(n, a, nc, w(nw))
+ call cftbsub(n, a, ip, nw, w)
+ else if (n .eq. 4) then
+ call cftbsub(n, a, ip, nw, w)
+ end if
+ end if
+ end
+!
+ subroutine ddct(n, isgn, a, ip, w)
+ integer n, isgn, ip(0 : *), j, nw, nc
+ real*8 a(0 : n - 1), w(0 : *), xr
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. nc) then
+ nc = n
+ call makect(nc, ip, w(nw))
+ end if
+ if (isgn .lt. 0) then
+ xr = a(n - 1)
+ do j = n - 2, 2, -2
+ a(j + 1) = a(j) - a(j - 1)
+ a(j) = a(j) + a(j - 1)
+ end do
+ a(1) = a(0) - xr
+ a(0) = a(0) + xr
+ if (n .gt. 4) then
+ call rftbsub(n, a, nc, w(nw))
+ call cftbsub(n, a, ip, nw, w)
+ else if (n .eq. 4) then
+ call cftbsub(n, a, ip, nw, w)
+ end if
+ end if
+ call dctsub(n, a, nc, w(nw))
+ if (isgn .ge. 0) then
+ if (n .gt. 4) then
+ call cftfsub(n, a, ip, nw, w)
+ call rftfsub(n, a, nc, w(nw))
+ else if (n .eq. 4) then
+ call cftfsub(n, a, ip, nw, w)
+ end if
+ xr = a(0) - a(1)
+ a(0) = a(0) + a(1)
+ do j = 2, n - 2, 2
+ a(j - 1) = a(j) - a(j + 1)
+ a(j) = a(j) + a(j + 1)
+ end do
+ a(n - 1) = xr
+ end if
+ end
+!
+ subroutine ddst(n, isgn, a, ip, w)
+ integer n, isgn, ip(0 : *), j, nw, nc
+ real*8 a(0 : n - 1), w(0 : *), xr
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. nc) then
+ nc = n
+ call makect(nc, ip, w(nw))
+ end if
+ if (isgn .lt. 0) then
+ xr = a(n - 1)
+ do j = n - 2, 2, -2
+ a(j + 1) = -a(j) - a(j - 1)
+ a(j) = a(j) - a(j - 1)
+ end do
+ a(1) = a(0) + xr
+ a(0) = a(0) - xr
+ if (n .gt. 4) then
+ call rftbsub(n, a, nc, w(nw))
+ call cftbsub(n, a, ip, nw, w)
+ else if (n .eq. 4) then
+ call cftbsub(n, a, ip, nw, w)
+ end if
+ end if
+ call dstsub(n, a, nc, w(nw))
+ if (isgn .ge. 0) then
+ if (n .gt. 4) then
+ call cftfsub(n, a, ip, nw, w)
+ call rftfsub(n, a, nc, w(nw))
+ else if (n .eq. 4) then
+ call cftfsub(n, a, ip, nw, w)
+ end if
+ xr = a(0) - a(1)
+ a(0) = a(0) + a(1)
+ do j = 2, n - 2, 2
+ a(j - 1) = -a(j) - a(j + 1)
+ a(j) = a(j) - a(j + 1)
+ end do
+ a(n - 1) = -xr
+ end if
+ end
+!
+ subroutine dfct(n, a, t, ip, w)
+ integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+ real*8 a(0 : n), t(0 : n / 2), w(0 : *), xr, xi, yr, yi
+ nw = ip(0)
+ if (n .gt. 8 * nw) then
+ nw = n / 8
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. 2 * nc) then
+ nc = n / 2
+ call makect(nc, ip, w(nw))
+ end if
+ m = n / 2
+ yi = a(m)
+ xi = a(0) + a(n)
+ a(0) = a(0) - a(n)
+ t(0) = xi - yi
+ t(m) = xi + yi
+ if (n .gt. 2) then
+ mh = m / 2
+ do j = 1, mh - 1
+ k = m - j
+ xr = a(j) - a(n - j)
+ xi = a(j) + a(n - j)
+ yr = a(k) - a(n - k)
+ yi = a(k) + a(n - k)
+ a(j) = xr
+ a(k) = yr
+ t(j) = xi - yi
+ t(k) = xi + yi
+ end do
+ t(mh) = a(mh) + a(n - mh)
+ a(mh) = a(mh) - a(n - mh)
+ call dctsub(m, a, nc, w(nw))
+ if (m .gt. 4) then
+ call cftfsub(m, a, ip, nw, w)
+ call rftfsub(m, a, nc, w(nw))
+ else if (m .eq. 4) then
+ call cftfsub(m, a, ip, nw, w)
+ end if
+ a(n - 1) = a(0) - a(1)
+ a(1) = a(0) + a(1)
+ do j = m - 2, 2, -2
+ a(2 * j + 1) = a(j) + a(j + 1)
+ a(2 * j - 1) = a(j) - a(j + 1)
+ end do
+ l = 2
+ m = mh
+ do while (m .ge. 2)
+ call dctsub(m, t, nc, w(nw))
+ if (m .gt. 4) then
+ call cftfsub(m, t, ip, nw, w)
+ call rftfsub(m, t, nc, w(nw))
+ else if (m .eq. 4) then
+ call cftfsub(m, t, ip, nw, w)
+ end if
+ a(n - l) = t(0) - t(1)
+ a(l) = t(0) + t(1)
+ k = 0
+ do j = 2, m - 2, 2
+ k = k + 4 * l
+ a(k - l) = t(j) - t(j + 1)
+ a(k + l) = t(j) + t(j + 1)
+ end do
+ l = 2 * l
+ mh = m / 2
+ do j = 0, mh - 1
+ k = m - j
+ t(j) = t(m + k) - t(m + j)
+ t(k) = t(m + k) + t(m + j)
+ end do
+ t(mh) = t(m + mh)
+ m = mh
+ end do
+ a(l) = t(0)
+ a(n) = t(2) - t(1)
+ a(0) = t(2) + t(1)
+ else
+ a(1) = a(0)
+ a(2) = t(0)
+ a(0) = t(1)
+ end if
+ end
+!
+ subroutine dfst(n, a, t, ip, w)
+ integer n, ip(0 : *), j, k, l, m, mh, nw, nc
+ real*8 a(0 : n - 1), t(0 : n / 2 - 1), w(0 : *), xr, xi, yr, yi
+ nw = ip(0)
+ if (n .gt. 8 * nw) then
+ nw = n / 8
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. 2 * nc) then
+ nc = n / 2
+ call makect(nc, ip, w(nw))
+ end if
+ if (n .gt. 2) then
+ m = n / 2
+ mh = m / 2
+ do j = 1, mh - 1
+ k = m - j
+ xr = a(j) + a(n - j)
+ xi = a(j) - a(n - j)
+ yr = a(k) + a(n - k)
+ yi = a(k) - a(n - k)
+ a(j) = xr
+ a(k) = yr
+ t(j) = xi + yi
+ t(k) = xi - yi
+ end do
+ t(0) = a(mh) - a(n - mh)
+ a(mh) = a(mh) + a(n - mh)
+ a(0) = a(m)
+ call dstsub(m, a, nc, w(nw))
+ if (m .gt. 4) then
+ call cftfsub(m, a, ip, nw, w)
+ call rftfsub(m, a, nc, w(nw))
+ else if (m .eq. 4) then
+ call cftfsub(m, a, ip, nw, w)
+ end if
+ a(n - 1) = a(1) - a(0)
+ a(1) = a(0) + a(1)
+ do j = m - 2, 2, -2
+ a(2 * j + 1) = a(j) - a(j + 1)
+ a(2 * j - 1) = -a(j) - a(j + 1)
+ end do
+ l = 2
+ m = mh
+ do while (m .ge. 2)
+ call dstsub(m, t, nc, w(nw))
+ if (m .gt. 4) then
+ call cftfsub(m, t, ip, nw, w)
+ call rftfsub(m, t, nc, w(nw))
+ else if (m .eq. 4) then
+ call cftfsub(m, t, ip, nw, w)
+ end if
+ a(n - l) = t(1) - t(0)
+ a(l) = t(0) + t(1)
+ k = 0
+ do j = 2, m - 2, 2
+ k = k + 4 * l
+ a(k - l) = -t(j) - t(j + 1)
+ a(k + l) = t(j) - t(j + 1)
+ end do
+ l = 2 * l
+ mh = m / 2
+ do j = 1, mh - 1
+ k = m - j
+ t(j) = t(m + k) + t(m + j)
+ t(k) = t(m + k) - t(m + j)
+ end do
+ t(0) = t(m + mh)
+ m = mh
+ end do
+ a(l) = t(0)
+ end if
+ a(0) = 0
+ end
+!
+! -------- initializing routines --------
+!
+ subroutine makewt(nw, ip, w)
+ integer nw, ip(0 : *), j, nwh, nw0, nw1
+ real*8 w(0 : nw - 1), delta, wn4r, wk1r, wk1i, wk3r, wk3i
+ ip(0) = nw
+ ip(1) = 1
+ if (nw .gt. 2) then
+ nwh = nw / 2
+ delta = atan(1.0d0) / nwh
+ wn4r = cos(delta * nwh)
+ w(0) = 1
+ w(1) = wn4r
+ if (nwh .eq. 4) then
+ w(2) = cos(delta * 2)
+ w(3) = sin(delta * 2)
+ else if (nwh .gt. 4) then
+ call makeipt(nw, ip)
+ w(2) = 0.5d0 / cos(delta * 2)
+ w(3) = 0.5d0 / cos(delta * 6)
+ do j = 4, nwh - 4, 4
+ w(j) = cos(delta * j)
+ w(j + 1) = sin(delta * j)
+ w(j + 2) = cos(3 * delta * j)
+ w(j + 3) = -sin(3 * delta * j)
+ end do
+ end if
+ nw0 = 0
+ do while (nwh .gt. 2)
+ nw1 = nw0 + nwh
+ nwh = nwh / 2
+ w(nw1) = 1
+ w(nw1 + 1) = wn4r
+ if (nwh .eq. 4) then
+ wk1r = w(nw0 + 4)
+ wk1i = w(nw0 + 5)
+ w(nw1 + 2) = wk1r
+ w(nw1 + 3) = wk1i
+ else if (nwh .gt. 4) then
+ wk1r = w(nw0 + 4)
+ wk3r = w(nw0 + 6)
+ w(nw1 + 2) = 0.5d0 / wk1r
+ w(nw1 + 3) = 0.5d0 / wk3r
+ do j = 4, nwh - 4, 4
+ wk1r = w(nw0 + 2 * j)
+ wk1i = w(nw0 + 2 * j + 1)
+ wk3r = w(nw0 + 2 * j + 2)
+ wk3i = w(nw0 + 2 * j + 3)
+ w(nw1 + j) = wk1r
+ w(nw1 + j + 1) = wk1i
+ w(nw1 + j + 2) = wk3r
+ w(nw1 + j + 3) = wk3i
+ end do
+ end if
+ nw0 = nw1
+ end do
+ end if
+ end
+!
+ subroutine makeipt(nw, ip)
+ integer nw, ip(0 : *), j, l, m, m2, p, q
+ ip(2) = 0
+ ip(3) = 16
+ m = 2
+ l = nw
+ do while (l .gt. 32)
+ m2 = 2 * m
+ q = 8 * m2
+ do j = m, m2 - 1
+ p = 4 * ip(j)
+ ip(m + j) = p
+ ip(m2 + j) = p + q
+ end do
+ m = m2
+ l = l / 4
+ end do
+ end
+!
+ subroutine makect(nc, ip, c)
+ integer nc, ip(0 : *), j, nch
+ real*8 c(0 : nc - 1), delta
+ ip(1) = nc
+ if (nc .gt. 1) then
+ nch = nc / 2
+ delta = atan(1.0d0) / nch
+ c(0) = cos(delta * nch)
+ c(nch) = 0.5d0 * c(0)
+ do j = 1, nch - 1
+ c(j) = 0.5d0 * cos(delta * j)
+ c(nc - j) = 0.5d0 * sin(delta * j)
+ end do
+ end if
+ end
+!
+! -------- child routines --------
+!
+ subroutine cftfsub(n, a, ip, nw, w)
+ integer n, ip(0 : *), nw
+ real*8 a(0 : n - 1), w(0 : nw - 1)
+ if (n .gt. 8) then
+ if (n .gt. 32) then
+ call cftf1st(n, a, w(nw - n / 4))
+ if (n .gt. 512) then
+ call cftrec4(n, a, nw, w)
+ else if (n .gt. 128) then
+ call cftleaf(n, 1, a, nw, w)
+ else
+ call cftfx41(n, a, nw, w)
+ end if
+ call bitrv2(n, ip, a)
+ else if (n .eq. 32) then
+ call cftf161(a, w(nw - 8))
+ call bitrv216(a)
+ else
+ call cftf081(a, w)
+ call bitrv208(a)
+ end if
+ else if (n .eq. 8) then
+ call cftf040(a)
+ else if (n .eq. 4) then
+ call cftx020(a)
+ end if
+ end
+!
+ subroutine cftbsub(n, a, ip, nw, w)
+ integer n, ip(0 : *), nw
+ real*8 a(0 : n - 1), w(0 : nw - 1)
+ if (n .gt. 8) then
+ if (n .gt. 32) then
+ call cftb1st(n, a, w(nw - n / 4))
+ if (n .gt. 512) then
+ call cftrec4(n, a, nw, w)
+ else if (n .gt. 128) then
+ call cftleaf(n, 1, a, nw, w)
+ else
+ call cftfx41(n, a, nw, w)
+ end if
+ call bitrv2conj(n, ip, a)
+ else if (n .eq. 32) then
+ call cftf161(a, w(nw - 8))
+ call bitrv216neg(a)
+ else
+ call cftf081(a, w)
+ call bitrv208neg(a)
+ end if
+ else if (n .eq. 8) then
+ call cftb040(a)
+ else if (n .eq. 4) then
+ call cftx020(a)
+ end if
+ end
+!
+ subroutine bitrv2(n, ip, a)
+ integer n, ip(0 : *), j, j1, k, k1, l, m, nh, nm
+ real*8 a(0 : n - 1), xr, xi, yr, yi
+ m = 1
+ l = n / 4
+ do while (l .gt. 8)
+ m = m * 2
+ l = l / 4
+ end do
+ nh = n / 2
+ nm = 4 * m
+ if (l .eq. 8) then
+ do k = 0, m - 1
+ do j = 0, k - 1
+ j1 = 4 * j + 2 * ip(m + k)
+ k1 = 4 * k + 2 * ip(m + j)
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nh
+ k1 = k1 + 2
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + 2
+ k1 = k1 + nh
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nh
+ k1 = k1 - 2
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ k1 = 4 * k + 2 * ip(m + k)
+ j1 = k1 + 2
+ k1 = k1 + nh
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - 2
+ k1 = k1 - nh
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nh + 2
+ k1 = k1 + nh + 2
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nh + nm
+ k1 = k1 + 2 * nm - 2
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ else
+ do k = 0, m - 1
+ do j = 0, k - 1
+ j1 = 4 * j + ip(m + k)
+ k1 = 4 * k + ip(m + j)
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nh
+ k1 = k1 + 2
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + 2
+ k1 = k1 + nh
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nh
+ k1 = k1 - 2
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ k1 = 4 * k + ip(m + k)
+ j1 = k1 + 2
+ k1 = k1 + nh
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = a(j1 + 1)
+ yr = a(k1)
+ yi = a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ end if
+ end
+!
+ subroutine bitrv2conj(n, ip, a)
+ integer n, ip(0 : *), j, j1, k, k1, l, m, nh, nm
+ real*8 a(0 : n - 1), xr, xi, yr, yi
+ m = 1
+ l = n / 4
+ do while (l .gt. 8)
+ m = m * 2
+ l = l / 4
+ end do
+ nh = n / 2
+ nm = 4 * m
+ if (l .eq. 8) then
+ do k = 0, m - 1
+ do j = 0, k - 1
+ j1 = 4 * j + 2 * ip(m + k)
+ k1 = 4 * k + 2 * ip(m + j)
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nh
+ k1 = k1 + 2
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + 2
+ k1 = k1 + nh
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nh
+ k1 = k1 - 2
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ k1 = 4 * k + 2 * ip(m + k)
+ j1 = k1 + 2
+ k1 = k1 + nh
+ a(j1 - 1) = -a(j1 - 1)
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ a(k1 + 3) = -a(k1 + 3)
+ j1 = j1 + nm
+ k1 = k1 + 2 * nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - 2
+ k1 = k1 - nh
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nh + 2
+ k1 = k1 + nh + 2
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nh + nm
+ k1 = k1 + 2 * nm - 2
+ a(j1 - 1) = -a(j1 - 1)
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ a(k1 + 3) = -a(k1 + 3)
+ end do
+ else
+ do k = 0, m - 1
+ do j = 0, k - 1
+ j1 = 4 * j + ip(m + k)
+ k1 = 4 * k + ip(m + j)
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nh
+ k1 = k1 + 2
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + 2
+ k1 = k1 + nh
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 + nm
+ k1 = k1 + nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nh
+ k1 = k1 - 2
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ j1 = j1 - nm
+ k1 = k1 - nm
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ end do
+ k1 = 4 * k + ip(m + k)
+ j1 = k1 + 2
+ k1 = k1 + nh
+ a(j1 - 1) = -a(j1 - 1)
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ a(k1 + 3) = -a(k1 + 3)
+ j1 = j1 + nm
+ k1 = k1 + nm
+ a(j1 - 1) = -a(j1 - 1)
+ xr = a(j1)
+ xi = -a(j1 + 1)
+ yr = a(k1)
+ yi = -a(k1 + 1)
+ a(j1) = yr
+ a(j1 + 1) = yi
+ a(k1) = xr
+ a(k1 + 1) = xi
+ a(k1 + 3) = -a(k1 + 3)
+ end do
+ end if
+ end
+!
+ subroutine bitrv216(a)
+ real*8 a(0 : 31), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
+ real*8 x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i
+ real*8 x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i
+ x1r = a(2)
+ x1i = a(3)
+ x2r = a(4)
+ x2i = a(5)
+ x3r = a(6)
+ x3i = a(7)
+ x4r = a(8)
+ x4i = a(9)
+ x5r = a(10)
+ x5i = a(11)
+ x7r = a(14)
+ x7i = a(15)
+ x8r = a(16)
+ x8i = a(17)
+ x10r = a(20)
+ x10i = a(21)
+ x11r = a(22)
+ x11i = a(23)
+ x12r = a(24)
+ x12i = a(25)
+ x13r = a(26)
+ x13i = a(27)
+ x14r = a(28)
+ x14i = a(29)
+ a(2) = x8r
+ a(3) = x8i
+ a(4) = x4r
+ a(5) = x4i
+ a(6) = x12r
+ a(7) = x12i
+ a(8) = x2r
+ a(9) = x2i
+ a(10) = x10r
+ a(11) = x10i
+ a(14) = x14r
+ a(15) = x14i
+ a(16) = x1r
+ a(17) = x1i
+ a(20) = x5r
+ a(21) = x5i
+ a(22) = x13r
+ a(23) = x13i
+ a(24) = x3r
+ a(25) = x3i
+ a(26) = x11r
+ a(27) = x11i
+ a(28) = x7r
+ a(29) = x7i
+ end
+!
+ subroutine bitrv216neg(a)
+ real*8 a(0 : 31), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
+ real*8 x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i
+ real*8 x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i
+ real*8 x13r, x13i, x14r, x14i, x15r, x15i
+ x1r = a(2)
+ x1i = a(3)
+ x2r = a(4)
+ x2i = a(5)
+ x3r = a(6)
+ x3i = a(7)
+ x4r = a(8)
+ x4i = a(9)
+ x5r = a(10)
+ x5i = a(11)
+ x6r = a(12)
+ x6i = a(13)
+ x7r = a(14)
+ x7i = a(15)
+ x8r = a(16)
+ x8i = a(17)
+ x9r = a(18)
+ x9i = a(19)
+ x10r = a(20)
+ x10i = a(21)
+ x11r = a(22)
+ x11i = a(23)
+ x12r = a(24)
+ x12i = a(25)
+ x13r = a(26)
+ x13i = a(27)
+ x14r = a(28)
+ x14i = a(29)
+ x15r = a(30)
+ x15i = a(31)
+ a(2) = x15r
+ a(3) = x15i
+ a(4) = x7r
+ a(5) = x7i
+ a(6) = x11r
+ a(7) = x11i
+ a(8) = x3r
+ a(9) = x3i
+ a(10) = x13r
+ a(11) = x13i
+ a(12) = x5r
+ a(13) = x5i
+ a(14) = x9r
+ a(15) = x9i
+ a(16) = x1r
+ a(17) = x1i
+ a(18) = x14r
+ a(19) = x14i
+ a(20) = x6r
+ a(21) = x6i
+ a(22) = x10r
+ a(23) = x10i
+ a(24) = x2r
+ a(25) = x2i
+ a(26) = x12r
+ a(27) = x12i
+ a(28) = x4r
+ a(29) = x4i
+ a(30) = x8r
+ a(31) = x8i
+ end
+!
+ subroutine bitrv208(a)
+ real*8 a(0 : 15), x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i
+ x1r = a(2)
+ x1i = a(3)
+ x3r = a(6)
+ x3i = a(7)
+ x4r = a(8)
+ x4i = a(9)
+ x6r = a(12)
+ x6i = a(13)
+ a(2) = x4r
+ a(3) = x4i
+ a(6) = x6r
+ a(7) = x6i
+ a(8) = x1r
+ a(9) = x1i
+ a(12) = x3r
+ a(13) = x3i
+ end
+!
+ subroutine bitrv208neg(a)
+ real*8 a(0 : 15), x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i
+ real*8 x5r, x5i, x6r, x6i, x7r, x7i
+ x1r = a(2)
+ x1i = a(3)
+ x2r = a(4)
+ x2i = a(5)
+ x3r = a(6)
+ x3i = a(7)
+ x4r = a(8)
+ x4i = a(9)
+ x5r = a(10)
+ x5i = a(11)
+ x6r = a(12)
+ x6i = a(13)
+ x7r = a(14)
+ x7i = a(15)
+ a(2) = x7r
+ a(3) = x7i
+ a(4) = x3r
+ a(5) = x3i
+ a(6) = x5r
+ a(7) = x5i
+ a(8) = x1r
+ a(9) = x1i
+ a(10) = x6r
+ a(11) = x6i
+ a(12) = x2r
+ a(13) = x2i
+ a(14) = x4r
+ a(15) = x4i
+ end
+!
+ subroutine cftf1st(n, a, w)
+ integer n, j, j0, j1, j2, j3, k, m, mh
+ real*8 a(0 : n - 1), w(0 : *)
+ real*8 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i
+ real*8 wd1r, wd1i, wd3r, wd3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+ mh = n / 8
+ m = 2 * mh
+ j1 = m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(0) + a(j2)
+ x0i = a(1) + a(j2 + 1)
+ x1r = a(0) - a(j2)
+ x1i = a(1) - a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(0) = x0r + x2r
+ a(1) = x0i + x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ a(j2) = x1r - x3i
+ a(j2 + 1) = x1i + x3r
+ a(j3) = x1r + x3i
+ a(j3 + 1) = x1i - x3r
+ wn4r = w(1)
+ csc1 = w(2)
+ csc3 = w(3)
+ wd1r = 1
+ wd1i = 0
+ wd3r = 1
+ wd3i = 0
+ k = 0
+ do j = 2, mh - 6, 4
+ k = k + 4
+ wk1r = csc1 * (wd1r + w(k))
+ wk1i = csc1 * (wd1i + w(k + 1))
+ wk3r = csc3 * (wd3r + w(k + 2))
+ wk3i = csc3 * (wd3i + w(k + 3))
+ wd1r = w(k)
+ wd1i = w(k + 1)
+ wd3r = w(k + 2)
+ wd3i = w(k + 3)
+ j1 = j + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j) + a(j2)
+ x0i = a(j + 1) + a(j2 + 1)
+ x1r = a(j) - a(j2)
+ x1i = a(j + 1) - a(j2 + 1)
+ y0r = a(j + 2) + a(j2 + 2)
+ y0i = a(j + 3) + a(j2 + 3)
+ y1r = a(j + 2) - a(j2 + 2)
+ y1i = a(j + 3) - a(j2 + 3)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ y2r = a(j1 + 2) + a(j3 + 2)
+ y2i = a(j1 + 3) + a(j3 + 3)
+ y3r = a(j1 + 2) - a(j3 + 2)
+ y3i = a(j1 + 3) - a(j3 + 3)
+ a(j) = x0r + x2r
+ a(j + 1) = x0i + x2i
+ a(j + 2) = y0r + y2r
+ a(j + 3) = y0i + y2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ a(j1 + 2) = y0r - y2r
+ a(j1 + 3) = y0i - y2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2) = wk1r * x0r - wk1i * x0i
+ a(j2 + 1) = wk1r * x0i + wk1i * x0r
+ x0r = y1r - y3i
+ x0i = y1i + y3r
+ a(j2 + 2) = wd1r * x0r - wd1i * x0i
+ a(j2 + 3) = wd1r * x0i + wd1i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3) = wk3r * x0r + wk3i * x0i
+ a(j3 + 1) = wk3r * x0i - wk3i * x0r
+ x0r = y1r + y3i
+ x0i = y1i - y3r
+ a(j3 + 2) = wd3r * x0r + wd3i * x0i
+ a(j3 + 3) = wd3r * x0i - wd3i * x0r
+ j0 = m - j
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0) + a(j2)
+ x0i = a(j0 + 1) + a(j2 + 1)
+ x1r = a(j0) - a(j2)
+ x1i = a(j0 + 1) - a(j2 + 1)
+ y0r = a(j0 - 2) + a(j2 - 2)
+ y0i = a(j0 - 1) + a(j2 - 1)
+ y1r = a(j0 - 2) - a(j2 - 2)
+ y1i = a(j0 - 1) - a(j2 - 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ y2r = a(j1 - 2) + a(j3 - 2)
+ y2i = a(j1 - 1) + a(j3 - 1)
+ y3r = a(j1 - 2) - a(j3 - 2)
+ y3i = a(j1 - 1) - a(j3 - 1)
+ a(j0) = x0r + x2r
+ a(j0 + 1) = x0i + x2i
+ a(j0 - 2) = y0r + y2r
+ a(j0 - 1) = y0i + y2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ a(j1 - 2) = y0r - y2r
+ a(j1 - 1) = y0i - y2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2) = wk1i * x0r - wk1r * x0i
+ a(j2 + 1) = wk1i * x0i + wk1r * x0r
+ x0r = y1r - y3i
+ x0i = y1i + y3r
+ a(j2 - 2) = wd1i * x0r - wd1r * x0i
+ a(j2 - 1) = wd1i * x0i + wd1r * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3) = wk3i * x0r + wk3r * x0i
+ a(j3 + 1) = wk3i * x0i - wk3r * x0r
+ x0r = y1r + y3i
+ x0i = y1i - y3r
+ a(j3 - 2) = wd3i * x0r + wd3r * x0i
+ a(j3 - 1) = wd3i * x0i - wd3r * x0r
+ end do
+ wk1r = csc1 * (wd1r + wn4r)
+ wk1i = csc1 * (wd1i + wn4r)
+ wk3r = csc3 * (wd3r - wn4r)
+ wk3i = csc3 * (wd3i - wn4r)
+ j0 = mh
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0 - 2) + a(j2 - 2)
+ x0i = a(j0 - 1) + a(j2 - 1)
+ x1r = a(j0 - 2) - a(j2 - 2)
+ x1i = a(j0 - 1) - a(j2 - 1)
+ x2r = a(j1 - 2) + a(j3 - 2)
+ x2i = a(j1 - 1) + a(j3 - 1)
+ x3r = a(j1 - 2) - a(j3 - 2)
+ x3i = a(j1 - 1) - a(j3 - 1)
+ a(j0 - 2) = x0r + x2r
+ a(j0 - 1) = x0i + x2i
+ a(j1 - 2) = x0r - x2r
+ a(j1 - 1) = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2 - 2) = wk1r * x0r - wk1i * x0i
+ a(j2 - 1) = wk1r * x0i + wk1i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3 - 2) = wk3r * x0r + wk3i * x0i
+ a(j3 - 1) = wk3r * x0i - wk3i * x0r
+ x0r = a(j0) + a(j2)
+ x0i = a(j0 + 1) + a(j2 + 1)
+ x1r = a(j0) - a(j2)
+ x1i = a(j0 + 1) - a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(j0) = x0r + x2r
+ a(j0 + 1) = x0i + x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2) = wn4r * (x0r - x0i)
+ a(j2 + 1) = wn4r * (x0i + x0r)
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3) = -wn4r * (x0r + x0i)
+ a(j3 + 1) = -wn4r * (x0i - x0r)
+ x0r = a(j0 + 2) + a(j2 + 2)
+ x0i = a(j0 + 3) + a(j2 + 3)
+ x1r = a(j0 + 2) - a(j2 + 2)
+ x1i = a(j0 + 3) - a(j2 + 3)
+ x2r = a(j1 + 2) + a(j3 + 2)
+ x2i = a(j1 + 3) + a(j3 + 3)
+ x3r = a(j1 + 2) - a(j3 + 2)
+ x3i = a(j1 + 3) - a(j3 + 3)
+ a(j0 + 2) = x0r + x2r
+ a(j0 + 3) = x0i + x2i
+ a(j1 + 2) = x0r - x2r
+ a(j1 + 3) = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2 + 2) = wk1i * x0r - wk1r * x0i
+ a(j2 + 3) = wk1i * x0i + wk1r * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3 + 2) = wk3i * x0r + wk3r * x0i
+ a(j3 + 3) = wk3i * x0i - wk3r * x0r
+ end
+!
+ subroutine cftb1st(n, a, w)
+ integer n, j, j0, j1, j2, j3, k, m, mh
+ real*8 a(0 : n - 1), w(0 : *)
+ real*8 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i
+ real*8 wd1r, wd1i, wd3r, wd3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+ mh = n / 8
+ m = 2 * mh
+ j1 = m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(0) + a(j2)
+ x0i = -a(1) - a(j2 + 1)
+ x1r = a(0) - a(j2)
+ x1i = -a(1) + a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(0) = x0r + x2r
+ a(1) = x0i - x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i + x2i
+ a(j2) = x1r + x3i
+ a(j2 + 1) = x1i + x3r
+ a(j3) = x1r - x3i
+ a(j3 + 1) = x1i - x3r
+ wn4r = w(1)
+ csc1 = w(2)
+ csc3 = w(3)
+ wd1r = 1
+ wd1i = 0
+ wd3r = 1
+ wd3i = 0
+ k = 0
+ do j = 2, mh - 6, 4
+ k = k + 4
+ wk1r = csc1 * (wd1r + w(k))
+ wk1i = csc1 * (wd1i + w(k + 1))
+ wk3r = csc3 * (wd3r + w(k + 2))
+ wk3i = csc3 * (wd3i + w(k + 3))
+ wd1r = w(k)
+ wd1i = w(k + 1)
+ wd3r = w(k + 2)
+ wd3i = w(k + 3)
+ j1 = j + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j) + a(j2)
+ x0i = -a(j + 1) - a(j2 + 1)
+ x1r = a(j) - a(j2)
+ x1i = -a(j + 1) + a(j2 + 1)
+ y0r = a(j + 2) + a(j2 + 2)
+ y0i = -a(j + 3) - a(j2 + 3)
+ y1r = a(j + 2) - a(j2 + 2)
+ y1i = -a(j + 3) + a(j2 + 3)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ y2r = a(j1 + 2) + a(j3 + 2)
+ y2i = a(j1 + 3) + a(j3 + 3)
+ y3r = a(j1 + 2) - a(j3 + 2)
+ y3i = a(j1 + 3) - a(j3 + 3)
+ a(j) = x0r + x2r
+ a(j + 1) = x0i - x2i
+ a(j + 2) = y0r + y2r
+ a(j + 3) = y0i - y2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i + x2i
+ a(j1 + 2) = y0r - y2r
+ a(j1 + 3) = y0i + y2i
+ x0r = x1r + x3i
+ x0i = x1i + x3r
+ a(j2) = wk1r * x0r - wk1i * x0i
+ a(j2 + 1) = wk1r * x0i + wk1i * x0r
+ x0r = y1r + y3i
+ x0i = y1i + y3r
+ a(j2 + 2) = wd1r * x0r - wd1i * x0i
+ a(j2 + 3) = wd1r * x0i + wd1i * x0r
+ x0r = x1r - x3i
+ x0i = x1i - x3r
+ a(j3) = wk3r * x0r + wk3i * x0i
+ a(j3 + 1) = wk3r * x0i - wk3i * x0r
+ x0r = y1r - y3i
+ x0i = y1i - y3r
+ a(j3 + 2) = wd3r * x0r + wd3i * x0i
+ a(j3 + 3) = wd3r * x0i - wd3i * x0r
+ j0 = m - j
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0) + a(j2)
+ x0i = -a(j0 + 1) - a(j2 + 1)
+ x1r = a(j0) - a(j2)
+ x1i = -a(j0 + 1) + a(j2 + 1)
+ y0r = a(j0 - 2) + a(j2 - 2)
+ y0i = -a(j0 - 1) - a(j2 - 1)
+ y1r = a(j0 - 2) - a(j2 - 2)
+ y1i = -a(j0 - 1) + a(j2 - 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ y2r = a(j1 - 2) + a(j3 - 2)
+ y2i = a(j1 - 1) + a(j3 - 1)
+ y3r = a(j1 - 2) - a(j3 - 2)
+ y3i = a(j1 - 1) - a(j3 - 1)
+ a(j0) = x0r + x2r
+ a(j0 + 1) = x0i - x2i
+ a(j0 - 2) = y0r + y2r
+ a(j0 - 1) = y0i - y2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i + x2i
+ a(j1 - 2) = y0r - y2r
+ a(j1 - 1) = y0i + y2i
+ x0r = x1r + x3i
+ x0i = x1i + x3r
+ a(j2) = wk1i * x0r - wk1r * x0i
+ a(j2 + 1) = wk1i * x0i + wk1r * x0r
+ x0r = y1r + y3i
+ x0i = y1i + y3r
+ a(j2 - 2) = wd1i * x0r - wd1r * x0i
+ a(j2 - 1) = wd1i * x0i + wd1r * x0r
+ x0r = x1r - x3i
+ x0i = x1i - x3r
+ a(j3) = wk3i * x0r + wk3r * x0i
+ a(j3 + 1) = wk3i * x0i - wk3r * x0r
+ x0r = y1r - y3i
+ x0i = y1i - y3r
+ a(j3 - 2) = wd3i * x0r + wd3r * x0i
+ a(j3 - 1) = wd3i * x0i - wd3r * x0r
+ end do
+ wk1r = csc1 * (wd1r + wn4r)
+ wk1i = csc1 * (wd1i + wn4r)
+ wk3r = csc3 * (wd3r - wn4r)
+ wk3i = csc3 * (wd3i - wn4r)
+ j0 = mh
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0 - 2) + a(j2 - 2)
+ x0i = -a(j0 - 1) - a(j2 - 1)
+ x1r = a(j0 - 2) - a(j2 - 2)
+ x1i = -a(j0 - 1) + a(j2 - 1)
+ x2r = a(j1 - 2) + a(j3 - 2)
+ x2i = a(j1 - 1) + a(j3 - 1)
+ x3r = a(j1 - 2) - a(j3 - 2)
+ x3i = a(j1 - 1) - a(j3 - 1)
+ a(j0 - 2) = x0r + x2r
+ a(j0 - 1) = x0i - x2i
+ a(j1 - 2) = x0r - x2r
+ a(j1 - 1) = x0i + x2i
+ x0r = x1r + x3i
+ x0i = x1i + x3r
+ a(j2 - 2) = wk1r * x0r - wk1i * x0i
+ a(j2 - 1) = wk1r * x0i + wk1i * x0r
+ x0r = x1r - x3i
+ x0i = x1i - x3r
+ a(j3 - 2) = wk3r * x0r + wk3i * x0i
+ a(j3 - 1) = wk3r * x0i - wk3i * x0r
+ x0r = a(j0) + a(j2)
+ x0i = -a(j0 + 1) - a(j2 + 1)
+ x1r = a(j0) - a(j2)
+ x1i = -a(j0 + 1) + a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(j0) = x0r + x2r
+ a(j0 + 1) = x0i - x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i + x2i
+ x0r = x1r + x3i
+ x0i = x1i + x3r
+ a(j2) = wn4r * (x0r - x0i)
+ a(j2 + 1) = wn4r * (x0i + x0r)
+ x0r = x1r - x3i
+ x0i = x1i - x3r
+ a(j3) = -wn4r * (x0r + x0i)
+ a(j3 + 1) = -wn4r * (x0i - x0r)
+ x0r = a(j0 + 2) + a(j2 + 2)
+ x0i = -a(j0 + 3) - a(j2 + 3)
+ x1r = a(j0 + 2) - a(j2 + 2)
+ x1i = -a(j0 + 3) + a(j2 + 3)
+ x2r = a(j1 + 2) + a(j3 + 2)
+ x2i = a(j1 + 3) + a(j3 + 3)
+ x3r = a(j1 + 2) - a(j3 + 2)
+ x3i = a(j1 + 3) - a(j3 + 3)
+ a(j0 + 2) = x0r + x2r
+ a(j0 + 3) = x0i - x2i
+ a(j1 + 2) = x0r - x2r
+ a(j1 + 3) = x0i + x2i
+ x0r = x1r + x3i
+ x0i = x1i + x3r
+ a(j2 + 2) = wk1i * x0r - wk1r * x0i
+ a(j2 + 3) = wk1i * x0i + wk1r * x0r
+ x0r = x1r - x3i
+ x0i = x1i - x3r
+ a(j3 + 2) = wk3i * x0r + wk3r * x0i
+ a(j3 + 3) = wk3i * x0i - wk3r * x0r
+ end
+!
+ subroutine cftrec4(n, a, nw, w)
+ integer n, nw, cfttree, isplt, j, k, m
+ real*8 a(0 : n - 1), w(0 : nw - 1)
+ m = n
+ do while (m .gt. 512)
+ m = m / 4
+ call cftmdl1(m, a(n - m), w(nw - m / 2))
+ end do
+ call cftleaf(m, 1, a(n - m), nw, w)
+ k = 0
+ do j = n - m, m, -m
+ k = k + 1
+ isplt = cfttree(m, j, k, a, nw, w)
+ call cftleaf(m, isplt, a(j - m), nw, w)
+ end do
+ end
+!
+ integer function cfttree(n, j, k, a, nw, w)
+ integer n, j, k, nw, i, isplt, m
+ real*8 a(0 : n - 1), w(0 : nw - 1)
+ if (mod(k, 4) .ne. 0) then
+ isplt = mod(k, 2)
+ if (isplt .ne. 0) then
+ call cftmdl1(n, a(j - n), w(nw - n / 2))
+ else
+ call cftmdl2(n, a(j - n), w(nw - n))
+ end if
+ else
+ m = n
+ i = k
+ do while (mod(i, 4) .eq. 0)
+ m = m * 4
+ i = i / 4
+ end do
+ isplt = mod(i, 2)
+ if (isplt .ne. 0) then
+ do while (m .gt. 128)
+ call cftmdl1(m, a(j - m), w(nw - m / 2))
+ m = m / 4
+ end do
+ else
+ do while (m .gt. 128)
+ call cftmdl2(m, a(j - m), w(nw - m))
+ m = m / 4
+ end do
+ end if
+ end if
+ cfttree = isplt
+ end
+!
+ subroutine cftleaf(n, isplt, a, nw, w)
+ integer n, isplt, nw
+ real*8 a(0 : n - 1), w(0 : nw - 1)
+ if (n .eq. 512) then
+ call cftmdl1(128, a, w(nw - 64))
+ call cftf161(a, w(nw - 8))
+ call cftf162(a(32), w(nw - 32))
+ call cftf161(a(64), w(nw - 8))
+ call cftf161(a(96), w(nw - 8))
+ call cftmdl2(128, a(128), w(nw - 128))
+ call cftf161(a(128), w(nw - 8))
+ call cftf162(a(160), w(nw - 32))
+ call cftf161(a(192), w(nw - 8))
+ call cftf162(a(224), w(nw - 32))
+ call cftmdl1(128, a(256), w(nw - 64))
+ call cftf161(a(256), w(nw - 8))
+ call cftf162(a(288), w(nw - 32))
+ call cftf161(a(320), w(nw - 8))
+ call cftf161(a(352), w(nw - 8))
+ if (isplt .ne. 0) then
+ call cftmdl1(128, a(384), w(nw - 64))
+ call cftf161(a(480), w(nw - 8))
+ else
+ call cftmdl2(128, a(384), w(nw - 128))
+ call cftf162(a(480), w(nw - 32))
+ end if
+ call cftf161(a(384), w(nw - 8))
+ call cftf162(a(416), w(nw - 32))
+ call cftf161(a(448), w(nw - 8))
+ else
+ call cftmdl1(64, a, w(nw - 32))
+ call cftf081(a, w(nw - 8))
+ call cftf082(a(16), w(nw - 8))
+ call cftf081(a(32), w(nw - 8))
+ call cftf081(a(48), w(nw - 8))
+ call cftmdl2(64, a(64), w(nw - 64))
+ call cftf081(a(64), w(nw - 8))
+ call cftf082(a(80), w(nw - 8))
+ call cftf081(a(96), w(nw - 8))
+ call cftf082(a(112), w(nw - 8))
+ call cftmdl1(64, a(128), w(nw - 32))
+ call cftf081(a(128), w(nw - 8))
+ call cftf082(a(144), w(nw - 8))
+ call cftf081(a(160), w(nw - 8))
+ call cftf081(a(176), w(nw - 8))
+ if (isplt .ne. 0) then
+ call cftmdl1(64, a(192), w(nw - 32))
+ call cftf081(a(240), w(nw - 8))
+ else
+ call cftmdl2(64, a(192), w(nw - 64))
+ call cftf082(a(240), w(nw - 8))
+ end if
+ call cftf081(a(192), w(nw - 8))
+ call cftf082(a(208), w(nw - 8))
+ call cftf081(a(224), w(nw - 8))
+ end if
+ end
+!
+ subroutine cftmdl1(n, a, w)
+ integer n, j, j0, j1, j2, j3, k, m, mh
+ real*8 a(0 : n - 1), w(0 : *)
+ real*8 wn4r, wk1r, wk1i, wk3r, wk3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ mh = n / 8
+ m = 2 * mh
+ j1 = m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(0) + a(j2)
+ x0i = a(1) + a(j2 + 1)
+ x1r = a(0) - a(j2)
+ x1i = a(1) - a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(0) = x0r + x2r
+ a(1) = x0i + x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ a(j2) = x1r - x3i
+ a(j2 + 1) = x1i + x3r
+ a(j3) = x1r + x3i
+ a(j3 + 1) = x1i - x3r
+ wn4r = w(1)
+ k = 0
+ do j = 2, mh - 2, 2
+ k = k + 4
+ wk1r = w(k)
+ wk1i = w(k + 1)
+ wk3r = w(k + 2)
+ wk3i = w(k + 3)
+ j1 = j + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j) + a(j2)
+ x0i = a(j + 1) + a(j2 + 1)
+ x1r = a(j) - a(j2)
+ x1i = a(j + 1) - a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(j) = x0r + x2r
+ a(j + 1) = x0i + x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2) = wk1r * x0r - wk1i * x0i
+ a(j2 + 1) = wk1r * x0i + wk1i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3) = wk3r * x0r + wk3i * x0i
+ a(j3 + 1) = wk3r * x0i - wk3i * x0r
+ j0 = m - j
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0) + a(j2)
+ x0i = a(j0 + 1) + a(j2 + 1)
+ x1r = a(j0) - a(j2)
+ x1i = a(j0 + 1) - a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(j0) = x0r + x2r
+ a(j0 + 1) = x0i + x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2) = wk1i * x0r - wk1r * x0i
+ a(j2 + 1) = wk1i * x0i + wk1r * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3) = wk3i * x0r + wk3r * x0i
+ a(j3 + 1) = wk3i * x0i - wk3r * x0r
+ end do
+ j0 = mh
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0) + a(j2)
+ x0i = a(j0 + 1) + a(j2 + 1)
+ x1r = a(j0) - a(j2)
+ x1i = a(j0 + 1) - a(j2 + 1)
+ x2r = a(j1) + a(j3)
+ x2i = a(j1 + 1) + a(j3 + 1)
+ x3r = a(j1) - a(j3)
+ x3i = a(j1 + 1) - a(j3 + 1)
+ a(j0) = x0r + x2r
+ a(j0 + 1) = x0i + x2i
+ a(j1) = x0r - x2r
+ a(j1 + 1) = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ a(j2) = wn4r * (x0r - x0i)
+ a(j2 + 1) = wn4r * (x0i + x0r)
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ a(j3) = -wn4r * (x0r + x0i)
+ a(j3 + 1) = -wn4r * (x0i - x0r)
+ end
+!
+ subroutine cftmdl2(n, a, w)
+ integer n, j, j0, j1, j2, j3, k, kr, m, mh
+ real*8 a(0 : n - 1), w(0 : *)
+ real*8 wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ real*8 y0r, y0i, y2r, y2i
+ mh = n / 8
+ m = 2 * mh
+ wn4r = w(1)
+ j1 = m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(0) - a(j2 + 1)
+ x0i = a(1) + a(j2)
+ x1r = a(0) + a(j2 + 1)
+ x1i = a(1) - a(j2)
+ x2r = a(j1) - a(j3 + 1)
+ x2i = a(j1 + 1) + a(j3)
+ x3r = a(j1) + a(j3 + 1)
+ x3i = a(j1 + 1) - a(j3)
+ y0r = wn4r * (x2r - x2i)
+ y0i = wn4r * (x2i + x2r)
+ a(0) = x0r + y0r
+ a(1) = x0i + y0i
+ a(j1) = x0r - y0r
+ a(j1 + 1) = x0i - y0i
+ y0r = wn4r * (x3r - x3i)
+ y0i = wn4r * (x3i + x3r)
+ a(j2) = x1r - y0i
+ a(j2 + 1) = x1i + y0r
+ a(j3) = x1r + y0i
+ a(j3 + 1) = x1i - y0r
+ k = 0
+ kr = 2 * m
+ do j = 2, mh - 2, 2
+ k = k + 4
+ wk1r = w(k)
+ wk1i = w(k + 1)
+ wk3r = w(k + 2)
+ wk3i = w(k + 3)
+ kr = kr - 4
+ wd1i = w(kr)
+ wd1r = w(kr + 1)
+ wd3i = w(kr + 2)
+ wd3r = w(kr + 3)
+ j1 = j + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j) - a(j2 + 1)
+ x0i = a(j + 1) + a(j2)
+ x1r = a(j) + a(j2 + 1)
+ x1i = a(j + 1) - a(j2)
+ x2r = a(j1) - a(j3 + 1)
+ x2i = a(j1 + 1) + a(j3)
+ x3r = a(j1) + a(j3 + 1)
+ x3i = a(j1 + 1) - a(j3)
+ y0r = wk1r * x0r - wk1i * x0i
+ y0i = wk1r * x0i + wk1i * x0r
+ y2r = wd1r * x2r - wd1i * x2i
+ y2i = wd1r * x2i + wd1i * x2r
+ a(j) = y0r + y2r
+ a(j + 1) = y0i + y2i
+ a(j1) = y0r - y2r
+ a(j1 + 1) = y0i - y2i
+ y0r = wk3r * x1r + wk3i * x1i
+ y0i = wk3r * x1i - wk3i * x1r
+ y2r = wd3r * x3r + wd3i * x3i
+ y2i = wd3r * x3i - wd3i * x3r
+ a(j2) = y0r + y2r
+ a(j2 + 1) = y0i + y2i
+ a(j3) = y0r - y2r
+ a(j3 + 1) = y0i - y2i
+ j0 = m - j
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0) - a(j2 + 1)
+ x0i = a(j0 + 1) + a(j2)
+ x1r = a(j0) + a(j2 + 1)
+ x1i = a(j0 + 1) - a(j2)
+ x2r = a(j1) - a(j3 + 1)
+ x2i = a(j1 + 1) + a(j3)
+ x3r = a(j1) + a(j3 + 1)
+ x3i = a(j1 + 1) - a(j3)
+ y0r = wd1i * x0r - wd1r * x0i
+ y0i = wd1i * x0i + wd1r * x0r
+ y2r = wk1i * x2r - wk1r * x2i
+ y2i = wk1i * x2i + wk1r * x2r
+ a(j0) = y0r + y2r
+ a(j0 + 1) = y0i + y2i
+ a(j1) = y0r - y2r
+ a(j1 + 1) = y0i - y2i
+ y0r = wd3i * x1r + wd3r * x1i
+ y0i = wd3i * x1i - wd3r * x1r
+ y2r = wk3i * x3r + wk3r * x3i
+ y2i = wk3i * x3i - wk3r * x3r
+ a(j2) = y0r + y2r
+ a(j2 + 1) = y0i + y2i
+ a(j3) = y0r - y2r
+ a(j3 + 1) = y0i - y2i
+ end do
+ wk1r = w(m)
+ wk1i = w(m + 1)
+ j0 = mh
+ j1 = j0 + m
+ j2 = j1 + m
+ j3 = j2 + m
+ x0r = a(j0) - a(j2 + 1)
+ x0i = a(j0 + 1) + a(j2)
+ x1r = a(j0) + a(j2 + 1)
+ x1i = a(j0 + 1) - a(j2)
+ x2r = a(j1) - a(j3 + 1)
+ x2i = a(j1 + 1) + a(j3)
+ x3r = a(j1) + a(j3 + 1)
+ x3i = a(j1 + 1) - a(j3)
+ y0r = wk1r * x0r - wk1i * x0i
+ y0i = wk1r * x0i + wk1i * x0r
+ y2r = wk1i * x2r - wk1r * x2i
+ y2i = wk1i * x2i + wk1r * x2r
+ a(j0) = y0r + y2r
+ a(j0 + 1) = y0i + y2i
+ a(j1) = y0r - y2r
+ a(j1 + 1) = y0i - y2i
+ y0r = wk1i * x1r - wk1r * x1i
+ y0i = wk1i * x1i + wk1r * x1r
+ y2r = wk1r * x3r - wk1i * x3i
+ y2i = wk1r * x3i + wk1i * x3r
+ a(j2) = y0r - y2r
+ a(j2 + 1) = y0i - y2i
+ a(j3) = y0r + y2r
+ a(j3 + 1) = y0i + y2i
+ end
+!
+ subroutine cftfx41(n, a, nw, w)
+ integer n, nw
+ real*8 a(0 : n - 1), w(0 : nw - 1)
+ if (n .eq. 128) then
+ call cftf161(a, w(nw - 8))
+ call cftf162(a(32), w(nw - 32))
+ call cftf161(a(64), w(nw - 8))
+ call cftf161(a(96), w(nw - 8))
+ else
+ call cftf081(a, w(nw - 8))
+ call cftf082(a(16), w(nw - 8))
+ call cftf081(a(32), w(nw - 8))
+ call cftf081(a(48), w(nw - 8))
+ end if
+ end
+!
+ subroutine cftf161(a, w)
+ real*8 a(0 : 31), w(0 : *), wn4r, wk1r, wk1i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+ real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+ real*8 y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i
+ real*8 y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i
+ wn4r = w(1)
+ wk1r = w(2)
+ wk1i = w(3)
+ x0r = a(0) + a(16)
+ x0i = a(1) + a(17)
+ x1r = a(0) - a(16)
+ x1i = a(1) - a(17)
+ x2r = a(8) + a(24)
+ x2i = a(9) + a(25)
+ x3r = a(8) - a(24)
+ x3i = a(9) - a(25)
+ y0r = x0r + x2r
+ y0i = x0i + x2i
+ y4r = x0r - x2r
+ y4i = x0i - x2i
+ y8r = x1r - x3i
+ y8i = x1i + x3r
+ y12r = x1r + x3i
+ y12i = x1i - x3r
+ x0r = a(2) + a(18)
+ x0i = a(3) + a(19)
+ x1r = a(2) - a(18)
+ x1i = a(3) - a(19)
+ x2r = a(10) + a(26)
+ x2i = a(11) + a(27)
+ x3r = a(10) - a(26)
+ x3i = a(11) - a(27)
+ y1r = x0r + x2r
+ y1i = x0i + x2i
+ y5r = x0r - x2r
+ y5i = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ y9r = wk1r * x0r - wk1i * x0i
+ y9i = wk1r * x0i + wk1i * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ y13r = wk1i * x0r - wk1r * x0i
+ y13i = wk1i * x0i + wk1r * x0r
+ x0r = a(4) + a(20)
+ x0i = a(5) + a(21)
+ x1r = a(4) - a(20)
+ x1i = a(5) - a(21)
+ x2r = a(12) + a(28)
+ x2i = a(13) + a(29)
+ x3r = a(12) - a(28)
+ x3i = a(13) - a(29)
+ y2r = x0r + x2r
+ y2i = x0i + x2i
+ y6r = x0r - x2r
+ y6i = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ y10r = wn4r * (x0r - x0i)
+ y10i = wn4r * (x0i + x0r)
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ y14r = wn4r * (x0r + x0i)
+ y14i = wn4r * (x0i - x0r)
+ x0r = a(6) + a(22)
+ x0i = a(7) + a(23)
+ x1r = a(6) - a(22)
+ x1i = a(7) - a(23)
+ x2r = a(14) + a(30)
+ x2i = a(15) + a(31)
+ x3r = a(14) - a(30)
+ x3i = a(15) - a(31)
+ y3r = x0r + x2r
+ y3i = x0i + x2i
+ y7r = x0r - x2r
+ y7i = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ y11r = wk1i * x0r - wk1r * x0i
+ y11i = wk1i * x0i + wk1r * x0r
+ x0r = x1r + x3i
+ x0i = x1i - x3r
+ y15r = wk1r * x0r - wk1i * x0i
+ y15i = wk1r * x0i + wk1i * x0r
+ x0r = y12r - y14r
+ x0i = y12i - y14i
+ x1r = y12r + y14r
+ x1i = y12i + y14i
+ x2r = y13r - y15r
+ x2i = y13i - y15i
+ x3r = y13r + y15r
+ x3i = y13i + y15i
+ a(24) = x0r + x2r
+ a(25) = x0i + x2i
+ a(26) = x0r - x2r
+ a(27) = x0i - x2i
+ a(28) = x1r - x3i
+ a(29) = x1i + x3r
+ a(30) = x1r + x3i
+ a(31) = x1i - x3r
+ x0r = y8r + y10r
+ x0i = y8i + y10i
+ x1r = y8r - y10r
+ x1i = y8i - y10i
+ x2r = y9r + y11r
+ x2i = y9i + y11i
+ x3r = y9r - y11r
+ x3i = y9i - y11i
+ a(16) = x0r + x2r
+ a(17) = x0i + x2i
+ a(18) = x0r - x2r
+ a(19) = x0i - x2i
+ a(20) = x1r - x3i
+ a(21) = x1i + x3r
+ a(22) = x1r + x3i
+ a(23) = x1i - x3r
+ x0r = y5r - y7i
+ x0i = y5i + y7r
+ x2r = wn4r * (x0r - x0i)
+ x2i = wn4r * (x0i + x0r)
+ x0r = y5r + y7i
+ x0i = y5i - y7r
+ x3r = wn4r * (x0r - x0i)
+ x3i = wn4r * (x0i + x0r)
+ x0r = y4r - y6i
+ x0i = y4i + y6r
+ x1r = y4r + y6i
+ x1i = y4i - y6r
+ a(8) = x0r + x2r
+ a(9) = x0i + x2i
+ a(10) = x0r - x2r
+ a(11) = x0i - x2i
+ a(12) = x1r - x3i
+ a(13) = x1i + x3r
+ a(14) = x1r + x3i
+ a(15) = x1i - x3r
+ x0r = y0r + y2r
+ x0i = y0i + y2i
+ x1r = y0r - y2r
+ x1i = y0i - y2i
+ x2r = y1r + y3r
+ x2i = y1i + y3i
+ x3r = y1r - y3r
+ x3i = y1i - y3i
+ a(0) = x0r + x2r
+ a(1) = x0i + x2i
+ a(2) = x0r - x2r
+ a(3) = x0i - x2i
+ a(4) = x1r - x3i
+ a(5) = x1i + x3r
+ a(6) = x1r + x3i
+ a(7) = x1i - x3r
+ end
+!
+ subroutine cftf162(a, w)
+ real*8 a(0 : 31), w(0 : *)
+ real*8 wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i
+ real*8 x0r, x0i, x1r, x1i, x2r, x2i
+ real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+ real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+ real*8 y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i
+ real*8 y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i
+ wn4r = w(1)
+ wk1r = w(4)
+ wk1i = w(5)
+ wk3r = w(6)
+ wk3i = -w(7)
+ wk2r = w(8)
+ wk2i = w(9)
+ x1r = a(0) - a(17)
+ x1i = a(1) + a(16)
+ x0r = a(8) - a(25)
+ x0i = a(9) + a(24)
+ x2r = wn4r * (x0r - x0i)
+ x2i = wn4r * (x0i + x0r)
+ y0r = x1r + x2r
+ y0i = x1i + x2i
+ y4r = x1r - x2r
+ y4i = x1i - x2i
+ x1r = a(0) + a(17)
+ x1i = a(1) - a(16)
+ x0r = a(8) + a(25)
+ x0i = a(9) - a(24)
+ x2r = wn4r * (x0r - x0i)
+ x2i = wn4r * (x0i + x0r)
+ y8r = x1r - x2i
+ y8i = x1i + x2r
+ y12r = x1r + x2i
+ y12i = x1i - x2r
+ x0r = a(2) - a(19)
+ x0i = a(3) + a(18)
+ x1r = wk1r * x0r - wk1i * x0i
+ x1i = wk1r * x0i + wk1i * x0r
+ x0r = a(10) - a(27)
+ x0i = a(11) + a(26)
+ x2r = wk3i * x0r - wk3r * x0i
+ x2i = wk3i * x0i + wk3r * x0r
+ y1r = x1r + x2r
+ y1i = x1i + x2i
+ y5r = x1r - x2r
+ y5i = x1i - x2i
+ x0r = a(2) + a(19)
+ x0i = a(3) - a(18)
+ x1r = wk3r * x0r - wk3i * x0i
+ x1i = wk3r * x0i + wk3i * x0r
+ x0r = a(10) + a(27)
+ x0i = a(11) - a(26)
+ x2r = wk1r * x0r + wk1i * x0i
+ x2i = wk1r * x0i - wk1i * x0r
+ y9r = x1r - x2r
+ y9i = x1i - x2i
+ y13r = x1r + x2r
+ y13i = x1i + x2i
+ x0r = a(4) - a(21)
+ x0i = a(5) + a(20)
+ x1r = wk2r * x0r - wk2i * x0i
+ x1i = wk2r * x0i + wk2i * x0r
+ x0r = a(12) - a(29)
+ x0i = a(13) + a(28)
+ x2r = wk2i * x0r - wk2r * x0i
+ x2i = wk2i * x0i + wk2r * x0r
+ y2r = x1r + x2r
+ y2i = x1i + x2i
+ y6r = x1r - x2r
+ y6i = x1i - x2i
+ x0r = a(4) + a(21)
+ x0i = a(5) - a(20)
+ x1r = wk2i * x0r - wk2r * x0i
+ x1i = wk2i * x0i + wk2r * x0r
+ x0r = a(12) + a(29)
+ x0i = a(13) - a(28)
+ x2r = wk2r * x0r - wk2i * x0i
+ x2i = wk2r * x0i + wk2i * x0r
+ y10r = x1r - x2r
+ y10i = x1i - x2i
+ y14r = x1r + x2r
+ y14i = x1i + x2i
+ x0r = a(6) - a(23)
+ x0i = a(7) + a(22)
+ x1r = wk3r * x0r - wk3i * x0i
+ x1i = wk3r * x0i + wk3i * x0r
+ x0r = a(14) - a(31)
+ x0i = a(15) + a(30)
+ x2r = wk1i * x0r - wk1r * x0i
+ x2i = wk1i * x0i + wk1r * x0r
+ y3r = x1r + x2r
+ y3i = x1i + x2i
+ y7r = x1r - x2r
+ y7i = x1i - x2i
+ x0r = a(6) + a(23)
+ x0i = a(7) - a(22)
+ x1r = wk1i * x0r + wk1r * x0i
+ x1i = wk1i * x0i - wk1r * x0r
+ x0r = a(14) + a(31)
+ x0i = a(15) - a(30)
+ x2r = wk3i * x0r - wk3r * x0i
+ x2i = wk3i * x0i + wk3r * x0r
+ y11r = x1r + x2r
+ y11i = x1i + x2i
+ y15r = x1r - x2r
+ y15i = x1i - x2i
+ x1r = y0r + y2r
+ x1i = y0i + y2i
+ x2r = y1r + y3r
+ x2i = y1i + y3i
+ a(0) = x1r + x2r
+ a(1) = x1i + x2i
+ a(2) = x1r - x2r
+ a(3) = x1i - x2i
+ x1r = y0r - y2r
+ x1i = y0i - y2i
+ x2r = y1r - y3r
+ x2i = y1i - y3i
+ a(4) = x1r - x2i
+ a(5) = x1i + x2r
+ a(6) = x1r + x2i
+ a(7) = x1i - x2r
+ x1r = y4r - y6i
+ x1i = y4i + y6r
+ x0r = y5r - y7i
+ x0i = y5i + y7r
+ x2r = wn4r * (x0r - x0i)
+ x2i = wn4r * (x0i + x0r)
+ a(8) = x1r + x2r
+ a(9) = x1i + x2i
+ a(10) = x1r - x2r
+ a(11) = x1i - x2i
+ x1r = y4r + y6i
+ x1i = y4i - y6r
+ x0r = y5r + y7i
+ x0i = y5i - y7r
+ x2r = wn4r * (x0r - x0i)
+ x2i = wn4r * (x0i + x0r)
+ a(12) = x1r - x2i
+ a(13) = x1i + x2r
+ a(14) = x1r + x2i
+ a(15) = x1i - x2r
+ x1r = y8r + y10r
+ x1i = y8i + y10i
+ x2r = y9r - y11r
+ x2i = y9i - y11i
+ a(16) = x1r + x2r
+ a(17) = x1i + x2i
+ a(18) = x1r - x2r
+ a(19) = x1i - x2i
+ x1r = y8r - y10r
+ x1i = y8i - y10i
+ x2r = y9r + y11r
+ x2i = y9i + y11i
+ a(20) = x1r - x2i
+ a(21) = x1i + x2r
+ a(22) = x1r + x2i
+ a(23) = x1i - x2r
+ x1r = y12r - y14i
+ x1i = y12i + y14r
+ x0r = y13r + y15i
+ x0i = y13i - y15r
+ x2r = wn4r * (x0r - x0i)
+ x2i = wn4r * (x0i + x0r)
+ a(24) = x1r + x2r
+ a(25) = x1i + x2i
+ a(26) = x1r - x2r
+ a(27) = x1i - x2i
+ x1r = y12r + y14i
+ x1i = y12i - y14r
+ x0r = y13r - y15i
+ x0i = y13i + y15r
+ x2r = wn4r * (x0r - x0i)
+ x2i = wn4r * (x0i + x0r)
+ a(28) = x1r - x2i
+ a(29) = x1i + x2r
+ a(30) = x1r + x2i
+ a(31) = x1i - x2r
+ end
+!
+ subroutine cftf081(a, w)
+ real*8 a(0 : 15), w(0 : *)
+ real*8 wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+ real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+ wn4r = w(1)
+ x0r = a(0) + a(8)
+ x0i = a(1) + a(9)
+ x1r = a(0) - a(8)
+ x1i = a(1) - a(9)
+ x2r = a(4) + a(12)
+ x2i = a(5) + a(13)
+ x3r = a(4) - a(12)
+ x3i = a(5) - a(13)
+ y0r = x0r + x2r
+ y0i = x0i + x2i
+ y2r = x0r - x2r
+ y2i = x0i - x2i
+ y1r = x1r - x3i
+ y1i = x1i + x3r
+ y3r = x1r + x3i
+ y3i = x1i - x3r
+ x0r = a(2) + a(10)
+ x0i = a(3) + a(11)
+ x1r = a(2) - a(10)
+ x1i = a(3) - a(11)
+ x2r = a(6) + a(14)
+ x2i = a(7) + a(15)
+ x3r = a(6) - a(14)
+ x3i = a(7) - a(15)
+ y4r = x0r + x2r
+ y4i = x0i + x2i
+ y6r = x0r - x2r
+ y6i = x0i - x2i
+ x0r = x1r - x3i
+ x0i = x1i + x3r
+ x2r = x1r + x3i
+ x2i = x1i - x3r
+ y5r = wn4r * (x0r - x0i)
+ y5i = wn4r * (x0r + x0i)
+ y7r = wn4r * (x2r - x2i)
+ y7i = wn4r * (x2r + x2i)
+ a(8) = y1r + y5r
+ a(9) = y1i + y5i
+ a(10) = y1r - y5r
+ a(11) = y1i - y5i
+ a(12) = y3r - y7i
+ a(13) = y3i + y7r
+ a(14) = y3r + y7i
+ a(15) = y3i - y7r
+ a(0) = y0r + y4r
+ a(1) = y0i + y4i
+ a(2) = y0r - y4r
+ a(3) = y0i - y4i
+ a(4) = y2r - y6i
+ a(5) = y2i + y6r
+ a(6) = y2r + y6i
+ a(7) = y2i - y6r
+ end
+!
+ subroutine cftf082(a, w)
+ real*8 a(0 : 15), w(0 : *)
+ real*8 wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i
+ real*8 y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i
+ real*8 y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i
+ wn4r = w(1)
+ wk1r = w(2)
+ wk1i = w(3)
+ y0r = a(0) - a(9)
+ y0i = a(1) + a(8)
+ y1r = a(0) + a(9)
+ y1i = a(1) - a(8)
+ x0r = a(4) - a(13)
+ x0i = a(5) + a(12)
+ y2r = wn4r * (x0r - x0i)
+ y2i = wn4r * (x0i + x0r)
+ x0r = a(4) + a(13)
+ x0i = a(5) - a(12)
+ y3r = wn4r * (x0r - x0i)
+ y3i = wn4r * (x0i + x0r)
+ x0r = a(2) - a(11)
+ x0i = a(3) + a(10)
+ y4r = wk1r * x0r - wk1i * x0i
+ y4i = wk1r * x0i + wk1i * x0r
+ x0r = a(2) + a(11)
+ x0i = a(3) - a(10)
+ y5r = wk1i * x0r - wk1r * x0i
+ y5i = wk1i * x0i + wk1r * x0r
+ x0r = a(6) - a(15)
+ x0i = a(7) + a(14)
+ y6r = wk1i * x0r - wk1r * x0i
+ y6i = wk1i * x0i + wk1r * x0r
+ x0r = a(6) + a(15)
+ x0i = a(7) - a(14)
+ y7r = wk1r * x0r - wk1i * x0i
+ y7i = wk1r * x0i + wk1i * x0r
+ x0r = y0r + y2r
+ x0i = y0i + y2i
+ x1r = y4r + y6r
+ x1i = y4i + y6i
+ a(0) = x0r + x1r
+ a(1) = x0i + x1i
+ a(2) = x0r - x1r
+ a(3) = x0i - x1i
+ x0r = y0r - y2r
+ x0i = y0i - y2i
+ x1r = y4r - y6r
+ x1i = y4i - y6i
+ a(4) = x0r - x1i
+ a(5) = x0i + x1r
+ a(6) = x0r + x1i
+ a(7) = x0i - x1r
+ x0r = y1r - y3i
+ x0i = y1i + y3r
+ x1r = y5r - y7r
+ x1i = y5i - y7i
+ a(8) = x0r + x1r
+ a(9) = x0i + x1i
+ a(10) = x0r - x1r
+ a(11) = x0i - x1i
+ x0r = y1r + y3i
+ x0i = y1i - y3r
+ x1r = y5r + y7r
+ x1i = y5i + y7i
+ a(12) = x0r - x1i
+ a(13) = x0i + x1r
+ a(14) = x0r + x1i
+ a(15) = x0i - x1r
+ end
+!
+ subroutine cftf040(a)
+ real*8 a(0 : 7), x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ x0r = a(0) + a(4)
+ x0i = a(1) + a(5)
+ x1r = a(0) - a(4)
+ x1i = a(1) - a(5)
+ x2r = a(2) + a(6)
+ x2i = a(3) + a(7)
+ x3r = a(2) - a(6)
+ x3i = a(3) - a(7)
+ a(0) = x0r + x2r
+ a(1) = x0i + x2i
+ a(2) = x1r - x3i
+ a(3) = x1i + x3r
+ a(4) = x0r - x2r
+ a(5) = x0i - x2i
+ a(6) = x1r + x3i
+ a(7) = x1i - x3r
+ end
+!
+ subroutine cftb040(a)
+ real*8 a(0 : 7), x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i
+ x0r = a(0) + a(4)
+ x0i = a(1) + a(5)
+ x1r = a(0) - a(4)
+ x1i = a(1) - a(5)
+ x2r = a(2) + a(6)
+ x2i = a(3) + a(7)
+ x3r = a(2) - a(6)
+ x3i = a(3) - a(7)
+ a(0) = x0r + x2r
+ a(1) = x0i + x2i
+ a(2) = x1r + x3i
+ a(3) = x1i - x3r
+ a(4) = x0r - x2r
+ a(5) = x0i - x2i
+ a(6) = x1r - x3i
+ a(7) = x1i + x3r
+ end
+!
+ subroutine cftx020(a)
+ real*8 a(0 : 3), x0r, x0i
+ x0r = a(0) - a(2)
+ x0i = a(1) - a(3)
+ a(0) = a(0) + a(2)
+ a(1) = a(1) + a(3)
+ a(2) = x0r
+ a(3) = x0i
+ end
+!
+ subroutine rftfsub(n, a, nc, c)
+ integer n, nc, j, k, kk, ks, m
+ real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+ m = n / 2
+ ks = 2 * nc / m
+ kk = 0
+ do j = 2, m - 2, 2
+ k = n - j
+ kk = kk + ks
+ wkr = 0.5d0 - c(nc - kk)
+ wki = c(kk)
+ xr = a(j) - a(k)
+ xi = a(j + 1) + a(k + 1)
+ yr = wkr * xr - wki * xi
+ yi = wkr * xi + wki * xr
+ a(j) = a(j) - yr
+ a(j + 1) = a(j + 1) - yi
+ a(k) = a(k) + yr
+ a(k + 1) = a(k + 1) - yi
+ end do
+ end
+!
+ subroutine rftbsub(n, a, nc, c)
+ integer n, nc, j, k, kk, ks, m
+ real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr, xi, yr, yi
+ m = n / 2
+ ks = 2 * nc / m
+ kk = 0
+ do j = 2, m - 2, 2
+ k = n - j
+ kk = kk + ks
+ wkr = 0.5d0 - c(nc - kk)
+ wki = c(kk)
+ xr = a(j) - a(k)
+ xi = a(j + 1) + a(k + 1)
+ yr = wkr * xr + wki * xi
+ yi = wkr * xi - wki * xr
+ a(j) = a(j) - yr
+ a(j + 1) = a(j + 1) - yi
+ a(k) = a(k) + yr
+ a(k + 1) = a(k + 1) - yi
+ end do
+ end
+!
+ subroutine dctsub(n, a, nc, c)
+ integer n, nc, j, k, kk, ks, m
+ real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+ m = n / 2
+ ks = nc / n
+ kk = 0
+ do j = 1, m - 1
+ k = n - j
+ kk = kk + ks
+ wkr = c(kk) - c(nc - kk)
+ wki = c(kk) + c(nc - kk)
+ xr = wki * a(j) - wkr * a(k)
+ a(j) = wkr * a(j) + wki * a(k)
+ a(k) = xr
+ end do
+ a(m) = c(0) * a(m)
+ end
+!
+ subroutine dstsub(n, a, nc, c)
+ integer n, nc, j, k, kk, ks, m
+ real*8 a(0 : n - 1), c(0 : nc - 1), wkr, wki, xr
+ m = n / 2
+ ks = nc / n
+ kk = 0
+ do j = 1, m - 1
+ k = n - j
+ kk = kk + ks
+ wkr = c(kk) - c(nc - kk)
+ wki = c(kk) + c(nc - kk)
+ xr = wki * a(k) - wkr * a(j)
+ a(k) = wkr * a(k) + wki * a(j)
+ a(j) = xr
+ end do
+ a(m) = c(0) * a(m)
+ end
+!
diff --git a/src/fft2d/fft2d/fftsg2d.c b/src/fft2d/fft2d/fftsg2d.c
new file mode 100644
index 0000000..2ebfa93
--- /dev/null
+++ b/src/fft2d/fft2d/fftsg2d.c
@@ -0,0 +1,1190 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+ dimension :two
+ data length :power of 2
+ decimation :frequency
+ radix :split-radix, row-column
+ data :inplace
+ table :use
+functions
+ cdft2d: Complex Discrete Fourier Transform
+ rdft2d: Real Discrete Fourier Transform
+ ddct2d: Discrete Cosine Transform
+ ddst2d: Discrete Sine Transform
+function prototypes
+ void cdft2d(int, int, int, double **, double *, int *, double *);
+ void rdft2d(int, int, int, double **, double *, int *, double *);
+ void rdft2dsort(int, int, int, double **);
+ void ddct2d(int, int, int, double **, double *, int *, double *);
+ void ddst2d(int, int, int, double **, double *, int *, double *);
+necessary package
+ fftsg.c : 1D-FFT package
+macro definitions
+ USE_FFT2D_PTHREADS : default=not defined
+ FFT2D_MAX_THREADS : must be 2^N, default=4
+ FFT2D_THREADS_BEGIN_N : default=65536
+ USE_FFT2D_WINTHREADS : default=not defined
+ FFT2D_MAX_THREADS : must be 2^N, default=4
+ FFT2D_THREADS_BEGIN_N : default=131072
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+ [definition]
+ <case1>
+ X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
+ exp(2*pi*i*j1*k1/n1) *
+ exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+ <case2>
+ X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
+ exp(-2*pi*i*j1*k1/n1) *
+ exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+ (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ cdft2d(n1, 2*n2, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ cdft2d(n1, 2*n2, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 1, n1 = power of 2
+ 2*n2 :data length (int)
+ n2 >= 1, n2 = power of 2
+ a[0...n1-1][0...2*n2-1]
+ :input/output data (double **)
+ input data
+ a[j1][2*j2] = Re(x[j1][j2]),
+ a[j1][2*j2+1] = Im(x[j1][j2]),
+ 0<=j1<n1, 0<=j2<n2
+ output data
+ a[k1][2*k2] = Re(X[k1][k2]),
+ a[k1][2*k2+1] = Im(X[k1][k2]),
+ 0<=k1<n1, 0<=k2<n2
+ t[0...*]
+ :work area (double *)
+ length of t >= 8*n1, if single thread,
+ length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ cdft2d(n1, 2*n2, -1, a, t, ip, w);
+ is
+ cdft2d(n1, 2*n2, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
+ a[j1][j2] *= 1.0 / n1 / n2;
+ }
+ }
+ .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+ [definition]
+ <case1> RDFT
+ R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+ 0<=k1<n1, 0<=k2<n2
+ I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+ 0<=k1<n1, 0<=k2<n2
+ <case2> IRDFT (excluding scale)
+ a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+ (R[j1][j2] *
+ cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) +
+ I[j1][j2] *
+ sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)),
+ 0<=k1<n1, 0<=k2<n2
+ (notes: R[n1-k1][n2-k2] = R[k1][k2],
+ I[n1-k1][n2-k2] = -I[k1][k2],
+ R[n1-k1][0] = R[k1][0],
+ I[n1-k1][0] = -I[k1][0],
+ R[0][n2-k2] = R[0][k2],
+ I[0][n2-k2] = -I[0][k2],
+ 0<k1<n1, 0<k2<n2)
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ rdft2d(n1, n2, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ rdft2d(n1, n2, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ a[0...n1-1][0...n2-1]
+ :input/output data (double **)
+ <case1>
+ output data
+ a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2],
+ a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2],
+ 0<k1<n1, 0<k2<n2/2,
+ a[0][2*k2] = R[0][k2] = R[0][n2-k2],
+ a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2],
+ 0<k2<n2/2,
+ a[k1][0] = R[k1][0] = R[n1-k1][0],
+ a[k1][1] = I[k1][0] = -I[n1-k1][0],
+ a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2],
+ a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2],
+ 0<k1<n1/2,
+ a[0][0] = R[0][0],
+ a[0][1] = R[0][n2/2],
+ a[n1/2][0] = R[n1/2][0],
+ a[n1/2][1] = R[n1/2][n2/2]
+ <case2>
+ input data
+ a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2],
+ a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2],
+ 0<j1<n1, 0<j2<n2/2,
+ a[0][2*j2] = R[0][j2] = R[0][n2-j2],
+ a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2],
+ 0<j2<n2/2,
+ a[j1][0] = R[j1][0] = R[n1-j1][0],
+ a[j1][1] = I[j1][0] = -I[n1-j1][0],
+ a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2],
+ a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2],
+ 0<j1<n1/2,
+ a[0][0] = R[0][0],
+ a[0][1] = R[0][n2/2],
+ a[n1/2][0] = R[n1/2][0],
+ a[n1/2][1] = R[n1/2][n2/2]
+ ---- output ordering ----
+ rdft2d(n1, n2, 1, a, t, ip, w);
+ rdft2dsort(n1, n2, 1, a);
+ // stored data is a[0...n1-1][0...n2+1]:
+ // a[k1][2*k2] = R[k1][k2],
+ // a[k1][2*k2+1] = I[k1][k2],
+ // 0<=k1<n1, 0<=k2<=n2/2.
+ // the stored data is larger than the input data!
+ ---- input ordering ----
+ rdft2dsort(n1, n2, -1, a);
+ rdft2d(n1, n2, -1, a, t, ip, w);
+ t[0...*]
+ :work area (double *)
+ length of t >= 8*n1, if single thread,
+ length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/4) + n2/4
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ rdft2d(n1, n2, 1, a, t, ip, w);
+ is
+ rdft2d(n1, n2, -1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] *= 2.0 / n1 / n2;
+ }
+ }
+ .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+ [definition]
+ <case1> IDCT (excluding scale)
+ C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ cos(pi*j1*(k1+1/2)/n1) *
+ cos(pi*j2*(k2+1/2)/n2),
+ 0<=k1<n1, 0<=k2<n2
+ <case2> DCT
+ C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ cos(pi*(j1+1/2)*k1/n1) *
+ cos(pi*(j2+1/2)*k2/n2),
+ 0<=k1<n1, 0<=k2<n2
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddct2d(n1, n2, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddct2d(n1, n2, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ a[0...n1-1][0...n2-1]
+ :input/output data (double **)
+ output data
+ a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
+ t[0...*]
+ :work area (double *)
+ length of t >= 4*n1, if single thread,
+ length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1/2, n2/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1*3/2, n2*3/2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddct2d(n1, n2, -1, a, t, ip, w);
+ is
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ a[j1][0] *= 0.5;
+ }
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[0][j2] *= 0.5;
+ }
+ ddct2d(n1, n2, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] *= 4.0 / n1 / n2;
+ }
+ }
+ .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+ [definition]
+ <case1> IDST (excluding scale)
+ S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] *
+ sin(pi*j1*(k1+1/2)/n1) *
+ sin(pi*j2*(k2+1/2)/n2),
+ 0<=k1<n1, 0<=k2<n2
+ <case2> DST
+ S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+ sin(pi*(j1+1/2)*k1/n1) *
+ sin(pi*(j2+1/2)*k2/n2),
+ 0<k1<=n1, 0<k2<=n2
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddst2d(n1, n2, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddst2d(n1, n2, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ a[0...n1-1][0...n2-1]
+ :input/output data (double **)
+ <case1>
+ input data
+ a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2,
+ a[j1][0] = A[j1][n2], 0<j1<n1,
+ a[0][j2] = A[n1][j2], 0<j2<n2,
+ a[0][0] = A[n1][n2]
+ (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
+ output data
+ a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
+ <case2>
+ output data
+ a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2,
+ a[k1][0] = S[k1][n2], 0<k1<n1,
+ a[0][k2] = S[n1][k2], 0<k2<n2,
+ a[0][0] = S[n1][n2]
+ (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
+ t[0...*]
+ :work area (double *)
+ length of t >= 4*n1, if single thread,
+ length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1/2, n2/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1*3/2, n2*3/2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddst2d(n1, n2, -1, a, t, ip, w);
+ is
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ a[j1][0] *= 0.5;
+ }
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[0][j2] *= 0.5;
+ }
+ ddst2d(n1, n2, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] *= 4.0 / n1 / n2;
+ }
+ }
+ .
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#define fft2d_alloc_error_check(p) { \
+ if ((p) == NULL) { \
+ fprintf(stderr, "fft2d memory allocation error\n"); \
+ exit(1); \
+ } \
+}
+
+
+#ifdef USE_FFT2D_PTHREADS
+#define USE_FFT2D_THREADS
+#ifndef FFT2D_MAX_THREADS
+#define FFT2D_MAX_THREADS 4
+#endif
+#ifndef FFT2D_THREADS_BEGIN_N
+#define FFT2D_THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#define fft2d_thread_t pthread_t
+#define fft2d_thread_create(thp,func,argp) { \
+ if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
+ fprintf(stderr, "fft2d thread error\n"); \
+ exit(1); \
+ } \
+}
+#define fft2d_thread_wait(th) { \
+ if (pthread_join(th, NULL) != 0) { \
+ fprintf(stderr, "fft2d thread error\n"); \
+ exit(1); \
+ } \
+}
+#endif /* USE_FFT2D_PTHREADS */
+
+
+#ifdef USE_FFT2D_WINTHREADS
+#define USE_FFT2D_THREADS
+#ifndef FFT2D_MAX_THREADS
+#define FFT2D_MAX_THREADS 4
+#endif
+#ifndef FFT2D_THREADS_BEGIN_N
+#define FFT2D_THREADS_BEGIN_N 131072
+#endif
+#include <windows.h>
+#define fft2d_thread_t HANDLE
+#define fft2d_thread_create(thp,func,argp) { \
+ DWORD thid; \
+ *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
+ if (*(thp) == 0) { \
+ fprintf(stderr, "fft2d thread error\n"); \
+ exit(1); \
+ } \
+}
+#define fft2d_thread_wait(th) { \
+ WaitForSingleObject(th, INFINITE); \
+ CloseHandle(th); \
+}
+#endif /* USE_FFT2D_WINTHREADS */
+
+
+void cdft2d(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w);
+#ifdef USE_FFT2D_THREADS
+ void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a,
+ int *ip, double *w);
+ void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+ int n, itnull, nthread, nt, i;
+
+ n = n1 << 1;
+ if (n < n2) {
+ n = n2;
+ }
+ if (n > (ip[0] << 2)) {
+ makewt(n >> 2, ip, w);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nthread = 1;
+#ifdef USE_FFT2D_THREADS
+ nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+ nt = 8 * nthread * n1;
+ if (n2 == 4 * nthread) {
+ nt >>= 1;
+ } else if (n2 < 4 * nthread) {
+ nt >>= 2;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft2d_alloc_error_check(t);
+ }
+#ifdef USE_FFT2D_THREADS
+ if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+ xdft2d0_subth(n1, n2, 0, isgn, a, ip, w);
+ cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+ } else
+#endif /* USE_FFT2D_THREADS */
+ {
+ for (i = 0; i < n1; i++) {
+ cdft(n2, isgn, a[i], ip, w);
+ }
+ cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+void rdft2d(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void rdft(int n, int isgn, double *a, int *ip, double *w);
+ void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w);
+ void rdft2d_sub(int n1, int n2, int isgn, double **a);
+#ifdef USE_FFT2D_THREADS
+ void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a,
+ int *ip, double *w);
+ void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+ int n, nw, nc, itnull, nthread, nt, i;
+
+ n = n1 << 1;
+ if (n < n2) {
+ n = n2;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n2 > (nc << 2)) {
+ nc = n2 >> 2;
+ makect(nc, ip, w + nw);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nthread = 1;
+#ifdef USE_FFT2D_THREADS
+ nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+ nt = 8 * nthread * n1;
+ if (n2 == 4 * nthread) {
+ nt >>= 1;
+ } else if (n2 < 4 * nthread) {
+ nt >>= 2;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft2d_alloc_error_check(t);
+ }
+#ifdef USE_FFT2D_THREADS
+ if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+ if (isgn < 0) {
+ rdft2d_sub(n1, n2, isgn, a);
+ cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+ }
+ xdft2d0_subth(n1, n2, 1, isgn, a, ip, w);
+ if (isgn >= 0) {
+ cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+ rdft2d_sub(n1, n2, isgn, a);
+ }
+ } else
+#endif /* USE_FFT2D_THREADS */
+ {
+ if (isgn < 0) {
+ rdft2d_sub(n1, n2, isgn, a);
+ cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ rdft(n2, isgn, a[i], ip, w);
+ }
+ if (isgn >= 0) {
+ cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+ rdft2d_sub(n1, n2, isgn, a);
+ }
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+void rdft2dsort(int n1, int n2, int isgn, double **a)
+{
+ int n1h, i;
+ double x, y;
+
+ n1h = n1 >> 1;
+ if (isgn < 0) {
+ for (i = n1h + 1; i < n1; i++) {
+ a[i][0] = a[i][n2 + 1];
+ a[i][1] = a[i][n2];
+ }
+ a[0][1] = a[0][n2];
+ a[n1h][1] = a[n1h][n2];
+ } else {
+ for (i = n1h + 1; i < n1; i++) {
+ y = a[i][0];
+ x = a[i][1];
+ a[i][n2] = x;
+ a[i][n2 + 1] = y;
+ a[n1 - i][n2] = x;
+ a[n1 - i][n2 + 1] = -y;
+ a[i][0] = a[n1 - i][0];
+ a[i][1] = -a[n1 - i][1];
+ }
+ a[0][n2] = a[0][1];
+ a[0][n2 + 1] = 0;
+ a[0][1] = 0;
+ a[n1h][n2] = a[n1h][1];
+ a[n1h][n2 + 1] = 0;
+ a[n1h][1] = 0;
+ }
+}
+
+
+void ddct2d(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a,
+ double *t, int *ip, double *w);
+#ifdef USE_FFT2D_THREADS
+ void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a,
+ int *ip, double *w);
+ void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a,
+ double *t, int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+ int n, nw, nc, itnull, nthread, nt, i;
+
+ n = n1;
+ if (n < n2) {
+ n = n2;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > nc) {
+ nc = n;
+ makect(nc, ip, w + nw);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nthread = 1;
+#ifdef USE_FFT2D_THREADS
+ nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+ nt = 4 * nthread * n1;
+ if (n2 == 2 * nthread) {
+ nt >>= 1;
+ } else if (n2 < 2 * nthread) {
+ nt >>= 2;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft2d_alloc_error_check(t);
+ }
+#ifdef USE_FFT2D_THREADS
+ if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+ ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w);
+ ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w);
+ } else
+#endif /* USE_FFT2D_THREADS */
+ {
+ for (i = 0; i < n1; i++) {
+ ddct(n2, isgn, a[i], ip, w);
+ }
+ ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w);
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+void ddst2d(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a,
+ double *t, int *ip, double *w);
+#ifdef USE_FFT2D_THREADS
+ void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a,
+ int *ip, double *w);
+ void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a,
+ double *t, int *ip, double *w);
+#endif /* USE_FFT2D_THREADS */
+ int n, nw, nc, itnull, nthread, nt, i;
+
+ n = n1;
+ if (n < n2) {
+ n = n2;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > nc) {
+ nc = n;
+ makect(nc, ip, w + nw);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nthread = 1;
+#ifdef USE_FFT2D_THREADS
+ nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+ nt = 4 * nthread * n1;
+ if (n2 == 2 * nthread) {
+ nt >>= 1;
+ } else if (n2 < 2 * nthread) {
+ nt >>= 2;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft2d_alloc_error_check(t);
+ }
+#ifdef USE_FFT2D_THREADS
+ if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
+ ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w);
+ ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w);
+ } else
+#endif /* USE_FFT2D_THREADS */
+ {
+ for (i = 0; i < n1; i++) {
+ ddst(n2, isgn, a[i], ip, w);
+ }
+ ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w);
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+/* -------- child routines -------- */
+
+
+void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w)
+{
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ int i, j;
+
+ if (n2 > 4) {
+ for (j = 0; j < n2; j += 8) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j];
+ t[2 * i + 1] = a[i][j + 1];
+ t[2 * n1 + 2 * i] = a[i][j + 2];
+ t[2 * n1 + 2 * i + 1] = a[i][j + 3];
+ t[4 * n1 + 2 * i] = a[i][j + 4];
+ t[4 * n1 + 2 * i + 1] = a[i][j + 5];
+ t[6 * n1 + 2 * i] = a[i][j + 6];
+ t[6 * n1 + 2 * i + 1] = a[i][j + 7];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j] = t[2 * i];
+ a[i][j + 1] = t[2 * i + 1];
+ a[i][j + 2] = t[2 * n1 + 2 * i];
+ a[i][j + 3] = t[2 * n1 + 2 * i + 1];
+ a[i][j + 4] = t[4 * n1 + 2 * i];
+ a[i][j + 5] = t[4 * n1 + 2 * i + 1];
+ a[i][j + 6] = t[6 * n1 + 2 * i];
+ a[i][j + 7] = t[6 * n1 + 2 * i + 1];
+ }
+ }
+ } else if (n2 == 4) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][0];
+ t[2 * i + 1] = a[i][1];
+ t[2 * n1 + 2 * i] = a[i][2];
+ t[2 * n1 + 2 * i + 1] = a[i][3];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][0] = t[2 * i];
+ a[i][1] = t[2 * i + 1];
+ a[i][2] = t[2 * n1 + 2 * i];
+ a[i][3] = t[2 * n1 + 2 * i + 1];
+ }
+ } else if (n2 == 2) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][0];
+ t[2 * i + 1] = a[i][1];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][0] = t[2 * i];
+ a[i][1] = t[2 * i + 1];
+ }
+ }
+}
+
+
+void rdft2d_sub(int n1, int n2, int isgn, double **a)
+{
+ int n1h, i, j;
+ double xi;
+
+ n1h = n1 >> 1;
+ if (isgn < 0) {
+ for (i = 1; i < n1h; i++) {
+ j = n1 - i;
+ xi = a[i][0] - a[j][0];
+ a[i][0] += a[j][0];
+ a[j][0] = xi;
+ xi = a[j][1] - a[i][1];
+ a[i][1] += a[j][1];
+ a[j][1] = xi;
+ }
+ } else {
+ for (i = 1; i < n1h; i++) {
+ j = n1 - i;
+ a[j][0] = 0.5 * (a[i][0] - a[j][0]);
+ a[i][0] -= a[j][0];
+ a[j][1] = 0.5 * (a[i][1] + a[j][1]);
+ a[i][1] -= a[j][1];
+ }
+ }
+}
+
+
+void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a,
+ double *t, int *ip, double *w)
+{
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ int i, j;
+
+ if (n2 > 2) {
+ for (j = 0; j < n2; j += 4) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][j];
+ t[n1 + i] = a[i][j + 1];
+ t[2 * n1 + i] = a[i][j + 2];
+ t[3 * n1 + i] = a[i][j + 3];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ ddct(n1, isgn, &t[2 * n1], ip, w);
+ ddct(n1, isgn, &t[3 * n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ ddst(n1, isgn, &t[2 * n1], ip, w);
+ ddst(n1, isgn, &t[3 * n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][j] = t[i];
+ a[i][j + 1] = t[n1 + i];
+ a[i][j + 2] = t[2 * n1 + i];
+ a[i][j + 3] = t[3 * n1 + i];
+ }
+ }
+ } else if (n2 == 2) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][0];
+ t[n1 + i] = a[i][1];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][0] = t[i];
+ a[i][1] = t[n1 + i];
+ }
+ }
+}
+
+
+#ifdef USE_FFT2D_THREADS
+struct fft2d_arg_st {
+ int nthread;
+ int n0;
+ int n1;
+ int n2;
+ int ic;
+ int isgn;
+ double **a;
+ double *t;
+ int *ip;
+ double *w;
+};
+typedef struct fft2d_arg_st fft2d_arg_t;
+
+
+void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a,
+ int *ip, double *w)
+{
+ void *xdft2d0_th(void *p);
+ fft2d_thread_t th[FFT2D_MAX_THREADS];
+ fft2d_arg_t ag[FFT2D_MAX_THREADS];
+ int nthread, i;
+
+ nthread = FFT2D_MAX_THREADS;
+ if (nthread > n1) {
+ nthread = n1;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].ic = icr;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft2d_thread_create(&th[i], xdft2d0_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft2d_thread_wait(th[i]);
+ }
+}
+
+
+void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t,
+ int *ip, double *w)
+{
+ void *cdft2d_th(void *p);
+ fft2d_thread_t th[FFT2D_MAX_THREADS];
+ fft2d_arg_t ag[FFT2D_MAX_THREADS];
+ int nthread, nt, i;
+
+ nthread = FFT2D_MAX_THREADS;
+ nt = 8 * n1;
+ if (n2 == 4 * FFT2D_MAX_THREADS) {
+ nt >>= 1;
+ } else if (n2 < 4 * FFT2D_MAX_THREADS) {
+ nthread = n2 >> 1;
+ nt >>= 2;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].t = &t[nt * i];
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft2d_thread_create(&th[i], cdft2d_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft2d_thread_wait(th[i]);
+ }
+}
+
+
+void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a,
+ int *ip, double *w)
+{
+ void *ddxt2d0_th(void *p);
+ fft2d_thread_t th[FFT2D_MAX_THREADS];
+ fft2d_arg_t ag[FFT2D_MAX_THREADS];
+ int nthread, i;
+
+ nthread = FFT2D_MAX_THREADS;
+ if (nthread > n1) {
+ nthread = n1;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].ic = ics;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft2d_thread_create(&th[i], ddxt2d0_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft2d_thread_wait(th[i]);
+ }
+}
+
+
+void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a,
+ double *t, int *ip, double *w)
+{
+ void *ddxt2d_th(void *p);
+ fft2d_thread_t th[FFT2D_MAX_THREADS];
+ fft2d_arg_t ag[FFT2D_MAX_THREADS];
+ int nthread, nt, i;
+
+ nthread = FFT2D_MAX_THREADS;
+ nt = 4 * n1;
+ if (n2 == 2 * FFT2D_MAX_THREADS) {
+ nt >>= 1;
+ } else if (n2 < 2 * FFT2D_MAX_THREADS) {
+ nthread = n2;
+ nt >>= 2;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].ic = ics;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].t = &t[nt * i];
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft2d_thread_create(&th[i], ddxt2d_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft2d_thread_wait(th[i]);
+ }
+}
+
+
+void *xdft2d0_th(void *p)
+{
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ void rdft(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, icr, isgn, *ip, i;
+ double **a, *w;
+
+ nthread = ((fft2d_arg_t *) p)->nthread;
+ n0 = ((fft2d_arg_t *) p)->n0;
+ n1 = ((fft2d_arg_t *) p)->n1;
+ n2 = ((fft2d_arg_t *) p)->n2;
+ icr = ((fft2d_arg_t *) p)->ic;
+ isgn = ((fft2d_arg_t *) p)->isgn;
+ a = ((fft2d_arg_t *) p)->a;
+ ip = ((fft2d_arg_t *) p)->ip;
+ w = ((fft2d_arg_t *) p)->w;
+ if (icr == 0) {
+ for (i = n0; i < n1; i += nthread) {
+ cdft(n2, isgn, a[i], ip, w);
+ }
+ } else {
+ for (i = n0; i < n1; i += nthread) {
+ rdft(n2, isgn, a[i], ip, w);
+ }
+ }
+ return (void *) 0;
+}
+
+
+void *cdft2d_th(void *p)
+{
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, isgn, *ip, i, j;
+ double **a, *t, *w;
+
+ nthread = ((fft2d_arg_t *) p)->nthread;
+ n0 = ((fft2d_arg_t *) p)->n0;
+ n1 = ((fft2d_arg_t *) p)->n1;
+ n2 = ((fft2d_arg_t *) p)->n2;
+ isgn = ((fft2d_arg_t *) p)->isgn;
+ a = ((fft2d_arg_t *) p)->a;
+ t = ((fft2d_arg_t *) p)->t;
+ ip = ((fft2d_arg_t *) p)->ip;
+ w = ((fft2d_arg_t *) p)->w;
+ if (n2 > 4 * nthread) {
+ for (j = 8 * n0; j < n2; j += 8 * nthread) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j];
+ t[2 * i + 1] = a[i][j + 1];
+ t[2 * n1 + 2 * i] = a[i][j + 2];
+ t[2 * n1 + 2 * i + 1] = a[i][j + 3];
+ t[4 * n1 + 2 * i] = a[i][j + 4];
+ t[4 * n1 + 2 * i + 1] = a[i][j + 5];
+ t[6 * n1 + 2 * i] = a[i][j + 6];
+ t[6 * n1 + 2 * i + 1] = a[i][j + 7];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j] = t[2 * i];
+ a[i][j + 1] = t[2 * i + 1];
+ a[i][j + 2] = t[2 * n1 + 2 * i];
+ a[i][j + 3] = t[2 * n1 + 2 * i + 1];
+ a[i][j + 4] = t[4 * n1 + 2 * i];
+ a[i][j + 5] = t[4 * n1 + 2 * i + 1];
+ a[i][j + 6] = t[6 * n1 + 2 * i];
+ a[i][j + 7] = t[6 * n1 + 2 * i + 1];
+ }
+ }
+ } else if (n2 == 4 * nthread) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][4 * n0];
+ t[2 * i + 1] = a[i][4 * n0 + 1];
+ t[2 * n1 + 2 * i] = a[i][4 * n0 + 2];
+ t[2 * n1 + 2 * i + 1] = a[i][4 * n0 + 3];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][4 * n0] = t[2 * i];
+ a[i][4 * n0 + 1] = t[2 * i + 1];
+ a[i][4 * n0 + 2] = t[2 * n1 + 2 * i];
+ a[i][4 * n0 + 3] = t[2 * n1 + 2 * i + 1];
+ }
+ } else if (n2 == 2 * nthread) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][2 * n0];
+ t[2 * i + 1] = a[i][2 * n0 + 1];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][2 * n0] = t[2 * i];
+ a[i][2 * n0 + 1] = t[2 * i + 1];
+ }
+ }
+ return (void *) 0;
+}
+
+
+void *ddxt2d0_th(void *p)
+{
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, ics, isgn, *ip, i;
+ double **a, *w;
+
+ nthread = ((fft2d_arg_t *) p)->nthread;
+ n0 = ((fft2d_arg_t *) p)->n0;
+ n1 = ((fft2d_arg_t *) p)->n1;
+ n2 = ((fft2d_arg_t *) p)->n2;
+ ics = ((fft2d_arg_t *) p)->ic;
+ isgn = ((fft2d_arg_t *) p)->isgn;
+ a = ((fft2d_arg_t *) p)->a;
+ ip = ((fft2d_arg_t *) p)->ip;
+ w = ((fft2d_arg_t *) p)->w;
+ if (ics == 0) {
+ for (i = n0; i < n1; i += nthread) {
+ ddct(n2, isgn, a[i], ip, w);
+ }
+ } else {
+ for (i = n0; i < n1; i += nthread) {
+ ddst(n2, isgn, a[i], ip, w);
+ }
+ }
+ return (void *) 0;
+}
+
+
+void *ddxt2d_th(void *p)
+{
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, ics, isgn, *ip, i, j;
+ double **a, *t, *w;
+
+ nthread = ((fft2d_arg_t *) p)->nthread;
+ n0 = ((fft2d_arg_t *) p)->n0;
+ n1 = ((fft2d_arg_t *) p)->n1;
+ n2 = ((fft2d_arg_t *) p)->n2;
+ ics = ((fft2d_arg_t *) p)->ic;
+ isgn = ((fft2d_arg_t *) p)->isgn;
+ a = ((fft2d_arg_t *) p)->a;
+ t = ((fft2d_arg_t *) p)->t;
+ ip = ((fft2d_arg_t *) p)->ip;
+ w = ((fft2d_arg_t *) p)->w;
+ if (n2 > 2 * nthread) {
+ for (j = 4 * n0; j < n2; j += 4 * nthread) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][j];
+ t[n1 + i] = a[i][j + 1];
+ t[2 * n1 + i] = a[i][j + 2];
+ t[3 * n1 + i] = a[i][j + 3];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ ddct(n1, isgn, &t[2 * n1], ip, w);
+ ddct(n1, isgn, &t[3 * n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ ddst(n1, isgn, &t[2 * n1], ip, w);
+ ddst(n1, isgn, &t[3 * n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][j] = t[i];
+ a[i][j + 1] = t[n1 + i];
+ a[i][j + 2] = t[2 * n1 + i];
+ a[i][j + 3] = t[3 * n1 + i];
+ }
+ }
+ } else if (n2 == 2 * nthread) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][2 * n0];
+ t[n1 + i] = a[i][2 * n0 + 1];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][2 * n0] = t[i];
+ a[i][2 * n0 + 1] = t[n1 + i];
+ }
+ } else if (n2 == nthread) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][n0];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][n0] = t[i];
+ }
+ }
+ return (void *) 0;
+}
+#endif /* USE_FFT2D_THREADS */
+
diff --git a/src/fft2d/fft2d/fftsg2d.f b/src/fft2d/fft2d/fftsg2d.f
new file mode 100644
index 0000000..49a9d6c
--- /dev/null
+++ b/src/fft2d/fft2d/fftsg2d.f
@@ -0,0 +1,562 @@
+! Fast Fourier/Cosine/Sine Transform
+! dimension :two
+! data length :power of 2
+! decimation :frequency
+! radix :split-radix, row-column
+! data :inplace
+! table :use
+! subroutines
+! cdft2d: Complex Discrete Fourier Transform
+! rdft2d: Real Discrete Fourier Transform
+! ddct2d: Discrete Cosine Transform
+! ddst2d: Discrete Sine Transform
+! necessary package
+! fftsg.f : 1D-FFT package
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+! [definition]
+! <case1>
+! X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) *
+! exp(2*pi*i*j1*k1/n1) *
+! exp(2*pi*i*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2>
+! X(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 x(j1,j2) *
+! exp(-2*pi*i*j1*k1/n1) *
+! exp(-2*pi*i*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call cdft2d(n1max, 2*n1, n2, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call cdft2d(n1max, 2*n1, n2, -1, a, t, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! 2*n1 :data length (integer)
+! n1 >= 1, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 1, n2 = power of 2
+! a(0:2*n1-1,0:n2-1)
+! :input/output data (real*8)
+! input data
+! a(2*j1,j2) = Re(x(j1,j2)),
+! a(2*j1+1,j2) = Im(x(j1,j2)),
+! 0<=j1<n1, 0<=j2<n2
+! output data
+! a(2*k1,k2) = Re(X(k1,k2)),
+! a(2*k1+1,k2) = Im(X(k1,k2)),
+! 0<=k1<n1, 0<=k2<n2
+! t(0:8*n2-1)
+! :work area (real*8)
+! length of t >= 8*n2
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1, n2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/2, n2/2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call cdft2d(n1max, 2*n1, n2, -1, a, t, ip, w)
+! is
+! call cdft2d(n1max, 2*n1, n2, 1, a, t, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, 2 * n1 - 1
+! a(j1, j2) = a(j1, j2) * (1.0d0 / n1 / n2)
+! end do
+! end do
+! .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+! [definition]
+! <case1> RDFT
+! R(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! I(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2> IRDFT (excluding scale)
+! a(k1,k2) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+! (R(j1,j2) *
+! cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) +
+! I(j1,j2) *
+! sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)),
+! 0<=k1<n1, 0<=k2<n2
+! (notes: R(n1-k1,n2-k2) = R(k1,k2),
+! I(n1-k1,n2-k2) = -I(k1,k2),
+! R(n1-k1,0) = R(k1,0),
+! I(n1-k1,0) = -I(k1,0),
+! R(0,n2-k2) = R(0,k2),
+! I(0,n2-k2) = -I(0,k2),
+! 0<k1<n1, 0<k2<n2)
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! a(0:n1-1,0:n2-1)
+! :input/output data (real*8)
+! <case1>
+! output data
+! a(2*k1,k2) = R(k1,k2) = R(n1-k1,n2-k2),
+! a(2*k1+1,k2) = I(k1,k2) = -I(n1-k1,n2-k2),
+! 0<k1<n1/2, 0<k2<n2,
+! a(2*k1,0) = R(k1,0) = R(n1-k1,0),
+! a(2*k1+1,0) = I(k1,0) = -I(n1-k1,0),
+! 0<k1<n1/2,
+! a(0,k2) = R(0,k2) = R(0,n2-k2),
+! a(1,k2) = I(0,k2) = -I(0,n2-k2),
+! a(1,n2-k2) = R(n1/2,k2) = R(n1/2,n2-k2),
+! a(0,n2-k2) = -I(n1/2,k2) = I(n1/2,n2-k2),
+! 0<k2<n2/2,
+! a(0,0) = R(0,0),
+! a(1,0) = R(n1/2,0),
+! a(0,n2/2) = R(0,n2/2),
+! a(1,n2/2) = R(n1/2,n2/2)
+! <case2>
+! input data
+! a(2*j1,j2) = R(j1,j2) = R(n1-j1,n2-j2),
+! a(2*j1+1,j2) = I(j1,j2) = -I(n1-j1,n2-j2),
+! 0<j1<n1/2, 0<j2<n2,
+! a(2*j1,0) = R(j1,0) = R(n1-j1,0),
+! a(2*j1+1,0) = I(j1,0) = -I(n1-j1,0),
+! 0<j1<n1/2,
+! a(0,j2) = R(0,j2) = R(0,n2-j2),
+! a(1,j2) = I(0,j2) = -I(0,n2-j2),
+! a(1,n2-j2) = R(n1/2,j2) = R(n1/2,n2-j2),
+! a(0,n2-j2) = -I(n1/2,j2) = I(n1/2,n2-j2),
+! 0<j2<n2/2,
+! a(0,0) = R(0,0),
+! a(1,0) = R(n1/2,0),
+! a(0,n2/2) = R(0,n2/2),
+! a(1,n2/2) = R(n1/2,n2/2)
+! ---- output ordering ----
+! call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
+! call rdft2dsort(n1max, n1, n2, 1, a)
+! ! stored data is a(0:n1-1,0:n2+1):
+! ! a(2*k1,k2) = R(k1,k2),
+! ! a(2*k1+1,k2) = I(k1,k2),
+! ! 0<=k1<=n1/2, 0<=k2<n2.
+! ! the stored data is larger than the input data!
+! ---- input ordering ----
+! call rdft2dsort(n1max, n1, n2, -1, a)
+! call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
+! t(0:8*n2-1)
+! :work area (real*8)
+! length of t >= 8*n2
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/4, n2/2) + n1/4
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call rdft2d(n1max, n1, n2, 1, a, t, ip, w)
+! is
+! call rdft2d(n1max, n1, n2, -1, a, t, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2) = a(j1, j2) * (2.0d0 / n1 / n2)
+! end do
+! end do
+! .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+! [definition]
+! <case1> IDCT (excluding scale)
+! C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! cos(pi*j1*(k1+1/2)/n1) *
+! cos(pi*j2*(k2+1/2)/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2> DCT
+! C(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! cos(pi*(j1+1/2)*k1/n1) *
+! cos(pi*(j2+1/2)*k2/n2),
+! 0<=k1<n1, 0<=k2<n2
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! a(0:n1-1,0:n2-1)
+! :input/output data (real*8)
+! output data
+! a(k1,k2) = C(k1,k2), 0<=k1<n1, 0<=k2<n2
+! t(0:4*n2-1)
+! :work area (real*8)
+! length of t >= 4*n2
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2/2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1*3/2, n2*3/2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddct2d(n1max, n1, n2, -1, a, t, ip, w)
+! is
+! do j1 = 0, n1 - 1
+! a(j1, 0) = a(j1, 0) * 0.5d0
+! end do
+! do j2 = 0, n2 - 1
+! a(0, j2) = a(0, j2) * 0.5d0
+! end do
+! call ddct2d(n1max, n1, n2, 1, a, t, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2) = a(j1, j2) * (4.0d0 / n1 / n2)
+! end do
+! end do
+! .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+! [definition]
+! <case1> IDST (excluding scale)
+! S(k1,k2) = sum_j1=1^n1 sum_j2=1^n2 A(j1,j2) *
+! sin(pi*j1*(k1+1/2)/n1) *
+! sin(pi*j2*(k2+1/2)/n2),
+! 0<=k1<n1, 0<=k2<n2
+! <case2> DST
+! S(k1,k2) = sum_j1=0^n1-1 sum_j2=0^n2-1 a(j1,j2) *
+! sin(pi*(j1+1/2)*k1/n1) *
+! sin(pi*(j2+1/2)*k2/n2),
+! 0<k1<=n1, 0<k2<=n2
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+! [parameters]
+! n1max :row size of the 2D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! a(0:n1-1,0:n2-1)
+! :input/output data (real*8)
+! <case1>
+! input data
+! a(j1,j2) = A(j1,j2), 0<j1<n1, 0<j2<n2,
+! a(j1,0) = A(j1,n2), 0<j1<n1,
+! a(0,j2) = A(n1,j2), 0<j2<n2,
+! a(0,0) = A(n1,n2)
+! (i.e. A(j1,j2) = a(mod(j1,n1),mod(j2,n2)))
+! output data
+! a(k1,k2) = S(k1,k2), 0<=k1<n1, 0<=k2<n2
+! <case2>
+! output data
+! a(k1,k2) = S(k1,k2), 0<k1<n1, 0<k2<n2,
+! a(k1,0) = S(k1,n2), 0<k1<n1,
+! a(0,k2) = S(n1,k2), 0<k2<n2,
+! a(0,0) = S(n1,n2)
+! (i.e. S(k1,k2) = a(mod(k1,n1),mod(k2,n2)))
+! t(0:4*n2-1)
+! :work area (real*8)
+! length of t >= 4*n2
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2/2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1*3/2, n2*3/2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddst2d(n1max, n1, n2, -1, a, t, ip, w)
+! is
+! do j1 = 0, n1 - 1
+! a(j1, 0) = a(j1, 0) * 0.5d0
+! end do
+! do j2 = 0, n2 - 1
+! a(0, j2) = a(0, j2) * 0.5d0
+! end do
+! call ddst2d(n1max, n1, n2, 1, a, t, ip, w)
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2) = a(j1, j2) * (4.0d0 / n1 / n2)
+! end do
+! end do
+! .
+!
+!
+ subroutine cdft2d(n1max, n1, n2, isgn, a, t, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1),
+ & w(0 : *)
+ n = max(n1, 2 * n2)
+ if (n .gt. 4 * ip(0)) then
+ call makewt(n / 4, ip, w)
+ end if
+ do j = 0, n2 - 1
+ call cdft(n1, isgn, a(0, j), ip, w)
+ end do
+ call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+ end
+!
+ subroutine rdft2d(n1max, n1, n2, isgn, a, t, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1),
+ & w(0 : *)
+ n = max(n1, 2 * n2)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n1 .gt. 4 * nc) then
+ nc = n1 / 4
+ call makect(nc, ip, w(nw))
+ end if
+ if (isgn .lt. 0) then
+ call rdft2d_sub(n1max, n1, n2, isgn, a)
+ call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+ end if
+ do j = 0, n2 - 1
+ call rdft(n1, isgn, a(0, j), ip, w)
+ end do
+ if (isgn .ge. 0) then
+ call cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+ call rdft2d_sub(n1max, n1, n2, isgn, a)
+ end if
+ end
+!
+ subroutine rdft2dsort(n1max, n1, n2, isgn, a)
+ integer n1max, n1, n2, isgn, n2h, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), x, y
+ n2h = n2 / 2
+ if (isgn .lt. 0) then
+ do j = n2h + 1, n2 - 1
+ a(0, j) = a(n1 + 1, j)
+ a(1, j) = a(n1, j)
+ end do
+ a(1, 0) = a(n1, 0)
+ a(1, n2h) = a(n1, n2h)
+ else
+ do j = n2h + 1, n2 - 1
+ y = a(0, j)
+ x = a(1, j)
+ a(n1, j) = x
+ a(n1 + 1, j) = y
+ a(n1, n2 - j) = x
+ a(n1 + 1, n2 - j) = -y
+ a(0, j) = a(0, n2 - j)
+ a(1, j) = -a(1, n2 - j)
+ end do
+ a(n1, 0) = a(1, 0)
+ a(n1 + 1, 0) = 0
+ a(1, 0) = 0
+ a(n1, n2h) = a(1, n2h)
+ a(n1 + 1, n2h) = 0
+ a(1, n2h) = 0
+ end if
+ end
+!
+ subroutine ddct2d(n1max, n1, n2, isgn, a, t, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1),
+ & w(0 : *)
+ n = max(n1, n2)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. nc) then
+ nc = n
+ call makect(nc, ip, w(nw))
+ end if
+ do j = 0, n2 - 1
+ call ddct(n1, isgn, a(0, j), ip, w)
+ end do
+ call ddxt2d_sub(n1max, n1, n2, 0, isgn, a, t, ip, w)
+ end
+!
+ subroutine ddst2d(n1max, n1, n2, isgn, a, t, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), n, nw, nc, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1),
+ & w(0 : *)
+ n = max(n1, n2)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. nc) then
+ nc = n
+ call makect(nc, ip, w(nw))
+ end if
+ do j = 0, n2 - 1
+ call ddst(n1, isgn, a(0, j), ip, w)
+ end do
+ call ddxt2d_sub(n1max, n1, n2, 1, isgn, a, t, ip, w)
+ end
+!
+! -------- child routines --------
+!
+ subroutine cdft2d_sub(n1max, n1, n2, isgn, a, t, ip, w)
+ integer n1max, n1, n2, isgn, ip(0 : *), i, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 8 * n2 - 1),
+ & w(0 : *)
+ if (n1 .gt. 4) then
+ do i = 0, n1 - 8, 8
+ do j = 0, n2 - 1
+ t(2 * j) = a(i, j)
+ t(2 * j + 1) = a(i + 1, j)
+ t(2 * n2 + 2 * j) = a(i + 2, j)
+ t(2 * n2 + 2 * j + 1) = a(i + 3, j)
+ t(4 * n2 + 2 * j) = a(i + 4, j)
+ t(4 * n2 + 2 * j + 1) = a(i + 5, j)
+ t(6 * n2 + 2 * j) = a(i + 6, j)
+ t(6 * n2 + 2 * j + 1) = a(i + 7, j)
+ end do
+ call cdft(2 * n2, isgn, t, ip, w)
+ call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+ call cdft(2 * n2, isgn, t(4 * n2), ip, w)
+ call cdft(2 * n2, isgn, t(6 * n2), ip, w)
+ do j = 0, n2 - 1
+ a(i, j) = t(2 * j)
+ a(i + 1, j) = t(2 * j + 1)
+ a(i + 2, j) = t(2 * n2 + 2 * j)
+ a(i + 3, j) = t(2 * n2 + 2 * j + 1)
+ a(i + 4, j) = t(4 * n2 + 2 * j)
+ a(i + 5, j) = t(4 * n2 + 2 * j + 1)
+ a(i + 6, j) = t(6 * n2 + 2 * j)
+ a(i + 7, j) = t(6 * n2 + 2 * j + 1)
+ end do
+ end do
+ else if (n1 .eq. 4) then
+ do j = 0, n2 - 1
+ t(2 * j) = a(0, j)
+ t(2 * j + 1) = a(1, j)
+ t(2 * n2 + 2 * j) = a(2, j)
+ t(2 * n2 + 2 * j + 1) = a(3, j)
+ end do
+ call cdft(2 * n2, isgn, t, ip, w)
+ call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+ do j = 0, n2 - 1
+ a(0, j) = t(2 * j)
+ a(1, j) = t(2 * j + 1)
+ a(2, j) = t(2 * n2 + 2 * j)
+ a(3, j) = t(2 * n2 + 2 * j + 1)
+ end do
+ else if (n1 .eq. 2) then
+ do j = 0, n2 - 1
+ t(2 * j) = a(0, j)
+ t(2 * j + 1) = a(1, j)
+ end do
+ call cdft(2 * n2, isgn, t, ip, w)
+ do j = 0, n2 - 1
+ a(0, j) = t(2 * j)
+ a(1, j) = t(2 * j + 1)
+ end do
+ end if
+ end
+!
+ subroutine rdft2d_sub(n1max, n1, n2, isgn, a)
+ integer n1max, n1, n2, isgn, n2h, i, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), xi
+ n2h = n2 / 2
+ if (isgn .lt. 0) then
+ do i = 1, n2h - 1
+ j = n2 - i
+ xi = a(0, i) - a(0, j)
+ a(0, i) = a(0, i) + a(0, j)
+ a(0, j) = xi
+ xi = a(1, j) - a(1, i)
+ a(1, i) = a(1, i) + a(1, j)
+ a(1, j) = xi
+ end do
+ else
+ do i = 1, n2h - 1
+ j = n2 - i
+ a(0, j) = 0.5d0 * (a(0, i) - a(0, j))
+ a(0, i) = a(0, i) - a(0, j)
+ a(1, j) = 0.5d0 * (a(1, i) + a(1, j))
+ a(1, i) = a(1, i) - a(1, j)
+ end do
+ end if
+ end
+!
+ subroutine ddxt2d_sub(n1max, n1, n2, ics, isgn, a, t,
+ & ip, w)
+ integer n1max, n1, n2, ics, isgn, ip(0 : *), i, j
+ real*8 a(0 : n1max - 1, 0 : n2 - 1), t(0 : 4 * n2 - 1),
+ & w(0 : *)
+ if (n1 .gt. 2) then
+ do i = 0, n1 - 4, 4
+ do j = 0, n2 - 1
+ t(j) = a(i, j)
+ t(n2 + j) = a(i + 1, j)
+ t(2 * n2 + j) = a(i + 2, j)
+ t(3 * n2 + j) = a(i + 3, j)
+ end do
+ if (ics .eq. 0) then
+ call ddct(n2, isgn, t, ip, w)
+ call ddct(n2, isgn, t(n2), ip, w)
+ call ddct(n2, isgn, t(2 * n2), ip, w)
+ call ddct(n2, isgn, t(3 * n2), ip, w)
+ else
+ call ddst(n2, isgn, t, ip, w)
+ call ddst(n2, isgn, t(n2), ip, w)
+ call ddst(n2, isgn, t(2 * n2), ip, w)
+ call ddst(n2, isgn, t(3 * n2), ip, w)
+ end if
+ do j = 0, n2 - 1
+ a(i, j) = t(j)
+ a(i + 1, j) = t(n2 + j)
+ a(i + 2, j) = t(2 * n2 + j)
+ a(i + 3, j) = t(3 * n2 + j)
+ end do
+ end do
+ else if (n1 .eq. 2) then
+ do j = 0, n2 - 1
+ t(j) = a(0, j)
+ t(n2 + j) = a(1, j)
+ end do
+ if (ics .eq. 0) then
+ call ddct(n2, isgn, t, ip, w)
+ call ddct(n2, isgn, t(n2), ip, w)
+ else
+ call ddst(n2, isgn, t, ip, w)
+ call ddst(n2, isgn, t(n2), ip, w)
+ end if
+ do j = 0, n2 - 1
+ a(0, j) = t(j)
+ a(1, j) = t(n2 + j)
+ end do
+ end if
+ end
+!
diff --git a/src/fft2d/fft2d/fftsg3d.c b/src/fft2d/fft2d/fftsg3d.c
new file mode 100644
index 0000000..223c0e2
--- /dev/null
+++ b/src/fft2d/fft2d/fftsg3d.c
@@ -0,0 +1,1695 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+ dimension :three
+ data length :power of 2
+ decimation :frequency
+ radix :split-radix, row-column
+ data :inplace
+ table :use
+functions
+ cdft3d: Complex Discrete Fourier Transform
+ rdft3d: Real Discrete Fourier Transform
+ ddct3d: Discrete Cosine Transform
+ ddst3d: Discrete Sine Transform
+function prototypes
+ void cdft3d(int, int, int, int, double ***, double *, int *, double *);
+ void rdft3d(int, int, int, int, double ***, double *, int *, double *);
+ void rdft3dsort(int, int, int, int, double ***);
+ void ddct3d(int, int, int, int, double ***, double *, int *, double *);
+ void ddst3d(int, int, int, int, double ***, double *, int *, double *);
+necessary package
+ fftsg.c : 1D-FFT package
+macro definitions
+ USE_FFT3D_PTHREADS : default=not defined
+ FFT3D_MAX_THREADS : must be 2^N, default=4
+ FFT3D_THREADS_BEGIN_N : default=65536
+ USE_FFT3D_WINTHREADS : default=not defined
+ FFT3D_MAX_THREADS : must be 2^N, default=4
+ FFT3D_THREADS_BEGIN_N : default=131072
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+ [definition]
+ <case1>
+ X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ x[j1][j2][j3] *
+ exp(2*pi*i*j1*k1/n1) *
+ exp(2*pi*i*j2*k2/n2) *
+ exp(2*pi*i*j3*k3/n3),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ <case2>
+ X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ x[j1][j2][j3] *
+ exp(-2*pi*i*j1*k1/n1) *
+ exp(-2*pi*i*j2*k2/n2) *
+ exp(-2*pi*i*j3*k3/n3),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 1, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 1, n2 = power of 2
+ 2*n3 :data length (int)
+ n3 >= 1, n3 = power of 2
+ a[0...n1-1][0...n2-1][0...2*n3-1]
+ :input/output data (double ***)
+ input data
+ a[j1][j2][2*j3] = Re(x[j1][j2][j3]),
+ a[j1][j2][2*j3+1] = Im(x[j1][j2][j3]),
+ 0<=j1<n1, 0<=j2<n2, 0<=j3<n3
+ output data
+ a[k1][k2][2*k3] = Re(X[k1][k2][k3]),
+ a[k1][k2][2*k3+1] = Im(X[k1][k2][k3]),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ t[0...*]
+ :work area (double *)
+ length of t >= max(8*n1, 8*n2), if single thread,
+ length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS,
+ if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2, n3))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/2, n3/2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
+ is
+ cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= 2 * n3 - 1; j3++) {
+ a[j1][j2][j3] *= 1.0 / n1 / n2 / n3;
+ }
+ }
+ }
+ .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+ [definition]
+ <case1> RDFT
+ R[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ a[j1][j2][j3] *
+ cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+ 2*pi*j3*k3/n3),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ I[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ a[j1][j2][j3] *
+ sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+ 2*pi*j3*k3/n3),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ <case2> IRDFT (excluding scale)
+ a[k1][k2][k3] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ (R[j1][j2][j3] *
+ cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+ 2*pi*j3*k3/n3) +
+ I[j1][j2][j3] *
+ sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+ 2*pi*j3*k3/n3)),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ (notes: R[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = R[k1][k2][k3],
+ I[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = -I[k1][k2][k3],
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3)
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ rdft3d(n1, n2, n3, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ rdft3d(n1, n2, n3, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ n3 :data length (int)
+ n3 >= 2, n3 = power of 2
+ a[0...n1-1][0...n2-1][0...n3-1]
+ :input/output data (double ***)
+ <case1>
+ output data
+ a[k1][k2][2*k3] = R[k1][k2][k3]
+ = R[(n1-k1)%n1][(n2-k2)%n2][n3-k3],
+ a[k1][k2][2*k3+1] = I[k1][k2][k3]
+ = -I[(n1-k1)%n1][(n2-k2)%n2][n3-k3],
+ 0<=k1<n1, 0<=k2<n2, 0<k3<n3/2,
+ (n%m : n mod m),
+ a[k1][k2][0] = R[k1][k2][0]
+ = R[(n1-k1)%n1][n2-k2][0],
+ a[k1][k2][1] = I[k1][k2][0]
+ = -I[(n1-k1)%n1][n2-k2][0],
+ a[k1][n2-k2][1] = R[k1][k2][n3/2]
+ = R[(n1-k1)%n1][n2-k2][n3/2],
+ a[k1][n2-k2][0] = -I[k1][k2][n3/2]
+ = I[(n1-k1)%n1][n2-k2][n3/2],
+ 0<=k1<n1, 0<k2<n2/2,
+ a[k1][0][0] = R[k1][0][0]
+ = R[n1-k1][0][0],
+ a[k1][0][1] = I[k1][0][0]
+ = -I[n1-k1][0][0],
+ a[k1][n2/2][0] = R[k1][n2/2][0]
+ = R[n1-k1][n2/2][0],
+ a[k1][n2/2][1] = I[k1][n2/2][0]
+ = -I[n1-k1][n2/2][0],
+ a[n1-k1][0][1] = R[k1][0][n3/2]
+ = R[n1-k1][0][n3/2],
+ a[n1-k1][0][0] = -I[k1][0][n3/2]
+ = I[n1-k1][0][n3/2],
+ a[n1-k1][n2/2][1] = R[k1][n2/2][n3/2]
+ = R[n1-k1][n2/2][n3/2],
+ a[n1-k1][n2/2][0] = -I[k1][n2/2][n3/2]
+ = I[n1-k1][n2/2][n3/2],
+ 0<k1<n1/2,
+ a[0][0][0] = R[0][0][0],
+ a[0][0][1] = R[0][0][n3/2],
+ a[0][n2/2][0] = R[0][n2/2][0],
+ a[0][n2/2][1] = R[0][n2/2][n3/2],
+ a[n1/2][0][0] = R[n1/2][0][0],
+ a[n1/2][0][1] = R[n1/2][0][n3/2],
+ a[n1/2][n2/2][0] = R[n1/2][n2/2][0],
+ a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
+ <case2>
+ input data
+ a[j1][j2][2*j3] = R[j1][j2][j3]
+ = R[(n1-j1)%n1][(n2-j2)%n2][n3-j3],
+ a[j1][j2][2*j3+1] = I[j1][j2][j3]
+ = -I[(n1-j1)%n1][(n2-j2)%n2][n3-j3],
+ 0<=j1<n1, 0<=j2<n2, 0<j3<n3/2,
+ a[j1][j2][0] = R[j1][j2][0]
+ = R[(n1-j1)%n1][n2-j2][0],
+ a[j1][j2][1] = I[j1][j2][0]
+ = -I[(n1-j1)%n1][n2-j2][0],
+ a[j1][n2-j2][1] = R[j1][j2][n3/2]
+ = R[(n1-j1)%n1][n2-j2][n3/2],
+ a[j1][n2-j2][0] = -I[j1][j2][n3/2]
+ = I[(n1-j1)%n1][n2-j2][n3/2],
+ 0<=j1<n1, 0<j2<n2/2,
+ a[j1][0][0] = R[j1][0][0]
+ = R[n1-j1][0][0],
+ a[j1][0][1] = I[j1][0][0]
+ = -I[n1-j1][0][0],
+ a[j1][n2/2][0] = R[j1][n2/2][0]
+ = R[n1-j1][n2/2][0],
+ a[j1][n2/2][1] = I[j1][n2/2][0]
+ = -I[n1-j1][n2/2][0],
+ a[n1-j1][0][1] = R[j1][0][n3/2]
+ = R[n1-j1][0][n3/2],
+ a[n1-j1][0][0] = -I[j1][0][n3/2]
+ = I[n1-j1][0][n3/2],
+ a[n1-j1][n2/2][1] = R[j1][n2/2][n3/2]
+ = R[n1-j1][n2/2][n3/2],
+ a[n1-j1][n2/2][0] = -I[j1][n2/2][n3/2]
+ = I[n1-j1][n2/2][n3/2],
+ 0<j1<n1/2,
+ a[0][0][0] = R[0][0][0],
+ a[0][0][1] = R[0][0][n3/2],
+ a[0][n2/2][0] = R[0][n2/2][0],
+ a[0][n2/2][1] = R[0][n2/2][n3/2],
+ a[n1/2][0][0] = R[n1/2][0][0],
+ a[n1/2][0][1] = R[n1/2][0][n3/2],
+ a[n1/2][n2/2][0] = R[n1/2][n2/2][0],
+ a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
+ ---- output ordering ----
+ rdft3d(n1, n2, n3, 1, a, t, ip, w);
+ rdft3dsort(n1, n2, n3, 1, a);
+ // stored data is a[0...n1-1][0...n2-1][0...n3+1]:
+ // a[k1][k2][2*k3] = R[k1][k2][k3],
+ // a[k1][k2][2*k3+1] = I[k1][k2][k3],
+ // 0<=k1<n1, 0<=k2<n2, 0<=k3<=n3/2.
+ // the stored data is larger than the input data!
+ ---- input ordering ----
+ rdft3dsort(n1, n2, n3, -1, a);
+ rdft3d(n1, n2, n3, -1, a, t, ip, w);
+ t[0...*]
+ :work area (double *)
+ length of t >= max(8*n1, 8*n2), if single thread,
+ length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS,
+ if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1, n2, n3/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1/2, n2/2, n3/4) + n3/4
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ rdft3d(n1, n2, n3, 1, a, t, ip, w);
+ is
+ rdft3d(n1, n2, n3, -1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[j1][j2][j3] *= 2.0 / n1 / n2 / n3;
+ }
+ }
+ }
+ .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+ [definition]
+ <case1> IDCT (excluding scale)
+ C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ a[j1][j2][j3] *
+ cos(pi*j1*(k1+1/2)/n1) *
+ cos(pi*j2*(k2+1/2)/n2) *
+ cos(pi*j3*(k3+1/2)/n3),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ <case2> DCT
+ C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ a[j1][j2][j3] *
+ cos(pi*(j1+1/2)*k1/n1) *
+ cos(pi*(j2+1/2)*k2/n2) *
+ cos(pi*(j3+1/2)*k3/n3),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddct3d(n1, n2, n3, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddct3d(n1, n2, n3, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ n3 :data length (int)
+ n3 >= 2, n3 = power of 2
+ a[0...n1-1][0...n2-1][0...n3-1]
+ :input/output data (double ***)
+ output data
+ a[k1][k2][k3] = C[k1][k2][k3],
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ t[0...*]
+ :work area (double *)
+ length of t >= max(4*n1, 4*n2), if single thread,
+ length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS,
+ if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1/2, n2/2, n3/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddct3d(n1, n2, n3, -1, a, t, ip, w);
+ is
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2][0] *= 0.5;
+ }
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[j1][0][j3] *= 0.5;
+ }
+ }
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[0][j2][j3] *= 0.5;
+ }
+ }
+ ddct3d(n1, n2, n3, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
+ }
+ }
+ }
+ .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+ [definition]
+ <case1> IDST (excluding scale)
+ S[k1][k2][k3] = sum_j1=1^n1 sum_j2=1^n2 sum_j3=1^n3
+ A[j1][j2][j3] *
+ sin(pi*j1*(k1+1/2)/n1) *
+ sin(pi*j2*(k2+1/2)/n2) *
+ sin(pi*j3*(k3+1/2)/n3),
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ <case2> DST
+ S[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+ a[j1][j2][j3] *
+ sin(pi*(j1+1/2)*k1/n1) *
+ sin(pi*(j2+1/2)*k2/n2) *
+ sin(pi*(j3+1/2)*k3/n3),
+ 0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+ [usage]
+ <case1>
+ ip[0] = 0; // first time only
+ ddst3d(n1, n2, n3, 1, a, t, ip, w);
+ <case2>
+ ip[0] = 0; // first time only
+ ddst3d(n1, n2, n3, -1, a, t, ip, w);
+ [parameters]
+ n1 :data length (int)
+ n1 >= 2, n1 = power of 2
+ n2 :data length (int)
+ n2 >= 2, n2 = power of 2
+ n3 :data length (int)
+ n3 >= 2, n3 = power of 2
+ a[0...n1-1][0...n2-1][0...n3-1]
+ :input/output data (double ***)
+ <case1>
+ input data
+ a[j1%n1][j2%n2][j3%n3] = A[j1][j2][j3],
+ 0<j1<=n1, 0<j2<=n2, 0<j3<=n3,
+ (n%m : n mod m)
+ output data
+ a[k1][k2][k3] = S[k1][k2][k3],
+ 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+ <case2>
+ output data
+ a[k1%n1][k2%n2][k3%n3] = S[k1][k2][k3],
+ 0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+ t[0...*]
+ :work area (double *)
+ length of t >= max(4*n1, 4*n2), if single thread,
+ length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS,
+ if multi threads,
+ t is dynamically allocated, if t == NULL.
+ ip[0...*]
+ :work area for bit reversal (int *)
+ length of ip >= 2+sqrt(n)
+ (n = max(n1/2, n2/2, n3/2))
+ ip[0],ip[1] are pointers of the cos/sin table.
+ w[0...*]
+ :cos/sin table (double *)
+ length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+ w[],ip[] are initialized if ip[0] == 0.
+ [remark]
+ Inverse of
+ ddst3d(n1, n2, n3, -1, a, t, ip, w);
+ is
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2][0] *= 0.5;
+ }
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[j1][0][j3] *= 0.5;
+ }
+ }
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[0][j2][j3] *= 0.5;
+ }
+ }
+ ddst3d(n1, n2, n3, 1, a, t, ip, w);
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
+ }
+ }
+ }
+ .
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#define fft3d_alloc_error_check(p) { \
+ if ((p) == NULL) { \
+ fprintf(stderr, "fft3d memory allocation error\n"); \
+ exit(1); \
+ } \
+}
+
+
+#ifdef USE_FFT3D_PTHREADS
+#define USE_FFT3D_THREADS
+#ifndef FFT3D_MAX_THREADS
+#define FFT3D_MAX_THREADS 4
+#endif
+#ifndef FFT3D_THREADS_BEGIN_N
+#define FFT3D_THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#define fft3d_thread_t pthread_t
+#define fft3d_thread_create(thp,func,argp) { \
+ if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
+ fprintf(stderr, "fft3d thread error\n"); \
+ exit(1); \
+ } \
+}
+#define fft3d_thread_wait(th) { \
+ if (pthread_join(th, NULL) != 0) { \
+ fprintf(stderr, "fft3d thread error\n"); \
+ exit(1); \
+ } \
+}
+#endif /* USE_FFT3D_PTHREADS */
+
+
+#ifdef USE_FFT3D_WINTHREADS
+#define USE_FFT3D_THREADS
+#ifndef FFT3D_MAX_THREADS
+#define FFT3D_MAX_THREADS 4
+#endif
+#ifndef FFT3D_THREADS_BEGIN_N
+#define FFT3D_THREADS_BEGIN_N 131072
+#endif
+#include <windows.h>
+#define fft3d_thread_t HANDLE
+#define fft3d_thread_create(thp,func,argp) { \
+ DWORD thid; \
+ *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
+ if (*(thp) == 0) { \
+ fprintf(stderr, "fft3d thread error\n"); \
+ exit(1); \
+ } \
+}
+#define fft3d_thread_wait(th) { \
+ WaitForSingleObject(th, INFINITE); \
+ CloseHandle(th); \
+}
+#endif /* USE_FFT3D_WINTHREADS */
+
+
+void cdft3d(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w);
+#ifdef USE_FFT3D_THREADS
+ void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+ int n, itnull, nt;
+
+ n = n1;
+ if (n < n2) {
+ n = n2;
+ }
+ n <<= 1;
+ if (n < n3) {
+ n = n3;
+ }
+ if (n > (ip[0] << 2)) {
+ makewt(n >> 2, ip, w);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nt = n1;
+ if (nt < n2) {
+ nt = n2;
+ }
+ nt *= 8;
+#ifdef USE_FFT3D_THREADS
+ nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+ if (n3 == 4) {
+ nt >>= 1;
+ } else if (n3 < 4) {
+ nt >>= 2;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft3d_alloc_error_check(t);
+ }
+#ifdef USE_FFT3D_THREADS
+ if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+ xdft3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
+ cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
+ } else
+#endif /* USE_FFT3D_THREADS */
+ {
+ xdft3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
+ cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+void rdft3d(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w);
+ void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a);
+#ifdef USE_FFT3D_THREADS
+ void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+ int n, nw, nc, itnull, nt;
+
+ n = n1;
+ if (n < n2) {
+ n = n2;
+ }
+ n <<= 1;
+ if (n < n3) {
+ n = n3;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n3 > (nc << 2)) {
+ nc = n3 >> 2;
+ makect(nc, ip, w + nw);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nt = n1;
+ if (nt < n2) {
+ nt = n2;
+ }
+ nt *= 8;
+#ifdef USE_FFT3D_THREADS
+ nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+ if (n3 == 4) {
+ nt >>= 1;
+ } else if (n3 < 4) {
+ nt >>= 2;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft3d_alloc_error_check(t);
+ }
+#ifdef USE_FFT3D_THREADS
+ if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+ if (isgn < 0) {
+ rdft3d_sub(n1, n2, n3, isgn, a);
+ cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
+ }
+ xdft3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
+ if (isgn >= 0) {
+ cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
+ rdft3d_sub(n1, n2, n3, isgn, a);
+ }
+ } else
+#endif /* USE_FFT3D_THREADS */
+ {
+ if (isgn < 0) {
+ rdft3d_sub(n1, n2, n3, isgn, a);
+ cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
+ }
+ xdft3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
+ if (isgn >= 0) {
+ cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
+ rdft3d_sub(n1, n2, n3, isgn, a);
+ }
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+void rdft3dsort(int n1, int n2, int n3, int isgn, double ***a)
+{
+ int n1h, n2h, i, j;
+ double x, y;
+
+ n1h = n1 >> 1;
+ n2h = n2 >> 1;
+ if (isgn < 0) {
+ for (i = 0; i < n1; i++) {
+ for (j = n2h + 1; j < n2; j++) {
+ a[i][j][0] = a[i][j][n3 + 1];
+ a[i][j][1] = a[i][j][n3];
+ }
+ }
+ for (i = n1h + 1; i < n1; i++) {
+ a[i][0][0] = a[i][0][n3 + 1];
+ a[i][0][1] = a[i][0][n3];
+ a[i][n2h][0] = a[i][n2h][n3 + 1];
+ a[i][n2h][1] = a[i][n2h][n3];
+ }
+ a[0][0][1] = a[0][0][n3];
+ a[0][n2h][1] = a[0][n2h][n3];
+ a[n1h][0][1] = a[n1h][0][n3];
+ a[n1h][n2h][1] = a[n1h][n2h][n3];
+ } else {
+ for (j = n2h + 1; j < n2; j++) {
+ y = a[0][j][0];
+ x = a[0][j][1];
+ a[0][j][n3] = x;
+ a[0][j][n3 + 1] = y;
+ a[0][n2 - j][n3] = x;
+ a[0][n2 - j][n3 + 1] = -y;
+ a[0][j][0] = a[0][n2 - j][0];
+ a[0][j][1] = -a[0][n2 - j][1];
+ }
+ for (i = 1; i < n1; i++) {
+ for (j = n2h + 1; j < n2; j++) {
+ y = a[i][j][0];
+ x = a[i][j][1];
+ a[i][j][n3] = x;
+ a[i][j][n3 + 1] = y;
+ a[n1 - i][n2 - j][n3] = x;
+ a[n1 - i][n2 - j][n3 + 1] = -y;
+ a[i][j][0] = a[n1 - i][n2 - j][0];
+ a[i][j][1] = -a[n1 - i][n2 - j][1];
+ }
+ }
+ for (i = n1h + 1; i < n1; i++) {
+ y = a[i][0][0];
+ x = a[i][0][1];
+ a[i][0][n3] = x;
+ a[i][0][n3 + 1] = y;
+ a[n1 - i][0][n3] = x;
+ a[n1 - i][0][n3 + 1] = -y;
+ a[i][0][0] = a[n1 - i][0][0];
+ a[i][0][1] = -a[n1 - i][0][1];
+ y = a[i][n2h][0];
+ x = a[i][n2h][1];
+ a[i][n2h][n3] = x;
+ a[i][n2h][n3 + 1] = y;
+ a[n1 - i][n2h][n3] = x;
+ a[n1 - i][n2h][n3 + 1] = -y;
+ a[i][n2h][0] = a[n1 - i][n2h][0];
+ a[i][n2h][1] = -a[n1 - i][n2h][1];
+ }
+ a[0][0][n3] = a[0][0][1];
+ a[0][0][n3 + 1] = 0;
+ a[0][0][1] = 0;
+ a[0][n2h][n3] = a[0][n2h][1];
+ a[0][n2h][n3 + 1] = 0;
+ a[0][n2h][1] = 0;
+ a[n1h][0][n3] = a[n1h][0][1];
+ a[n1h][0][n3 + 1] = 0;
+ a[n1h][0][1] = 0;
+ a[n1h][n2h][n3] = a[n1h][n2h][1];
+ a[n1h][n2h][n3 + 1] = 0;
+ a[n1h][n2h][1] = 0;
+ }
+}
+
+
+void ddct3d(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+#ifdef USE_FFT3D_THREADS
+ void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+ int n, nw, nc, itnull, nt;
+
+ n = n1;
+ if (n < n2) {
+ n = n2;
+ }
+ if (n < n3) {
+ n = n3;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > nc) {
+ nc = n;
+ makect(nc, ip, w + nw);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nt = n1;
+ if (nt < n2) {
+ nt = n2;
+ }
+ nt *= 4;
+#ifdef USE_FFT3D_THREADS
+ nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+ if (n3 == 2) {
+ nt >>= 1;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft3d_alloc_error_check(t);
+ }
+#ifdef USE_FFT3D_THREADS
+ if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+ ddxt3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
+ ddxt3db_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
+ } else
+#endif /* USE_FFT3D_THREADS */
+ {
+ ddxt3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
+ ddxt3db_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+void ddst3d(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w)
+{
+ void makewt(int nw, int *ip, double *w);
+ void makect(int nc, int *ip, double *c);
+ void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+#ifdef USE_FFT3D_THREADS
+ void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+ void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w);
+#endif /* USE_FFT3D_THREADS */
+ int n, nw, nc, itnull, nt;
+
+ n = n1;
+ if (n < n2) {
+ n = n2;
+ }
+ if (n < n3) {
+ n = n3;
+ }
+ nw = ip[0];
+ if (n > (nw << 2)) {
+ nw = n >> 2;
+ makewt(nw, ip, w);
+ }
+ nc = ip[1];
+ if (n > nc) {
+ nc = n;
+ makect(nc, ip, w + nw);
+ }
+ itnull = 0;
+ if (t == NULL) {
+ itnull = 1;
+ nt = n1;
+ if (nt < n2) {
+ nt = n2;
+ }
+ nt *= 4;
+#ifdef USE_FFT3D_THREADS
+ nt *= FFT3D_MAX_THREADS;
+#endif /* USE_FFT3D_THREADS */
+ if (n3 == 2) {
+ nt >>= 1;
+ }
+ t = (double *) malloc(sizeof(double) * nt);
+ fft3d_alloc_error_check(t);
+ }
+#ifdef USE_FFT3D_THREADS
+ if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
+ ddxt3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
+ ddxt3db_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
+ } else
+#endif /* USE_FFT3D_THREADS */
+ {
+ ddxt3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
+ ddxt3db_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
+ }
+ if (itnull != 0) {
+ free(t);
+ }
+}
+
+
+/* -------- child routines -------- */
+
+
+void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn,
+ double ***a, double *t, int *ip, double *w)
+{
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ void rdft(int n, int isgn, double *a, int *ip, double *w);
+ int i, j, k;
+
+ for (i = 0; i < n1; i++) {
+ if (icr == 0) {
+ for (j = 0; j < n2; j++) {
+ cdft(n3, isgn, a[i][j], ip, w);
+ }
+ } else if (isgn >= 0) {
+ for (j = 0; j < n2; j++) {
+ rdft(n3, isgn, a[i][j], ip, w);
+ }
+ }
+ if (n3 > 4) {
+ for (k = 0; k < n3; k += 8) {
+ for (j = 0; j < n2; j++) {
+ t[2 * j] = a[i][j][k];
+ t[2 * j + 1] = a[i][j][k + 1];
+ t[2 * n2 + 2 * j] = a[i][j][k + 2];
+ t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
+ t[4 * n2 + 2 * j] = a[i][j][k + 4];
+ t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
+ t[6 * n2 + 2 * j] = a[i][j][k + 6];
+ t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
+ }
+ cdft(2 * n2, isgn, t, ip, w);
+ cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+ cdft(2 * n2, isgn, &t[4 * n2], ip, w);
+ cdft(2 * n2, isgn, &t[6 * n2], ip, w);
+ for (j = 0; j < n2; j++) {
+ a[i][j][k] = t[2 * j];
+ a[i][j][k + 1] = t[2 * j + 1];
+ a[i][j][k + 2] = t[2 * n2 + 2 * j];
+ a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
+ a[i][j][k + 4] = t[4 * n2 + 2 * j];
+ a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
+ a[i][j][k + 6] = t[6 * n2 + 2 * j];
+ a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
+ }
+ }
+ } else if (n3 == 4) {
+ for (j = 0; j < n2; j++) {
+ t[2 * j] = a[i][j][0];
+ t[2 * j + 1] = a[i][j][1];
+ t[2 * n2 + 2 * j] = a[i][j][2];
+ t[2 * n2 + 2 * j + 1] = a[i][j][3];
+ }
+ cdft(2 * n2, isgn, t, ip, w);
+ cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+ for (j = 0; j < n2; j++) {
+ a[i][j][0] = t[2 * j];
+ a[i][j][1] = t[2 * j + 1];
+ a[i][j][2] = t[2 * n2 + 2 * j];
+ a[i][j][3] = t[2 * n2 + 2 * j + 1];
+ }
+ } else if (n3 == 2) {
+ for (j = 0; j < n2; j++) {
+ t[2 * j] = a[i][j][0];
+ t[2 * j + 1] = a[i][j][1];
+ }
+ cdft(2 * n2, isgn, t, ip, w);
+ for (j = 0; j < n2; j++) {
+ a[i][j][0] = t[2 * j];
+ a[i][j][1] = t[2 * j + 1];
+ }
+ }
+ if (icr != 0 && isgn < 0) {
+ for (j = 0; j < n2; j++) {
+ rdft(n3, isgn, a[i][j], ip, w);
+ }
+ }
+ }
+}
+
+
+void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w)
+{
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ int i, j, k;
+
+ if (n3 > 4) {
+ for (j = 0; j < n2; j++) {
+ for (k = 0; k < n3; k += 8) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j][k];
+ t[2 * i + 1] = a[i][j][k + 1];
+ t[2 * n1 + 2 * i] = a[i][j][k + 2];
+ t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
+ t[4 * n1 + 2 * i] = a[i][j][k + 4];
+ t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
+ t[6 * n1 + 2 * i] = a[i][j][k + 6];
+ t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j][k] = t[2 * i];
+ a[i][j][k + 1] = t[2 * i + 1];
+ a[i][j][k + 2] = t[2 * n1 + 2 * i];
+ a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
+ a[i][j][k + 4] = t[4 * n1 + 2 * i];
+ a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
+ a[i][j][k + 6] = t[6 * n1 + 2 * i];
+ a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
+ }
+ }
+ }
+ } else if (n3 == 4) {
+ for (j = 0; j < n2; j++) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j][0];
+ t[2 * i + 1] = a[i][j][1];
+ t[2 * n1 + 2 * i] = a[i][j][2];
+ t[2 * n1 + 2 * i + 1] = a[i][j][3];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j][0] = t[2 * i];
+ a[i][j][1] = t[2 * i + 1];
+ a[i][j][2] = t[2 * n1 + 2 * i];
+ a[i][j][3] = t[2 * n1 + 2 * i + 1];
+ }
+ }
+ } else if (n3 == 2) {
+ for (j = 0; j < n2; j++) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j][0];
+ t[2 * i + 1] = a[i][j][1];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j][0] = t[2 * i];
+ a[i][j][1] = t[2 * i + 1];
+ }
+ }
+ }
+}
+
+
+void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a)
+{
+ int n1h, n2h, i, j, k, l;
+ double xi;
+
+ n1h = n1 >> 1;
+ n2h = n2 >> 1;
+ if (isgn < 0) {
+ for (i = 1; i < n1h; i++) {
+ j = n1 - i;
+ xi = a[i][0][0] - a[j][0][0];
+ a[i][0][0] += a[j][0][0];
+ a[j][0][0] = xi;
+ xi = a[j][0][1] - a[i][0][1];
+ a[i][0][1] += a[j][0][1];
+ a[j][0][1] = xi;
+ xi = a[i][n2h][0] - a[j][n2h][0];
+ a[i][n2h][0] += a[j][n2h][0];
+ a[j][n2h][0] = xi;
+ xi = a[j][n2h][1] - a[i][n2h][1];
+ a[i][n2h][1] += a[j][n2h][1];
+ a[j][n2h][1] = xi;
+ for (k = 1; k < n2h; k++) {
+ l = n2 - k;
+ xi = a[i][k][0] - a[j][l][0];
+ a[i][k][0] += a[j][l][0];
+ a[j][l][0] = xi;
+ xi = a[j][l][1] - a[i][k][1];
+ a[i][k][1] += a[j][l][1];
+ a[j][l][1] = xi;
+ xi = a[j][k][0] - a[i][l][0];
+ a[j][k][0] += a[i][l][0];
+ a[i][l][0] = xi;
+ xi = a[i][l][1] - a[j][k][1];
+ a[j][k][1] += a[i][l][1];
+ a[i][l][1] = xi;
+ }
+ }
+ for (k = 1; k < n2h; k++) {
+ l = n2 - k;
+ xi = a[0][k][0] - a[0][l][0];
+ a[0][k][0] += a[0][l][0];
+ a[0][l][0] = xi;
+ xi = a[0][l][1] - a[0][k][1];
+ a[0][k][1] += a[0][l][1];
+ a[0][l][1] = xi;
+ xi = a[n1h][k][0] - a[n1h][l][0];
+ a[n1h][k][0] += a[n1h][l][0];
+ a[n1h][l][0] = xi;
+ xi = a[n1h][l][1] - a[n1h][k][1];
+ a[n1h][k][1] += a[n1h][l][1];
+ a[n1h][l][1] = xi;
+ }
+ } else {
+ for (i = 1; i < n1h; i++) {
+ j = n1 - i;
+ a[j][0][0] = 0.5 * (a[i][0][0] - a[j][0][0]);
+ a[i][0][0] -= a[j][0][0];
+ a[j][0][1] = 0.5 * (a[i][0][1] + a[j][0][1]);
+ a[i][0][1] -= a[j][0][1];
+ a[j][n2h][0] = 0.5 * (a[i][n2h][0] - a[j][n2h][0]);
+ a[i][n2h][0] -= a[j][n2h][0];
+ a[j][n2h][1] = 0.5 * (a[i][n2h][1] + a[j][n2h][1]);
+ a[i][n2h][1] -= a[j][n2h][1];
+ for (k = 1; k < n2h; k++) {
+ l = n2 - k;
+ a[j][l][0] = 0.5 * (a[i][k][0] - a[j][l][0]);
+ a[i][k][0] -= a[j][l][0];
+ a[j][l][1] = 0.5 * (a[i][k][1] + a[j][l][1]);
+ a[i][k][1] -= a[j][l][1];
+ a[i][l][0] = 0.5 * (a[j][k][0] - a[i][l][0]);
+ a[j][k][0] -= a[i][l][0];
+ a[i][l][1] = 0.5 * (a[j][k][1] + a[i][l][1]);
+ a[j][k][1] -= a[i][l][1];
+ }
+ }
+ for (k = 1; k < n2h; k++) {
+ l = n2 - k;
+ a[0][l][0] = 0.5 * (a[0][k][0] - a[0][l][0]);
+ a[0][k][0] -= a[0][l][0];
+ a[0][l][1] = 0.5 * (a[0][k][1] + a[0][l][1]);
+ a[0][k][1] -= a[0][l][1];
+ a[n1h][l][0] = 0.5 * (a[n1h][k][0] - a[n1h][l][0]);
+ a[n1h][k][0] -= a[n1h][l][0];
+ a[n1h][l][1] = 0.5 * (a[n1h][k][1] + a[n1h][l][1]);
+ a[n1h][k][1] -= a[n1h][l][1];
+ }
+ }
+}
+
+
+void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w)
+{
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ int i, j, k;
+
+ for (i = 0; i < n1; i++) {
+ if (ics == 0) {
+ for (j = 0; j < n2; j++) {
+ ddct(n3, isgn, a[i][j], ip, w);
+ }
+ } else {
+ for (j = 0; j < n2; j++) {
+ ddst(n3, isgn, a[i][j], ip, w);
+ }
+ }
+ if (n3 > 2) {
+ for (k = 0; k < n3; k += 4) {
+ for (j = 0; j < n2; j++) {
+ t[j] = a[i][j][k];
+ t[n2 + j] = a[i][j][k + 1];
+ t[2 * n2 + j] = a[i][j][k + 2];
+ t[3 * n2 + j] = a[i][j][k + 3];
+ }
+ if (ics == 0) {
+ ddct(n2, isgn, t, ip, w);
+ ddct(n2, isgn, &t[n2], ip, w);
+ ddct(n2, isgn, &t[2 * n2], ip, w);
+ ddct(n2, isgn, &t[3 * n2], ip, w);
+ } else {
+ ddst(n2, isgn, t, ip, w);
+ ddst(n2, isgn, &t[n2], ip, w);
+ ddst(n2, isgn, &t[2 * n2], ip, w);
+ ddst(n2, isgn, &t[3 * n2], ip, w);
+ }
+ for (j = 0; j < n2; j++) {
+ a[i][j][k] = t[j];
+ a[i][j][k + 1] = t[n2 + j];
+ a[i][j][k + 2] = t[2 * n2 + j];
+ a[i][j][k + 3] = t[3 * n2 + j];
+ }
+ }
+ } else if (n3 == 2) {
+ for (j = 0; j < n2; j++) {
+ t[j] = a[i][j][0];
+ t[n2 + j] = a[i][j][1];
+ }
+ if (ics == 0) {
+ ddct(n2, isgn, t, ip, w);
+ ddct(n2, isgn, &t[n2], ip, w);
+ } else {
+ ddst(n2, isgn, t, ip, w);
+ ddst(n2, isgn, &t[n2], ip, w);
+ }
+ for (j = 0; j < n2; j++) {
+ a[i][j][0] = t[j];
+ a[i][j][1] = t[n2 + j];
+ }
+ }
+ }
+}
+
+
+void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w)
+{
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ int i, j, k;
+
+ if (n3 > 2) {
+ for (j = 0; j < n2; j++) {
+ for (k = 0; k < n3; k += 4) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][j][k];
+ t[n1 + i] = a[i][j][k + 1];
+ t[2 * n1 + i] = a[i][j][k + 2];
+ t[3 * n1 + i] = a[i][j][k + 3];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ ddct(n1, isgn, &t[2 * n1], ip, w);
+ ddct(n1, isgn, &t[3 * n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ ddst(n1, isgn, &t[2 * n1], ip, w);
+ ddst(n1, isgn, &t[3 * n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][j][k] = t[i];
+ a[i][j][k + 1] = t[n1 + i];
+ a[i][j][k + 2] = t[2 * n1 + i];
+ a[i][j][k + 3] = t[3 * n1 + i];
+ }
+ }
+ }
+ } else if (n3 == 2) {
+ for (j = 0; j < n2; j++) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][j][0];
+ t[n1 + i] = a[i][j][1];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][j][0] = t[i];
+ a[i][j][1] = t[n1 + i];
+ }
+ }
+ }
+}
+
+
+#ifdef USE_FFT3D_THREADS
+struct fft3d_arg_st {
+ int nthread;
+ int n0;
+ int n1;
+ int n2;
+ int n3;
+ int ic;
+ int isgn;
+ double ***a;
+ double *t;
+ int *ip;
+ double *w;
+};
+typedef struct fft3d_arg_st fft3d_arg_t;
+
+
+void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn,
+ double ***a, double *t, int *ip, double *w)
+{
+ void *xdft3da_th(void *p);
+ fft3d_thread_t th[FFT3D_MAX_THREADS];
+ fft3d_arg_t ag[FFT3D_MAX_THREADS];
+ int nthread, nt, i;
+
+ nthread = FFT3D_MAX_THREADS;
+ if (nthread > n1) {
+ nthread = n1;
+ }
+ nt = 8 * n2;
+ if (n3 == 4) {
+ nt >>= 1;
+ } else if (n3 < 4) {
+ nt >>= 2;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].n3 = n3;
+ ag[i].ic = icr;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].t = &t[nt * i];
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft3d_thread_create(&th[i], xdft3da_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft3d_thread_wait(th[i]);
+ }
+}
+
+
+void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a,
+ double *t, int *ip, double *w)
+{
+ void *cdft3db_th(void *p);
+ fft3d_thread_t th[FFT3D_MAX_THREADS];
+ fft3d_arg_t ag[FFT3D_MAX_THREADS];
+ int nthread, nt, i;
+
+ nthread = FFT3D_MAX_THREADS;
+ if (nthread > n2) {
+ nthread = n2;
+ }
+ nt = 8 * n1;
+ if (n3 == 4) {
+ nt >>= 1;
+ } else if (n3 < 4) {
+ nt >>= 2;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].n3 = n3;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].t = &t[nt * i];
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft3d_thread_create(&th[i], cdft3db_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft3d_thread_wait(th[i]);
+ }
+}
+
+
+void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w)
+{
+ void *ddxt3da_th(void *p);
+ fft3d_thread_t th[FFT3D_MAX_THREADS];
+ fft3d_arg_t ag[FFT3D_MAX_THREADS];
+ int nthread, nt, i;
+
+ nthread = FFT3D_MAX_THREADS;
+ if (nthread > n1) {
+ nthread = n1;
+ }
+ nt = 4 * n2;
+ if (n3 == 2) {
+ nt >>= 1;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].n3 = n3;
+ ag[i].ic = ics;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].t = &t[nt * i];
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft3d_thread_create(&th[i], ddxt3da_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft3d_thread_wait(th[i]);
+ }
+}
+
+
+void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn,
+ double ***a, double *t, int *ip, double *w)
+{
+ void *ddxt3db_th(void *p);
+ fft3d_thread_t th[FFT3D_MAX_THREADS];
+ fft3d_arg_t ag[FFT3D_MAX_THREADS];
+ int nthread, nt, i;
+
+ nthread = FFT3D_MAX_THREADS;
+ if (nthread > n2) {
+ nthread = n2;
+ }
+ nt = 4 * n1;
+ if (n3 == 2) {
+ nt >>= 1;
+ }
+ for (i = 0; i < nthread; i++) {
+ ag[i].nthread = nthread;
+ ag[i].n0 = i;
+ ag[i].n1 = n1;
+ ag[i].n2 = n2;
+ ag[i].n3 = n3;
+ ag[i].ic = ics;
+ ag[i].isgn = isgn;
+ ag[i].a = a;
+ ag[i].t = &t[nt * i];
+ ag[i].ip = ip;
+ ag[i].w = w;
+ fft3d_thread_create(&th[i], ddxt3db_th, &ag[i]);
+ }
+ for (i = 0; i < nthread; i++) {
+ fft3d_thread_wait(th[i]);
+ }
+}
+
+
+void *xdft3da_th(void *p)
+{
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ void rdft(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, n3, icr, isgn, *ip, i, j, k;
+ double ***a, *t, *w;
+
+ nthread = ((fft3d_arg_t *) p)->nthread;
+ n0 = ((fft3d_arg_t *) p)->n0;
+ n1 = ((fft3d_arg_t *) p)->n1;
+ n2 = ((fft3d_arg_t *) p)->n2;
+ n3 = ((fft3d_arg_t *) p)->n3;
+ icr = ((fft3d_arg_t *) p)->ic;
+ isgn = ((fft3d_arg_t *) p)->isgn;
+ a = ((fft3d_arg_t *) p)->a;
+ t = ((fft3d_arg_t *) p)->t;
+ ip = ((fft3d_arg_t *) p)->ip;
+ w = ((fft3d_arg_t *) p)->w;
+ for (i = n0; i < n1; i += nthread) {
+ if (icr == 0) {
+ for (j = 0; j < n2; j++) {
+ cdft(n3, isgn, a[i][j], ip, w);
+ }
+ } else if (isgn >= 0) {
+ for (j = 0; j < n2; j++) {
+ rdft(n3, isgn, a[i][j], ip, w);
+ }
+ }
+ if (n3 > 4) {
+ for (k = 0; k < n3; k += 8) {
+ for (j = 0; j < n2; j++) {
+ t[2 * j] = a[i][j][k];
+ t[2 * j + 1] = a[i][j][k + 1];
+ t[2 * n2 + 2 * j] = a[i][j][k + 2];
+ t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
+ t[4 * n2 + 2 * j] = a[i][j][k + 4];
+ t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
+ t[6 * n2 + 2 * j] = a[i][j][k + 6];
+ t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
+ }
+ cdft(2 * n2, isgn, t, ip, w);
+ cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+ cdft(2 * n2, isgn, &t[4 * n2], ip, w);
+ cdft(2 * n2, isgn, &t[6 * n2], ip, w);
+ for (j = 0; j < n2; j++) {
+ a[i][j][k] = t[2 * j];
+ a[i][j][k + 1] = t[2 * j + 1];
+ a[i][j][k + 2] = t[2 * n2 + 2 * j];
+ a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
+ a[i][j][k + 4] = t[4 * n2 + 2 * j];
+ a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
+ a[i][j][k + 6] = t[6 * n2 + 2 * j];
+ a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
+ }
+ }
+ } else if (n3 == 4) {
+ for (j = 0; j < n2; j++) {
+ t[2 * j] = a[i][j][0];
+ t[2 * j + 1] = a[i][j][1];
+ t[2 * n2 + 2 * j] = a[i][j][2];
+ t[2 * n2 + 2 * j + 1] = a[i][j][3];
+ }
+ cdft(2 * n2, isgn, t, ip, w);
+ cdft(2 * n2, isgn, &t[2 * n2], ip, w);
+ for (j = 0; j < n2; j++) {
+ a[i][j][0] = t[2 * j];
+ a[i][j][1] = t[2 * j + 1];
+ a[i][j][2] = t[2 * n2 + 2 * j];
+ a[i][j][3] = t[2 * n2 + 2 * j + 1];
+ }
+ } else if (n3 == 2) {
+ for (j = 0; j < n2; j++) {
+ t[2 * j] = a[i][j][0];
+ t[2 * j + 1] = a[i][j][1];
+ }
+ cdft(2 * n2, isgn, t, ip, w);
+ for (j = 0; j < n2; j++) {
+ a[i][j][0] = t[2 * j];
+ a[i][j][1] = t[2 * j + 1];
+ }
+ }
+ if (icr != 0 && isgn < 0) {
+ for (j = 0; j < n2; j++) {
+ rdft(n3, isgn, a[i][j], ip, w);
+ }
+ }
+ }
+ return (void *) 0;
+}
+
+
+void *cdft3db_th(void *p)
+{
+ void cdft(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, n3, isgn, *ip, i, j, k;
+ double ***a, *t, *w;
+
+ nthread = ((fft3d_arg_t *) p)->nthread;
+ n0 = ((fft3d_arg_t *) p)->n0;
+ n1 = ((fft3d_arg_t *) p)->n1;
+ n2 = ((fft3d_arg_t *) p)->n2;
+ n3 = ((fft3d_arg_t *) p)->n3;
+ isgn = ((fft3d_arg_t *) p)->isgn;
+ a = ((fft3d_arg_t *) p)->a;
+ t = ((fft3d_arg_t *) p)->t;
+ ip = ((fft3d_arg_t *) p)->ip;
+ w = ((fft3d_arg_t *) p)->w;
+ if (n3 > 4) {
+ for (j = n0; j < n2; j += nthread) {
+ for (k = 0; k < n3; k += 8) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j][k];
+ t[2 * i + 1] = a[i][j][k + 1];
+ t[2 * n1 + 2 * i] = a[i][j][k + 2];
+ t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
+ t[4 * n1 + 2 * i] = a[i][j][k + 4];
+ t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
+ t[6 * n1 + 2 * i] = a[i][j][k + 6];
+ t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+ cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j][k] = t[2 * i];
+ a[i][j][k + 1] = t[2 * i + 1];
+ a[i][j][k + 2] = t[2 * n1 + 2 * i];
+ a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
+ a[i][j][k + 4] = t[4 * n1 + 2 * i];
+ a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
+ a[i][j][k + 6] = t[6 * n1 + 2 * i];
+ a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
+ }
+ }
+ }
+ } else if (n3 == 4) {
+ for (j = n0; j < n2; j += nthread) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j][0];
+ t[2 * i + 1] = a[i][j][1];
+ t[2 * n1 + 2 * i] = a[i][j][2];
+ t[2 * n1 + 2 * i + 1] = a[i][j][3];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j][0] = t[2 * i];
+ a[i][j][1] = t[2 * i + 1];
+ a[i][j][2] = t[2 * n1 + 2 * i];
+ a[i][j][3] = t[2 * n1 + 2 * i + 1];
+ }
+ }
+ } else if (n3 == 2) {
+ for (j = n0; j < n2; j += nthread) {
+ for (i = 0; i < n1; i++) {
+ t[2 * i] = a[i][j][0];
+ t[2 * i + 1] = a[i][j][1];
+ }
+ cdft(2 * n1, isgn, t, ip, w);
+ for (i = 0; i < n1; i++) {
+ a[i][j][0] = t[2 * i];
+ a[i][j][1] = t[2 * i + 1];
+ }
+ }
+ }
+ return (void *) 0;
+}
+
+
+void *ddxt3da_th(void *p)
+{
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
+ double ***a, *t, *w;
+
+ nthread = ((fft3d_arg_t *) p)->nthread;
+ n0 = ((fft3d_arg_t *) p)->n0;
+ n1 = ((fft3d_arg_t *) p)->n1;
+ n2 = ((fft3d_arg_t *) p)->n2;
+ n3 = ((fft3d_arg_t *) p)->n3;
+ ics = ((fft3d_arg_t *) p)->ic;
+ isgn = ((fft3d_arg_t *) p)->isgn;
+ a = ((fft3d_arg_t *) p)->a;
+ t = ((fft3d_arg_t *) p)->t;
+ ip = ((fft3d_arg_t *) p)->ip;
+ w = ((fft3d_arg_t *) p)->w;
+ for (i = n0; i < n1; i += nthread) {
+ if (ics == 0) {
+ for (j = 0; j < n2; j++) {
+ ddct(n3, isgn, a[i][j], ip, w);
+ }
+ } else {
+ for (j = 0; j < n2; j++) {
+ ddst(n3, isgn, a[i][j], ip, w);
+ }
+ }
+ if (n3 > 2) {
+ for (k = 0; k < n3; k += 4) {
+ for (j = 0; j < n2; j++) {
+ t[j] = a[i][j][k];
+ t[n2 + j] = a[i][j][k + 1];
+ t[2 * n2 + j] = a[i][j][k + 2];
+ t[3 * n2 + j] = a[i][j][k + 3];
+ }
+ if (ics == 0) {
+ ddct(n2, isgn, t, ip, w);
+ ddct(n2, isgn, &t[n2], ip, w);
+ ddct(n2, isgn, &t[2 * n2], ip, w);
+ ddct(n2, isgn, &t[3 * n2], ip, w);
+ } else {
+ ddst(n2, isgn, t, ip, w);
+ ddst(n2, isgn, &t[n2], ip, w);
+ ddst(n2, isgn, &t[2 * n2], ip, w);
+ ddst(n2, isgn, &t[3 * n2], ip, w);
+ }
+ for (j = 0; j < n2; j++) {
+ a[i][j][k] = t[j];
+ a[i][j][k + 1] = t[n2 + j];
+ a[i][j][k + 2] = t[2 * n2 + j];
+ a[i][j][k + 3] = t[3 * n2 + j];
+ }
+ }
+ } else if (n3 == 2) {
+ for (j = 0; j < n2; j++) {
+ t[j] = a[i][j][0];
+ t[n2 + j] = a[i][j][1];
+ }
+ if (ics == 0) {
+ ddct(n2, isgn, t, ip, w);
+ ddct(n2, isgn, &t[n2], ip, w);
+ } else {
+ ddst(n2, isgn, t, ip, w);
+ ddst(n2, isgn, &t[n2], ip, w);
+ }
+ for (j = 0; j < n2; j++) {
+ a[i][j][0] = t[j];
+ a[i][j][1] = t[n2 + j];
+ }
+ }
+ }
+ return (void *) 0;
+}
+
+
+void *ddxt3db_th(void *p)
+{
+ void ddct(int n, int isgn, double *a, int *ip, double *w);
+ void ddst(int n, int isgn, double *a, int *ip, double *w);
+ int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
+ double ***a, *t, *w;
+
+ nthread = ((fft3d_arg_t *) p)->nthread;
+ n0 = ((fft3d_arg_t *) p)->n0;
+ n1 = ((fft3d_arg_t *) p)->n1;
+ n2 = ((fft3d_arg_t *) p)->n2;
+ n3 = ((fft3d_arg_t *) p)->n3;
+ ics = ((fft3d_arg_t *) p)->ic;
+ isgn = ((fft3d_arg_t *) p)->isgn;
+ a = ((fft3d_arg_t *) p)->a;
+ t = ((fft3d_arg_t *) p)->t;
+ ip = ((fft3d_arg_t *) p)->ip;
+ w = ((fft3d_arg_t *) p)->w;
+ if (n3 > 2) {
+ for (j = n0; j < n2; j += nthread) {
+ for (k = 0; k < n3; k += 4) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][j][k];
+ t[n1 + i] = a[i][j][k + 1];
+ t[2 * n1 + i] = a[i][j][k + 2];
+ t[3 * n1 + i] = a[i][j][k + 3];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ ddct(n1, isgn, &t[2 * n1], ip, w);
+ ddct(n1, isgn, &t[3 * n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ ddst(n1, isgn, &t[2 * n1], ip, w);
+ ddst(n1, isgn, &t[3 * n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][j][k] = t[i];
+ a[i][j][k + 1] = t[n1 + i];
+ a[i][j][k + 2] = t[2 * n1 + i];
+ a[i][j][k + 3] = t[3 * n1 + i];
+ }
+ }
+ }
+ } else if (n3 == 2) {
+ for (j = n0; j < n2; j += nthread) {
+ for (i = 0; i < n1; i++) {
+ t[i] = a[i][j][0];
+ t[n1 + i] = a[i][j][1];
+ }
+ if (ics == 0) {
+ ddct(n1, isgn, t, ip, w);
+ ddct(n1, isgn, &t[n1], ip, w);
+ } else {
+ ddst(n1, isgn, t, ip, w);
+ ddst(n1, isgn, &t[n1], ip, w);
+ }
+ for (i = 0; i < n1; i++) {
+ a[i][j][0] = t[i];
+ a[i][j][1] = t[n1 + i];
+ }
+ }
+ }
+ return (void *) 0;
+}
+#endif /* USE_FFT3D_THREADS */
+
diff --git a/src/fft2d/fft2d/fftsg3d.f b/src/fft2d/fft2d/fftsg3d.f
new file mode 100644
index 0000000..350bbef
--- /dev/null
+++ b/src/fft2d/fft2d/fftsg3d.f
@@ -0,0 +1,926 @@
+! Fast Fourier/Cosine/Sine Transform
+! dimension :three
+! data length :power of 2
+! decimation :frequency
+! radix :split-radix, row-column
+! data :inplace
+! table :use
+! subroutines
+! cdft3d: Complex Discrete Fourier Transform
+! rdft3d: Real Discrete Fourier Transform
+! ddct3d: Discrete Cosine Transform
+! ddst3d: Discrete Sine Transform
+! necessary package
+! fftsg.f : 1D-FFT package
+!
+!
+! -------- Complex DFT (Discrete Fourier Transform) --------
+! [definition]
+! <case1>
+! X(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! x(j1,j2,j3) *
+! exp(2*pi*i*j1*k1/n1) *
+! exp(2*pi*i*j2*k2/n2) *
+! exp(2*pi*i*j3*k3/n3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! <case2>
+! X(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! x(j1,j2,j3) *
+! exp(-2*pi*i*j1*k1/n1) *
+! exp(-2*pi*i*j2*k2/n2) *
+! exp(-2*pi*i*j3*k3/n3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call cdft3d(n1max, n2max, 2*n1, n2, n3, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call cdft3d(n1max, n2max, 2*n1, n2, n3, -1, a, t, ip, w)
+! [parameters]
+! n1max :row1 size of the 3D array (integer)
+! n2max :row2 size of the 3D array (integer)
+! 2*n1 :data length (integer)
+! n1 >= 1, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 1, n2 = power of 2
+! n3 :data length (integer)
+! n3 >= 1, n3 = power of 2
+! a(0:2*n1-1,0:n2-1,0:n3-1)
+! :input/output data (real*8)
+! input data
+! a(2*j1,j2,j3) = Re(x(j1,j2,j3)),
+! a(2*j1+1,j2,j3) = Im(x(j1,j2,j3)),
+! 0<=j1<n1, 0<=j2<n2, 0<=j3<n3
+! output data
+! a(2*k1,k2,k3) = Re(X(k1,k2,k3)),
+! a(2*k1+1,k2,k3) = Im(X(k1,k2,k3)),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! t(0:*) :work area (real*8)
+! length of t >= max(8*n2, 8*n3)
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1, n2, n3))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/2, n2/2, n3/2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call cdft3d(n1max, n2max, 2*n1, n2, n3, -1, a, t, ip, w)
+! is
+! call cdft3d(n1max, n2max, 2*n1, n2, n3, 1, a, t, ip, w)
+! do j3 = 0, n3 - 1
+! do j2 = 0, n2 - 1
+! do j1 = 0, 2 * n1 - 1
+! a(j1,j2,j3) = a(j1,j2,j3) * (1.0d0/n1/n2/n3)
+! end do
+! end do
+! end do
+! .
+!
+!
+! -------- Real DFT / Inverse of Real DFT --------
+! [definition]
+! <case1> RDFT
+! R(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! a(j1,j2,j3) *
+! cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+! 2*pi*j3*k3/n3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! I(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! a(j1,j2,j3) *
+! sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+! 2*pi*j3*k3/n3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! <case2> IRDFT (excluding scale)
+! a(k1,k2,k3) = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! (R(j1,j2,j3) *
+! cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+! 2*pi*j3*k3/n3) +
+! I(j1,j2,j3) *
+! sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
+! 2*pi*j3*k3/n3)),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! (notes: R(mod(n1-k1,n1),mod(n2-k2,n2),mod(n3-k3,n3)) = R(k1,k2,k3),
+! I(mod(n1-k1,n1),mod(n2-k2,n2),mod(n3-k3,n3)) = -I(k1,k2,k3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3)
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+! [parameters]
+! n1max :row1 size of the 3D array (integer)
+! n2max :row2 size of the 3D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! n3 :data length (integer)
+! n3 >= 2, n3 = power of 2
+! a(0:n1-1,0:n2-1,0:n3-1)
+! :input/output data (real*8)
+! <case1>
+! output data
+! a(2*k1,k2,k3) = R(k1,k2,k3)
+! = R(n1-k1,mod(n2-k2,n2),mod(n3-k3,n3)),
+! a(2*k1+1,k2,k3) = I(k1,k2,k3)
+! = -I(n1-k1,mod(n2-k2,n2),mod(n3-k3,n3)),
+! 0<k1<n1/2, 0<=k2<n2, 0<=k3<n3,
+! a(0,k2,k3) = R(0,k2,k3)
+! = R(0,n2-k2,mod(n3-k3,n3)),
+! a(1,k2,k3) = I(0,k2,k3)
+! = -I(0,n2-k2,mod(n3-k3,n3)),
+! a(1,n2-k2,k3) = R(n1/2,k2,k3)
+! = R(n1/2,n2-k2,mod(n3-k3,n3)),
+! a(0,n2-k2,k3) = -I(n1/2,k2,k3)
+! = I(n1/2,n2-k2,mod(n3-k3,n3)),
+! 0<k2<n2/2, 0<=k3<n3,
+! a(0,0,k3) = R(0,0,k3)
+! = R(0,0,n3-k3),
+! a(1,0,k3) = I(0,0,k3)
+! = -I(0,0,n3-k3),
+! a(0,n2/2,k3) = R(0,n2/2,k3)
+! = R(0,n2/2,n3-k3),
+! a(1,n2/2,k3) = I(0,n2/2,k3)
+! = -I(0,n2/2,n3-k3),
+! a(1,0,n3-k3) = R(n1/2,0,k3)
+! = R(n1/2,0,n3-k3),
+! a(0,0,n3-k3) = -I(n1/2,0,k3)
+! = I(n1/2,0,n3-k3),
+! a(1,n2/2,n3-k3) = R(n1/2,n2/2,k3)
+! = R(n1/2,n2/2,n3-k3),
+! a(0,n2/2,n3-k3) = -I(n1/2,n2/2,k3)
+! = I(n1/2,n2/2,n3-k3),
+! 0<k3<n3/2,
+! a(0,0,0) = R(0,0,0),
+! a(1,0,0) = R(n1/2,0,0),
+! a(0,0,n3/2) = R(0,0,n3/2),
+! a(1,0,n3/2) = R(n1/2,0,n3/2),
+! a(0,n2/2,0) = R(0,n2/2,0),
+! a(1,n2/2,0) = R(n1/2,n2/2,0),
+! a(0,n2/2,n3/2) = R(0,n2/2,n3/2),
+! a(1,n2/2,n3/2) = R(n1/2,n2/2,n3/2)
+! <case2>
+! input data
+! a(2*j1,j2,j3) = R(j1,j2,j3)
+! = R(n1-j1,mod(n2-j2,n2),mod(n3-j3,n3)),
+! a(2*j1+1,j2,j3) = I(j1,j2,j3)
+! = -I(n1-j1,mod(n2-j2,n2),mod(n3-j3,n3)),
+! 0<j1<n1/2, 0<=j2<n2, 0<=j3<n3,
+! a(0,j2,j3) = R(0,j2,j3)
+! = R(0,n2-j2,mod(n3-j3,n3)),
+! a(1,j2,j3) = I(0,j2,j3)
+! = -I(0,n2-j2,mod(n3-j3,n3)),
+! a(1,n2-j2,j3) = R(n1/2,j2,j3)
+! = R(n1/2,n2-j2,mod(n3-j3,n3)),
+! a(0,n2-j2,j3) = -I(n1/2,j2,j3)
+! = I(n1/2,n2-j2,mod(n3-j3,n3)),
+! 0<j2<n2/2, 0<=j3<n3,
+! a(0,0,j3) = R(0,0,j3)
+! = R(0,0,n3-j3),
+! a(1,0,j3) = I(0,0,j3)
+! = -I(0,0,n3-j3),
+! a(0,n2/2,j3) = R(0,n2/2,j3)
+! = R(0,n2/2,n3-j3),
+! a(1,n2/2,j3) = I(0,n2/2,j3)
+! = -I(0,n2/2,n3-j3),
+! a(1,0,n3-j3) = R(n1/2,0,j3)
+! = R(n1/2,0,n3-j3),
+! a(0,0,n3-j3) = -I(n1/2,0,j3)
+! = I(n1/2,0,n3-j3),
+! a(1,n2/2,n3-j3) = R(n1/2,n2/2,j3)
+! = R(n1/2,n2/2,n3-j3),
+! a(0,n2/2,n3-j3) = -I(n1/2,n2/2,j3)
+! = I(n1/2,n2/2,n3-j3),
+! 0<j3<n3/2,
+! a(0,0,0) = R(0,0,0),
+! a(1,0,0) = R(n1/2,0,0),
+! a(0,0,n3/2) = R(0,0,n3/2),
+! a(1,0,n3/2) = R(n1/2,0,n3/2),
+! a(0,n2/2,0) = R(0,n2/2,0),
+! a(1,n2/2,0) = R(n1/2,n2/2,0),
+! a(0,n2/2,n3/2) = R(0,n2/2,n3/2),
+! a(1,n2/2,n3/2) = R(n1/2,n2/2,n3/2)
+! ---- output ordering ----
+! call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+! call rdft3dsort(n1max, n2max, n1, n2, n3, 1, a)
+! ! stored data is a(0:n1-1,0:n2-1,0:n3+1):
+! ! a(2*k1,k2,k3) = R(k1,k2,k3),
+! ! a(2*k1+1,k2,k3) = I(k1,k2,k3),
+! ! 0<=k1<=n1/2, 0<=k2<n2, 0<=k3<n3.
+! ! the stored data is larger than the input data!
+! ---- input ordering ----
+! call rdft3dsort(n1max, n2max, n1, n2, n3, -1, a)
+! call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+! t(0:*) :work area (real*8)
+! length of t >= max(8*n2, 8*n3)
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2, n3))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1/4, n2/2, n3/2) + n1/4
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call rdft3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+! is
+! call rdft3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+! do j3 = 0, n3 - 1
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1,j2,j3) = a(j1,j2,j3) * (2.0d0/n1/n2/n3)
+! end do
+! end do
+! end do
+! .
+!
+!
+! -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+! [definition]
+! <case1> IDCT (excluding scale)
+! C(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! a(j1,j2,j3) *
+! cos(pi*j1*(k1+1/2)/n1) *
+! cos(pi*j2*(k2+1/2)/n2) *
+! cos(pi*j3*(k3+1/2)/n3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! <case2> DCT
+! C(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! a(j1,j2,j3) *
+! cos(pi*(j1+1/2)*k1/n1) *
+! cos(pi*(j2+1/2)*k2/n2) *
+! cos(pi*(j3+1/2)*k3/n3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddct3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddct3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+! [parameters]
+! n1max :row1 size of the 3D array (integer)
+! n2max :row2 size of the 3D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! n3 :data length (integer)
+! n3 >= 2, n3 = power of 2
+! a(0:n1-1,0:n2-1,0:n3-1)
+! :input/output data (real*8)
+! output data
+! a(k1,k2,k3) = C(k1,k2,k3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! t(0:*) :work area (real*8)
+! length of t >= max(4*n2, 4*n3)
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2/2, n3/2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddct3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+! is
+! do j3 = 0, n3 - 1
+! do j2 = 0, n2 - 1
+! a(0, j2, j3) = a(0, j2, j3) * 0.5d0
+! end do
+! do j1 = 0, n1 - 1
+! a(j1, 0, j3) = a(j1, 0, j3) * 0.5d0
+! end do
+! end do
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2, 0) = a(j1, j2, 0) * 0.5d0
+! end do
+! end do
+! call ddct3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+! do j3 = 0, n3 - 1
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1,j2,j3) = a(j1,j2,j3) * (8.0d0/n1/n2/n3)
+! end do
+! end do
+! end do
+! .
+!
+!
+! -------- DST (Discrete Sine Transform) / Inverse of DST --------
+! [definition]
+! <case1> IDST (excluding scale)
+! S(k1,k2,k3) = sum_j1=1^n1 sum_j2=1^n2 sum_j3=1^n3
+! A(j1,j2,j3) *
+! sin(pi*j1*(k1+1/2)/n1) *
+! sin(pi*j2*(k2+1/2)/n2) *
+! sin(pi*j3*(k3+1/2)/n3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! <case2> DST
+! S(k1,k2,k3) = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
+! a(j1,j2,j3) *
+! sin(pi*(j1+1/2)*k1/n1) *
+! sin(pi*(j2+1/2)*k2/n2) *
+! sin(pi*(j3+1/2)*k3/n3),
+! 0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+! [usage]
+! <case1>
+! ip(0) = 0 ! first time only
+! call ddst3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+! <case2>
+! ip(0) = 0 ! first time only
+! call ddst3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+! [parameters]
+! n1max :row1 size of the 3D array (integer)
+! n2max :row2 size of the 3D array (integer)
+! n1 :data length (integer)
+! n1 >= 2, n1 = power of 2
+! n2 :data length (integer)
+! n2 >= 2, n2 = power of 2
+! n3 :data length (integer)
+! n3 >= 2, n3 = power of 2
+! a(0:n1-1,0:n2-1,0:n3-1)
+! :input/output data (real*8)
+! <case1>
+! input data
+! a(mod(j1,n1),mod(j2,n2),mod(j3,n3)) = A(j1,j2,j3),
+! 0<j1<=n1, 0<j2<=n2, 0<j3<=n3
+! output data
+! a(k1,k2,k3) = S(k1,k2,k3),
+! 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
+! <case2>
+! output data
+! a(mod(k1,n1),mod(k2,n2),mod(k3,n3)) = S(k1,k2,k3),
+! 0<k1<=n1, 0<k2<=n2, 0<k3<=n3
+! t(0:*) :work area (real*8)
+! length of t >= max(4*n2, 4*n3)
+! ip(0:*):work area for bit reversal (integer)
+! length of ip >= 2+sqrt(n)
+! (n = max(n1/2, n2/2, n3/2))
+! ip(0),ip(1) are pointers of the cos/sin table.
+! w(0:*) :cos/sin table (real*8)
+! length of w >= max(n1*3/2, n2*3/2, n3*3/2)
+! w(),ip() are initialized if ip(0) = 0.
+! [remark]
+! Inverse of
+! call ddst3d(n1max, n2max, n1, n2, n3, -1, a, t, ip, w)
+! is
+! do j3 = 0, n3 - 1
+! do j2 = 0, n2 - 1
+! a(0, j2, j3) = a(0, j2, j3) * 0.5d0
+! end do
+! do j1 = 0, n1 - 1
+! a(j1, 0, j3) = a(j1, 0, j3) * 0.5d0
+! end do
+! end do
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1, j2, 0) = a(j1, j2, 0) * 0.5d0
+! end do
+! end do
+! call ddst3d(n1max, n2max, n1, n2, n3, 1, a, t, ip, w)
+! do j3 = 0, n3 - 1
+! do j2 = 0, n2 - 1
+! do j1 = 0, n1 - 1
+! a(j1,j2,j3) = a(j1,j2,j3) * (8.0d0/n1/n2/n3)
+! end do
+! end do
+! end do
+! .
+!
+!
+ subroutine cdft3d(n1max, n2max, n1, n2, n3, isgn, a,
+ & t, ip, w)
+ integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *), n
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ n = 2 * max(n2, n3)
+ n = max(n, n1)
+ if (n .gt. 4 * ip(0)) then
+ call makewt(n / 4, ip, w)
+ end if
+ call xdft3da_sub(n1max, n2max, n1, n2, n3, 0,
+ & isgn, a, t, ip, w)
+ call cdft3db_sub(n1max, n2max, n1, n2, n3,
+ & isgn, a, t, ip, w)
+ end
+!
+ subroutine rdft3d(n1max, n2max, n1, n2, n3, isgn, a,
+ & t, ip, w)
+ integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *),
+ & n, nw, nc
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ n = 2 * max(n2, n3)
+ n = max(n, n1)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n1 .gt. 4 * nc) then
+ nc = n1 / 4
+ call makect(nc, ip, w(nw))
+ end if
+ if (isgn .lt. 0) then
+ call rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
+ call cdft3db_sub(n1max, n2max, n1, n2, n3,
+ & isgn, a, t, ip, w)
+ call xdft3da_sub(n1max, n2max, n1, n2, n3, 1,
+ & isgn, a, t, ip, w)
+ else
+ call xdft3da_sub(n1max, n2max, n1, n2, n3, 1,
+ & isgn, a, t, ip, w)
+ call cdft3db_sub(n1max, n2max, n1, n2, n3,
+ & isgn, a, t, ip, w)
+ call rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
+ end if
+ end
+!
+ subroutine rdft3dsort(n1max, n2max, n1, n2, n3, isgn, a)
+ integer n1max, n2max, n1, n2, n3, isgn, n2h, n3h, j, k
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), x, y
+ n2h = n2 / 2
+ n3h = n3 / 2
+ if (isgn .lt. 0) then
+ do k = 0, n3 - 1
+ do j = n2h + 1, n2 - 1
+ a(0, j, k) = a(n1 + 1, j, k)
+ a(1, j, k) = a(n1, j, k)
+ end do
+ end do
+ do k = n3h + 1, n3 - 1
+ a(0, 0, k) = a(n1 + 1, 0, k)
+ a(1, 0, k) = a(n1, 0, k)
+ a(0, n2h, k) = a(n1 + 1, n2h, k)
+ a(1, n2h, k) = a(n1, n2h, k)
+ end do
+ a(1, 0, 0) = a(n1, 0, 0)
+ a(1, n2h, 0) = a(n1, n2h, 0)
+ a(1, 0, n3h) = a(n1, 0, n3h)
+ a(1, n2h, n3h) = a(n1, n2h, n3h)
+ else
+ do j = n2h + 1, n2 - 1
+ y = a(0, j, 0)
+ x = a(1, j, 0)
+ a(n1, j, 0) = x
+ a(n1 + 1, j, 0) = y
+ a(n1, n2 - j, 0) = x
+ a(n1 + 1, n2 - j, 0) = -y
+ a(0, j, 0) = a(0, n2 - j, 0)
+ a(1, j, 0) = -a(1, n2 - j, 0)
+ end do
+ do k = 1, n3 - 1
+ do j = n2h + 1, n2 - 1
+ y = a(0, j, k)
+ x = a(1, j, k)
+ a(n1, j, k) = x
+ a(n1 + 1, j, k) = y
+ a(n1, n2 - j, n3 - k) = x
+ a(n1 + 1, n2 - j, n3 - k) = -y
+ a(0, j, k) = a(0, n2 - j, n3 - k)
+ a(1, j, k) = -a(1, n2 - j, n3 - k)
+ end do
+ end do
+ do k = n3h + 1, n3 - 1
+ y = a(0, 0, k)
+ x = a(1, 0, k)
+ a(n1, 0, k) = x
+ a(n1 + 1, 0, k) = y
+ a(n1, 0, n3 - k) = x
+ a(n1 + 1, 0, n3 - k) = -y
+ a(0, 0, k) = a(0, 0, n3 - k)
+ a(1, 0, k) = -a(1, 0, n3 - k)
+ y = a(0, n2h, k)
+ x = a(1, n2h, k)
+ a(n1, n2h, k) = x
+ a(n1 + 1, n2h, k) = y
+ a(n1, n2h, n3 - k) = x
+ a(n1 + 1, n2h, n3 - k) = -y
+ a(0, n2h, k) = a(0, n2h, n3 - k)
+ a(1, n2h, k) = -a(1, n2h, n3 - k)
+ end do
+ a(n1, 0, 0) = a(1, 0, 0)
+ a(n1 + 1, 0, 0) = 0
+ a(1, 0, 0) = 0
+ a(n1, n2h, 0) = a(1, n2h, 0)
+ a(n1 + 1, n2h, 0) = 0
+ a(1, n2h, 0) = 0
+ a(n1, 0, n3h) = a(1, 0, n3h)
+ a(n1 + 1, 0, n3h) = 0
+ a(1, 0, n3h) = 0
+ a(n1, n2h, n3h) = a(1, n2h, n3h)
+ a(n1 + 1, n2h, n3h) = 0
+ a(1, n2h, n3h) = 0
+ end if
+ end
+!
+ subroutine ddct3d(n1max, n2max, n1, n2, n3, isgn, a,
+ & t, ip, w)
+ integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *),
+ & n, nw, nc
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ n = max(n2, n3)
+ n = max(n, n1)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. nc) then
+ nc = n
+ call makect(nc, ip, w(nw))
+ end if
+ call ddxt3da_sub(n1max, n2max, n1, n2, n3, 0,
+ & isgn, a, t, ip, w)
+ call ddxt3db_sub(n1max, n2max, n1, n2, n3, 0,
+ & isgn, a, t, ip, w)
+ end
+!
+ subroutine ddst3d(n1max, n2max, n1, n2, n3, isgn, a,
+ & t, ip, w)
+ integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *),
+ & n, nw, nc
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ n = max(n2, n3)
+ n = max(n, n1)
+ nw = ip(0)
+ if (n .gt. 4 * nw) then
+ nw = n / 4
+ call makewt(nw, ip, w)
+ end if
+ nc = ip(1)
+ if (n .gt. nc) then
+ nc = n
+ call makect(nc, ip, w(nw))
+ end if
+ call ddxt3da_sub(n1max, n2max, n1, n2, n3, 1,
+ & isgn, a, t, ip, w)
+ call ddxt3db_sub(n1max, n2max, n1, n2, n3, 1,
+ & isgn, a, t, ip, w)
+ end
+!
+! -------- child routines --------
+!
+ subroutine xdft3da_sub(n1max, n2max, n1, n2, n3, icr,
+ & isgn, a, t, ip, w)
+ integer n1max, n2max, n1, n2, n3, icr, isgn,
+ & ip(0 : *), i, j, k
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ do k = 0, n3 - 1
+ if (icr .eq. 0) then
+ do j = 0, n2 - 1
+ call cdft(n1, isgn, a(0, j, k), ip, w)
+ end do
+ else if (isgn .ge. 0) then
+ do j = 0, n2 - 1
+ call rdft(n1, isgn, a(0, j, k), ip, w)
+ end do
+ end if
+ if (n1 .gt. 4) then
+ do i = 0, n1 - 8, 8
+ do j = 0, n2 - 1
+ t(2 * j) = a(i, j, k)
+ t(2 * j + 1) = a(i + 1, j, k)
+ t(2 * n2 + 2 * j) = a(i + 2, j, k)
+ t(2 * n2 + 2 * j + 1) = a(i + 3, j, k)
+ t(4 * n2 + 2 * j) = a(i + 4, j, k)
+ t(4 * n2 + 2 * j + 1) = a(i + 5, j, k)
+ t(6 * n2 + 2 * j) = a(i + 6, j, k)
+ t(6 * n2 + 2 * j + 1) = a(i + 7, j, k)
+ end do
+ call cdft(2 * n2, isgn, t, ip, w)
+ call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+ call cdft(2 * n2, isgn, t(4 * n2), ip, w)
+ call cdft(2 * n2, isgn, t(6 * n2), ip, w)
+ do j = 0, n2 - 1
+ a(i, j, k) = t(2 * j)
+ a(i + 1, j, k) = t(2 * j + 1)
+ a(i + 2, j, k) = t(2 * n2 + 2 * j)
+ a(i + 3, j, k) = t(2 * n2 + 2 * j + 1)
+ a(i + 4, j, k) = t(4 * n2 + 2 * j)
+ a(i + 5, j, k) = t(4 * n2 + 2 * j + 1)
+ a(i + 6, j, k) = t(6 * n2 + 2 * j)
+ a(i + 7, j, k) = t(6 * n2 + 2 * j + 1)
+ end do
+ end do
+ else if (n1 .eq. 4) then
+ do j = 0, n2 - 1
+ t(2 * j) = a(0, j, k)
+ t(2 * j + 1) = a(1, j, k)
+ t(2 * n2 + 2 * j) = a(2, j, k)
+ t(2 * n2 + 2 * j + 1) = a(3, j, k)
+ end do
+ call cdft(2 * n2, isgn, t, ip, w)
+ call cdft(2 * n2, isgn, t(2 * n2), ip, w)
+ do j = 0, n2 - 1
+ a(0, j, k) = t(2 * j)
+ a(1, j, k) = t(2 * j + 1)
+ a(2, j, k) = t(2 * n2 + 2 * j)
+ a(3, j, k) = t(2 * n2 + 2 * j + 1)
+ end do
+ else if (n1 .eq. 2) then
+ do j = 0, n2 - 1
+ t(2 * j) = a(0, j, k)
+ t(2 * j + 1) = a(1, j, k)
+ end do
+ call cdft(2 * n2, isgn, t, ip, w)
+ do j = 0, n2 - 1
+ a(0, j, k) = t(2 * j)
+ a(1, j, k) = t(2 * j + 1)
+ end do
+ end if
+ if (icr .ne. 0 .and. isgn .lt. 0) then
+ do j = 0, n2 - 1
+ call rdft(n1, isgn, a(0, j, k), ip, w)
+ end do
+ end if
+ end do
+ end
+!
+ subroutine cdft3db_sub(n1max, n2max, n1, n2, n3,
+ & isgn, a, t, ip, w)
+ integer n1max, n2max, n1, n2, n3, isgn, ip(0 : *),
+ & i, j, k
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ if (n1 .gt. 4) then
+ do j = 0, n2 - 1
+ do i = 0, n1 - 8, 8
+ do k = 0, n3 - 1
+ t(2 * k) = a(i, j, k)
+ t(2 * k + 1) = a(i + 1, j, k)
+ t(2 * n3 + 2 * k) = a(i + 2, j, k)
+ t(2 * n3 + 2 * k + 1) = a(i + 3, j, k)
+ t(4 * n3 + 2 * k) = a(i + 4, j, k)
+ t(4 * n3 + 2 * k + 1) = a(i + 5, j, k)
+ t(6 * n3 + 2 * k) = a(i + 6, j, k)
+ t(6 * n3 + 2 * k + 1) = a(i + 7, j, k)
+ end do
+ call cdft(2 * n3, isgn, t, ip, w)
+ call cdft(2 * n3, isgn, t(2 * n3), ip, w)
+ call cdft(2 * n3, isgn, t(4 * n3), ip, w)
+ call cdft(2 * n3, isgn, t(6 * n3), ip, w)
+ do k = 0, n3 - 1
+ a(i, j, k) = t(2 * k)
+ a(i + 1, j, k) = t(2 * k + 1)
+ a(i + 2, j, k) = t(2 * n3 + 2 * k)
+ a(i + 3, j, k) = t(2 * n3 + 2 * k + 1)
+ a(i + 4, j, k) = t(4 * n3 + 2 * k)
+ a(i + 5, j, k) = t(4 * n3 + 2 * k + 1)
+ a(i + 6, j, k) = t(6 * n3 + 2 * k)
+ a(i + 7, j, k) = t(6 * n3 + 2 * k + 1)
+ end do
+ end do
+ end do
+ else if (n1 .eq. 4) then
+ do j = 0, n2 - 1
+ do k = 0, n3 - 1
+ t(2 * k) = a(0, j, k)
+ t(2 * k + 1) = a(1, j, k)
+ t(2 * n3 + 2 * k) = a(2, j, k)
+ t(2 * n3 + 2 * k + 1) = a(3, j, k)
+ end do
+ call cdft(2 * n3, isgn, t, ip, w)
+ call cdft(2 * n3, isgn, t(2 * n3), ip, w)
+ do k = 0, n3 - 1
+ a(0, j, k) = t(2 * k)
+ a(1, j, k) = t(2 * k + 1)
+ a(2, j, k) = t(2 * n3 + 2 * k)
+ a(3, j, k) = t(2 * n3 + 2 * k + 1)
+ end do
+ end do
+ else if (n1 .eq. 2) then
+ do j = 0, n2 - 1
+ do k = 0, n3 - 1
+ t(2 * k) = a(0, j, k)
+ t(2 * k + 1) = a(1, j, k)
+ end do
+ call cdft(2 * n3, isgn, t, ip, w)
+ do k = 0, n3 - 1
+ a(0, j, k) = t(2 * k)
+ a(1, j, k) = t(2 * k + 1)
+ end do
+ end do
+ end if
+ end
+!
+ subroutine rdft3d_sub(n1max, n2max, n1, n2, n3, isgn, a)
+ integer n1max, n2max, n1, n2, n3, isgn,
+ & n2h, n3h, i, j, k, l
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1), xi
+ n2h = n2 / 2
+ n3h = n3 / 2
+ if (isgn .lt. 0) then
+ do k = 1, n3h - 1
+ l = n3 - k
+ xi = a(0, 0, k) - a(0, 0, l)
+ a(0, 0, k) = a(0, 0, k) + a(0, 0, l)
+ a(0, 0, l) = xi
+ xi = a(1, 0, l) - a(1, 0, k)
+ a(1, 0, k) = a(1, 0, k) + a(1, 0, l)
+ a(1, 0, l) = xi
+ xi = a(0, n2h, k) - a(0, n2h, l)
+ a(0, n2h, k) = a(0, n2h, k) + a(0, n2h, l)
+ a(0, n2h, l) = xi
+ xi = a(1, n2h, l) - a(1, n2h, k)
+ a(1, n2h, k) = a(1, n2h, k) + a(1, n2h, l)
+ a(1, n2h, l) = xi
+ do i = 1, n2h - 1
+ j = n2 - i
+ xi = a(0, i, k) - a(0, j, l)
+ a(0, i, k) = a(0, i, k) + a(0, j, l)
+ a(0, j, l) = xi
+ xi = a(1, j, l) - a(1, i, k)
+ a(1, i, k) = a(1, i, k) + a(1, j, l)
+ a(1, j, l) = xi
+ xi = a(0, i, l) - a(0, j, k)
+ a(0, i, l) = a(0, i, l) + a(0, j, k)
+ a(0, j, k) = xi
+ xi = a(1, j, k) - a(1, i, l)
+ a(1, i, l) = a(1, i, l) + a(1, j, k)
+ a(1, j, k) = xi
+ end do
+ end do
+ do i = 1, n2h - 1
+ j = n2 - i
+ xi = a(0, i, 0) - a(0, j, 0)
+ a(0, i, 0) = a(0, i, 0) + a(0, j, 0)
+ a(0, j, 0) = xi
+ xi = a(1, j, 0) - a(1, i, 0)
+ a(1, i, 0) = a(1, i, 0) + a(1, j, 0)
+ a(1, j, 0) = xi
+ xi = a(0, i, n3h) - a(0, j, n3h)
+ a(0, i, n3h) = a(0, i, n3h) + a(0, j, n3h)
+ a(0, j, n3h) = xi
+ xi = a(1, j, n3h) - a(1, i, n3h)
+ a(1, i, n3h) = a(1, i, n3h) + a(1, j, n3h)
+ a(1, j, n3h) = xi
+ end do
+ else
+ do k = 1, n3h - 1
+ l = n3 - k
+ a(0, 0, l) = 0.5d0 * (a(0, 0, k) - a(0, 0, l))
+ a(0, 0, k) = a(0, 0, k) - a(0, 0, l)
+ a(1, 0, l) = 0.5d0 * (a(1, 0, k) + a(1, 0, l))
+ a(1, 0, k) = a(1, 0, k) - a(1, 0, l)
+ a(0, n2h, l) = 0.5d0 * (a(0, n2h, k) - a(0, n2h, l))
+ a(0, n2h, k) = a(0, n2h, k) - a(0, n2h, l)
+ a(1, n2h, l) = 0.5d0 * (a(1, n2h, k) + a(1, n2h, l))
+ a(1, n2h, k) = a(1, n2h, k) - a(1, n2h, l)
+ do i = 1, n2h - 1
+ j = n2 - i
+ a(0, j, l) = 0.5d0 * (a(0, i, k) - a(0, j, l))
+ a(0, i, k) = a(0, i, k) - a(0, j, l)
+ a(1, j, l) = 0.5d0 * (a(1, i, k) + a(1, j, l))
+ a(1, i, k) = a(1, i, k) - a(1, j, l)
+ a(0, j, k) = 0.5d0 * (a(0, i, l) - a(0, j, k))
+ a(0, i, l) = a(0, i, l) - a(0, j, k)
+ a(1, j, k) = 0.5d0 * (a(1, i, l) + a(1, j, k))
+ a(1, i, l) = a(1, i, l) - a(1, j, k)
+ end do
+ end do
+ do i = 1, n2h - 1
+ j = n2 - i
+ a(0, j, 0) = 0.5d0 * (a(0, i, 0) - a(0, j, 0))
+ a(0, i, 0) = a(0, i, 0) - a(0, j, 0)
+ a(1, j, 0) = 0.5d0 * (a(1, i, 0) + a(1, j, 0))
+ a(1, i, 0) = a(1, i, 0) - a(1, j, 0)
+ a(0, j, n3h) = 0.5d0 * (a(0, i, n3h) - a(0, j, n3h))
+ a(0, i, n3h) = a(0, i, n3h) - a(0, j, n3h)
+ a(1, j, n3h) = 0.5d0 * (a(1, i, n3h) + a(1, j, n3h))
+ a(1, i, n3h) = a(1, i, n3h) - a(1, j, n3h)
+ end do
+ end if
+ end
+!
+ subroutine ddxt3da_sub(n1max, n2max, n1, n2, n3, ics,
+ & isgn, a, t, ip, w)
+ integer n1max, n2max, n1, n2, n3, ics, isgn,
+ & ip(0 : *), i, j, k
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ do k = 0, n3 - 1
+ if (ics .eq. 0) then
+ do j = 0, n2 - 1
+ call ddct(n1, isgn, a(0, j, k), ip, w)
+ end do
+ else
+ do j = 0, n2 - 1
+ call ddst(n1, isgn, a(0, j, k), ip, w)
+ end do
+ end if
+ if (n1 .gt. 2) then
+ do i = 0, n1 - 4, 4
+ do j = 0, n2 - 1
+ t(j) = a(i, j, k)
+ t(n2 + j) = a(i + 1, j, k)
+ t(2 * n2 + j) = a(i + 2, j, k)
+ t(3 * n2 + j) = a(i + 3, j, k)
+ end do
+ if (ics .eq. 0) then
+ call ddct(n2, isgn, t, ip, w)
+ call ddct(n2, isgn, t(n2), ip, w)
+ call ddct(n2, isgn, t(2 * n2), ip, w)
+ call ddct(n2, isgn, t(3 * n2), ip, w)
+ else
+ call ddst(n2, isgn, t, ip, w)
+ call ddst(n2, isgn, t(n2), ip, w)
+ call ddst(n2, isgn, t(2 * n2), ip, w)
+ call ddst(n2, isgn, t(3 * n2), ip, w)
+ end if
+ do j = 0, n2 - 1
+ a(i, j, k) = t(j)
+ a(i + 1, j, k) = t(n2 + j)
+ a(i + 2, j, k) = t(2 * n2 + j)
+ a(i + 3, j, k) = t(3 * n2 + j)
+ end do
+ end do
+ else if (n1 .eq. 2) then
+ do j = 0, n2 - 1
+ t(j) = a(0, j, k)
+ t(n2 + j) = a(1, j, k)
+ end do
+ if (ics .eq. 0) then
+ call ddct(n2, isgn, t, ip, w)
+ call ddct(n2, isgn, t(n2), ip, w)
+ else
+ call ddst(n2, isgn, t, ip, w)
+ call ddst(n2, isgn, t(n2), ip, w)
+ end if
+ do j = 0, n2 - 1
+ a(0, j, k) = t(j)
+ a(1, j, k) = t(n2 + j)
+ end do
+ end if
+ end do
+ end
+!
+ subroutine ddxt3db_sub(n1max, n2max, n1, n2, n3, ics,
+ & isgn, a, t, ip, w)
+ integer n1max, n2max, n1, n2, n3, ics, isgn,
+ & ip(0 : *), i, j, k
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : n3 - 1),
+ & t(0 : *), w(0 : *)
+ if (n1 .gt. 2) then
+ do j = 0, n2 - 1
+ do i = 0, n1 - 4, 4
+ do k = 0, n3 - 1
+ t(k) = a(i, j, k)
+ t(n3 + k) = a(i + 1, j, k)
+ t(2 * n3 + k) = a(i + 2, j, k)
+ t(3 * n3 + k) = a(i + 3, j, k)
+ end do
+ if (ics .eq. 0) then
+ call ddct(n3, isgn, t, ip, w)
+ call ddct(n3, isgn, t(n3), ip, w)
+ call ddct(n3, isgn, t(2 * n3), ip, w)
+ call ddct(n3, isgn, t(3 * n3), ip, w)
+ else
+ call ddst(n3, isgn, t, ip, w)
+ call ddst(n3, isgn, t(n3), ip, w)
+ call ddst(n3, isgn, t(2 * n3), ip, w)
+ call ddst(n3, isgn, t(3 * n3), ip, w)
+ end if
+ do k = 0, n3 - 1
+ a(i, j, k) = t(k)
+ a(i + 1, j, k) = t(n3 + k)
+ a(i + 2, j, k) = t(2 * n3 + k)
+ a(i + 3, j, k) = t(3 * n3 + k)
+ end do
+ end do
+ end do
+ else if (n1 .eq. 2) then
+ do j = 0, n2 - 1
+ do k = 0, n3 - 1
+ t(k) = a(0, j, k)
+ t(n3 + k) = a(1, j, k)
+ end do
+ if (ics .eq. 0) then
+ call ddct(n3, isgn, t, ip, w)
+ call ddct(n3, isgn, t(n3), ip, w)
+ else
+ call ddst(n3, isgn, t, ip, w)
+ call ddst(n3, isgn, t(n3), ip, w)
+ end if
+ do k = 0, n3 - 1
+ a(0, j, k) = t(k)
+ a(1, j, k) = t(n3 + k)
+ end do
+ end do
+ end if
+ end
+!
diff --git a/src/fft2d/fft2d/readme2d.txt b/src/fft2d/fft2d/readme2d.txt
new file mode 100644
index 0000000..5a48e1d
--- /dev/null
+++ b/src/fft2d/fft2d/readme2d.txt
@@ -0,0 +1,71 @@
+General Purpose 2D,3D FFT (Fast Fourier Transform) Package
+
+Files
+ alloc.c : 2D-array Allocation
+ alloc.h : 2D-array Allocation
+ fft4f2d.c : 2D FFT Package in C - Version I (radix 4, 2)
+ fft4f2d.f : 2D FFT Package in Fortran - Version I (radix 4, 2)
+ fftsg.c : 1D FFT Package in C - Fast Version (Split-Radix)
+ fftsg.f : 1D FFT Package in Fortran - Fast Version (Split-Radix)
+ fftsg2d.c : 2D FFT Package in C - Version II (Split-Radix)
+ fftsg2d.f : 2D FFT Package in Fortran - Version II (Split-Radix)
+ fftsg3d.c : 3D FFT Package in C - Version II (Split-Radix)
+ fftsg3d.f : 3D FFT Package in Fortran - Version II (Split-Radix)
+ shrtdct.c : 8x8, 16x16 DCT Package
+ sample2d/
+ Makefile : for gcc, cc
+ Makefile.f77: for Fortran
+ Makefile.pth: Pthread version
+ fft4f2dt.c : Test Program for "fft4f2d.c"
+ fft4f2dt.f : Test Program for "fft4f2d.f"
+ fftsg2dt.c : Test Program for "fftsg2d.c"
+ fftsg2dt.f : Test Program for "fftsg2d.f"
+ fftsg3dt.c : Test Program for "fftsg3d.c"
+ fftsg3dt.f : Test Program for "fftsg3d.f"
+ shrtdctt.c : Test Program for "shrtdct.c"
+
+Difference of Files
+ C and Fortran versions are equal and
+ the same routines are in each version.
+ ---- Difference between "fft4f2d.*" and "fftsg2d.*" ----
+ "fft4f2d.*" are optimized for the old machines that
+ don't have the large size CPU cache.
+ "fftsg2d.*", "fftsg3d.*" use 1D FFT routines in "fftsg.*".
+ "fftsg2d.*", "fftsg3d.*" are optimized for the machines that
+ have the multi-level (L1,L2,etc) cache.
+
+Routines in the Package
+ in fft4f2d.*, fftsg2d.*
+ cdft2d: 2-dim Complex Discrete Fourier Transform
+ rdft2d: 2-dim Real Discrete Fourier Transform
+ ddct2d: 2-dim Discrete Cosine Transform
+ ddst2d: 2-dim Discrete Sine Transform
+ rdft2dsort: rdft2d input/output ordering (fftsg2d.*)
+ in fftsg3d.*
+ cdft3d: 3-dim Complex Discrete Fourier Transform
+ rdft3d: 3-dim Real Discrete Fourier Transform
+ ddct3d: 3-dim Discrete Cosine Transform
+ ddst3d: 3-dim Discrete Sine Transform
+ rdft3dsort: rdft3d input/output ordering
+ in fftsg.*
+ cdft: 1-dim Complex Discrete Fourier Transform
+ rdft: 1-dim Real Discrete Fourier Transform
+ ddct: 1-dim Discrete Cosine Transform
+ ddst: 1-dim Discrete Sine Transform
+ dfct: 1-dim Real Symmetric DFT
+ dfst: 1-dim Real Anti-symmetric DFT
+ (these routines are called by fftsg2d.*, fftsg3d.*)
+ in shrtdct.c
+ ddct8x8s : Normalized 8x8 DCT
+ ddct16x16s: Normalized 16x16 DCT
+ (faster than ddct2d())
+
+Usage
+ Brief explanations are in block comments of each packages.
+ The examples are given in the test programs.
+
+Copyright
+ Copyright(C) 1997,2001 Takuya OOURA (email: ooura@kurims.kyoto-u.ac.jp).
+ You may use, copy, modify this code for any purpose and
+ without fee. You may distribute this ORIGINAL package.
+
diff --git a/src/fft2d/fft2d/sample2d/Makefile b/src/fft2d/fft2d/sample2d/Makefile
new file mode 100644
index 0000000..130e0a4
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/Makefile
@@ -0,0 +1,72 @@
+# ---- for GNU gcc ----
+
+CC = gcc
+
+CFLAGS = -Wall
+
+OFLAGS = -O2
+
+# ---- for SUN WS cc ----
+#
+#CC = cc
+#
+#CFLAGS =
+#
+#OFLAGS = -xO2
+
+
+
+
+all: fft4f2dt fftsg2dt fftsg3dt shrtdctt
+
+
+fft4f2dt : fft4f2dt.o fft4f2d.o alloc.o
+ $(CC) fft4f2dt.o fft4f2d.o alloc.o -lm -o fft4f2dt
+
+fftsg2dt : fftsg2dt.o fftsg2d.o fftsg.o alloc.o
+ $(CC) fftsg2dt.o fftsg2d.o fftsg.o alloc.o -lm -o fftsg2dt
+
+fftsg3dt : fftsg3dt.o fftsg3d.o fftsg.o alloc.o
+ $(CC) fftsg3dt.o fftsg3d.o fftsg.o alloc.o -lm -o fftsg3dt
+
+shrtdctt : shrtdctt.o shrtdct.o
+ $(CC) shrtdctt.o shrtdct.o -lm -o shrtdctt
+
+
+fft4f2dt.o : fft4f2dt.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c fft4f2dt.c -o fft4f2dt.o
+
+fftsg2dt.o : fftsg2dt.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c fftsg2dt.c -o fftsg2dt.o
+
+fftsg3dt.o : fftsg3dt.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c fftsg3dt.c -o fftsg3dt.o
+
+shrtdctt.o : shrtdctt.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c shrtdctt.c -o shrtdctt.o
+
+
+fft4f2d.o : ../fft4f2d.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../fft4f2d.c -o fft4f2d.o
+
+fftsg2d.o : ../fftsg2d.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg2d.c -o fftsg2d.o
+
+fftsg3d.o : ../fftsg3d.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg3d.c -o fftsg3d.o
+
+fftsg.o : ../fftsg.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg.c -o fftsg.o
+
+alloc.o : ../alloc.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../alloc.c -o alloc.o
+
+shrtdct.o : ../shrtdct.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../shrtdct.c -o shrtdct.o
+
+
+
+
+clean:
+ rm -f *.o
+
diff --git a/src/fft2d/fft2d/sample2d/Makefile.f77 b/src/fft2d/fft2d/sample2d/Makefile.f77
new file mode 100644
index 0000000..0162fdc
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/Makefile.f77
@@ -0,0 +1,60 @@
+# ---- for GNU g77 ----
+
+F77 = g77
+
+FFLAGS = -Wall
+
+OFLAGS = -O2
+
+# ---- for SUN WS f77 ----
+#
+#F77 = f77
+#
+#FFLAGS =
+#
+#OFLAGS = -xO2
+
+
+
+
+all: fft4f2dt_f fftsg2dt_f fftsg3dt_f
+
+
+fft4f2dt_f : fft4f2dt_f.o fft4f2d_f.o
+ $(F77) fft4f2dt_f.o fft4f2d_f.o -o fft4f2dt_f
+
+fftsg2dt_f : fftsg2dt_f.o fftsg2d_f.o fftsg_f.o
+ $(F77) fftsg2dt_f.o fftsg2d_f.o fftsg_f.o -o fftsg2dt_f
+
+fftsg3dt_f : fftsg3dt_f.o fftsg3d_f.o fftsg_f.o
+ $(F77) fftsg3dt_f.o fftsg3d_f.o fftsg_f.o -o fftsg3dt_f
+
+
+fft4f2dt_f.o : fft4f2dt.f
+ $(F77) $(FFLAGS) $(OFLAGS) -c fft4f2dt.f -o fft4f2dt_f.o
+
+fftsg2dt_f.o : fftsg2dt.f
+ $(F77) $(FFLAGS) $(OFLAGS) -c fftsg2dt.f -o fftsg2dt_f.o
+
+fftsg3dt_f.o : fftsg3dt.f
+ $(F77) $(FFLAGS) $(OFLAGS) -c fftsg3dt.f -o fftsg3dt_f.o
+
+
+fft4f2d_f.o : ../fft4f2d.f
+ $(F77) $(FFLAGS) $(OFLAGS) -c ../fft4f2d.f -o fft4f2d_f.o
+
+fftsg2d_f.o : ../fftsg2d.f
+ $(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg2d.f -o fftsg2d_f.o
+
+fftsg3d_f.o : ../fftsg3d.f
+ $(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg3d.f -o fftsg3d_f.o
+
+fftsg_f.o : ../fftsg.f
+ $(F77) $(FFLAGS) $(OFLAGS) -c ../fftsg.f -o fftsg_f.o
+
+
+
+
+clean:
+ rm -f *.o
+
diff --git a/src/fft2d/fft2d/sample2d/Makefile.pth b/src/fft2d/fft2d/sample2d/Makefile.pth
new file mode 100644
index 0000000..d46941e
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/Makefile.pth
@@ -0,0 +1,54 @@
+# ---- for GNU gcc ----
+
+CC = gcc
+
+CFLAGS = -Wall -DUSE_FFT2D_PTHREADS -DUSE_FFT3D_PTHREADS
+
+OFLAGS = -O2
+
+# ---- for SUN WS cc ----
+#
+#CC = cc
+#
+#CFLAGS = -Wall -DUSE_FFT2D_PTHREADS -DUSE_FFT3D_PTHREADS
+#
+#OFLAGS = -xO2
+
+
+
+
+all: fftsg2dt_pt fftsg3dt_pt
+
+
+fftsg2dt_pt : fftsg2dt.o fftsg2dpt.o fftsg.o alloc.o
+ $(CC) fftsg2dt.o fftsg2dpt.o fftsg.o alloc.o -lm -lpthread -o fftsg2dt_pt
+
+fftsg3dt_pt : fftsg3dt.o fftsg3dpt.o fftsg.o alloc.o
+ $(CC) fftsg3dt.o fftsg3dpt.o fftsg.o alloc.o -lm -lpthread -o fftsg3dt_pt
+
+
+fftsg2dt.o : fftsg2dt.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c fftsg2dt.c -o fftsg2dt.o
+
+fftsg3dt.o : fftsg3dt.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c fftsg3dt.c -o fftsg3dt.o
+
+
+fftsg2dpt.o : ../fftsg2d.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg2d.c -o fftsg2dpt.o
+
+fftsg3dpt.o : ../fftsg3d.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg3d.c -o fftsg3dpt.o
+
+fftsg.o : ../fftsg.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../fftsg.c -o fftsg.o
+
+alloc.o : ../alloc.c
+ $(CC) $(CFLAGS) $(OFLAGS) -c ../alloc.c -o alloc.o
+
+
+
+
+clean:
+ rm -f *.o
+
diff --git a/src/fft2d/fft2d/sample2d/alloc.h b/src/fft2d/fft2d/sample2d/alloc.h
new file mode 100644
index 0000000..3467cc4
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/alloc.h
@@ -0,0 +1,20 @@
+/* ---- memory allocation ---- */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+int *alloc_1d_int(int n1);
+void free_1d_int(int *i);
+double *alloc_1d_double(int n1);
+void free_1d_double(double *d);
+int **alloc_2d_int(int n1, int n2);
+void free_2d_int(int **ii);
+double **alloc_2d_double(int n1, int n2);
+void free_2d_double(double **dd);
+int ***alloc_3d_int(int n1, int n2, int n3);
+void free_3d_int(int ***iii);
+double ***alloc_3d_double(int n1, int n2, int n3);
+void free_3d_double(double ***ddd);
+
diff --git a/src/fft2d/fft2d/sample2d/fft4f2dt.c b/src/fft2d/fft2d/sample2d/fft4f2dt.c
new file mode 100644
index 0000000..2f300fd
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/fft4f2dt.c
@@ -0,0 +1,109 @@
+/* test of fft4f2d.c */
+
+#include <math.h>
+#include <stdio.h>
+#include "alloc.h"
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+
+int main()
+{
+ void cdft2d(int, int, int, double **, int *, double *);
+ void rdft2d(int, int, int, double **, int *, double *);
+ void ddct2d(int, int, int, double **, double **, int *, double *);
+ void ddst2d(int, int, int, double **, double **, int *, double *);
+ void putdata2d(int n1, int n2, double **a);
+ double errorcheck2d(int n1, int n2, double scale, double **a);
+ int *ip, n1, n2, n, i;
+ double **a, **t, *w, err;
+
+ printf("data length n1=? (n1 = power of 2) \n");
+ scanf("%d", &n1);
+ printf("data length n2=? (n2 = power of 2) \n");
+ scanf("%d", &n2);
+
+ a = alloc_2d_double(n1, n2);
+ t = alloc_2d_double(n1, n2);
+ n = MAX(n1, n2 / 2);
+ ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
+ n = MAX(n1 / 2, n2 / 4) + MAX(n1, n2);
+ w = alloc_1d_double(n);
+ ip[0] = 0;
+
+ /* check of CDFT */
+ putdata2d(n1, n2, a);
+ cdft2d(n1, n2, 1, a, ip, w);
+ cdft2d(n1, n2, -1, a, ip, w);
+ err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+ printf("cdft2d err= %g \n", err);
+
+ /* check of RDFT */
+ putdata2d(n1, n2, a);
+ rdft2d(n1, n2, 1, a, ip, w);
+ rdft2d(n1, n2, -1, a, ip, w);
+ err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+ printf("rdft2d err= %g \n", err);
+
+ /* check of DDCT */
+ putdata2d(n1, n2, a);
+ ddct2d(n1, n2, 1, a, t, ip, w);
+ ddct2d(n1, n2, -1, a, t, ip, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ a[i][0] *= 0.5;
+ }
+ for (i = 0; i <= n2 - 1; i++) {
+ a[0][i] *= 0.5;
+ }
+ err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+ printf("ddct2d err= %g \n", err);
+
+ /* check of DDST */
+ putdata2d(n1, n2, a);
+ ddst2d(n1, n2, 1, a, t, ip, w);
+ ddst2d(n1, n2, -1, a, t, ip, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ a[i][0] *= 0.5;
+ }
+ for (i = 0; i <= n2 - 1; i++) {
+ a[0][i] *= 0.5;
+ }
+ err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+ printf("ddst2d err= %g \n", err);
+
+ free_1d_double(w);
+ free_1d_int(ip);
+ free_2d_double(t);
+ free_2d_double(a);
+ return 0;
+}
+
+
+void putdata2d(int n1, int n2, double **a)
+{
+ int j1, j2, seed = 0;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] = RND(&seed);
+ }
+ }
+}
+
+
+double errorcheck2d(int n1, int n2, double scale, double **a)
+{
+ int j1, j2, seed = 0;
+ double err = 0, e;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ e = RND(&seed) - a[j1][j2] * scale;
+ err = MAX(err, fabs(e));
+ }
+ }
+ return err;
+}
+
diff --git a/src/fft2d/fft2d/sample2d/fft4f2dt.f b/src/fft2d/fft2d/sample2d/fft4f2dt.f
new file mode 100644
index 0000000..43ac8ac
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/fft4f2dt.f
@@ -0,0 +1,95 @@
+! test of fft4f2d.f
+!
+ program main
+ integer nmax, nmaxsqrt
+ parameter (nmax = 1024)
+ parameter (nmaxsqrt = 32)
+ integer ip(0 : nmaxsqrt + 1), n1, n2, i
+ real*8 a(0 : nmax - 1, 0 : nmax - 1),
+ & t(0 : nmax - 1, 0 : nmax - 1), w(0 : nmax * 3 / 2 - 1),
+ & err, errorcheck2d
+!
+ write (*, *) 'data length n1=? (n1 = power of 2) '
+ read (*, *) n1
+ write (*, *) 'data length n2=? (n2 = power of 2) '
+ read (*, *) n2
+ ip(0) = 0
+!
+! check of CDFT
+ call putdata2d(nmax, n1, n2, a)
+ call cdft2d(nmax, n1, n2, 1, a, ip, w)
+ call cdft2d(nmax, n1, n2, -1, a, ip, w)
+ err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+ write (*, *) 'cdft2d err= ', err
+!
+! check of RDFT
+ call putdata2d(nmax, n1, n2, a)
+ call rdft2d(nmax, n1, n2, 1, a, ip, w)
+ call rdft2d(nmax, n1, n2, -1, a, ip, w)
+ err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+ write (*, *) 'rdft2d err= ', err
+!
+! check of DDCT
+ call putdata2d(nmax, n1, n2, a)
+ call ddct2d(nmax, n1, n2, 1, a, t, ip, w)
+ call ddct2d(nmax, n1, n2, -1, a, t, ip, w)
+ do i = 0, n1 - 1
+ a(i, 0) = a(i, 0) * 0.5d0
+ end do
+ do i = 0, n2 - 1
+ a(0, i) = a(0, i) * 0.5d0
+ end do
+ err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+ write (*, *) 'ddct2d err= ', err
+!
+! check of DDST
+ call putdata2d(nmax, n1, n2, a)
+ call ddst2d(nmax, n1, n2, 1, a, t, ip, w)
+ call ddst2d(nmax, n1, n2, -1, a, t, ip, w)
+ do i = 0, n1 - 1
+ a(i, 0) = a(i, 0) * 0.5d0
+ end do
+ do i = 0, n2 - 1
+ a(0, i) = a(0, i) * 0.5d0
+ end do
+ err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+ write (*, *) 'ddst2d err= ', err
+!
+ end
+!
+!
+ subroutine putdata2d(n1max, n1, n2, a)
+ integer n1max, n1, n2, j1, j2, seed
+ real*8 a(0 : n1max - 1, 0 : *), drnd
+ seed = 0
+ do j2 = 0, n2 - 1
+ do j1 = 0, n1 - 1
+ a(j1, j2) = drnd(seed)
+ end do
+ end do
+ end
+!
+!
+ function errorcheck2d(n1max, n1, n2, scale, a)
+ integer n1max, n1, n2, j1, j2, seed
+ real*8 scale, a(0 : n1max - 1, 0 : *), drnd, err, e,
+ & errorcheck2d
+ err = 0
+ seed = 0
+ do j2 = 0, n2 - 1
+ do j1 = 0, n1 - 1
+ e = drnd(seed) - a(j1, j2) * scale
+ err = max(err, abs(e))
+ end do
+ end do
+ errorcheck2d = err
+ end
+!
+!
+! random number generator, 0 <= drnd < 1
+ real*8 function drnd(seed)
+ integer seed
+ seed = mod(seed * 7141 + 54773, 259200)
+ drnd = seed * (1.0d0 / 259200)
+ end
+!
diff --git a/src/fft2d/fft2d/sample2d/fftsg2dt.c b/src/fft2d/fft2d/sample2d/fftsg2dt.c
new file mode 100644
index 0000000..c4cf935
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/fftsg2dt.c
@@ -0,0 +1,107 @@
+/* test of fftsg2d.c */
+
+#include <math.h>
+#include <stdio.h>
+#include "alloc.h"
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+
+int main()
+{
+ void cdft2d(int, int, int, double **, double *, int *, double *);
+ void rdft2d(int, int, int, double **, double *, int *, double *);
+ void ddct2d(int, int, int, double **, double *, int *, double *);
+ void ddst2d(int, int, int, double **, double *, int *, double *);
+ void putdata2d(int n1, int n2, double **a);
+ double errorcheck2d(int n1, int n2, double scale, double **a);
+ int *ip, n1, n2, n, i;
+ double **a, *w, err;
+
+ printf("data length n1=? (n1 = power of 2) \n");
+ scanf("%d", &n1);
+ printf("data length n2=? (n2 = power of 2) \n");
+ scanf("%d", &n2);
+
+ a = alloc_2d_double(n1, n2);
+ n = MAX(n1, n2 / 2);
+ ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
+ n = MAX(n1, n2) * 3 / 2;
+ w = alloc_1d_double(n);
+ ip[0] = 0;
+
+ /* check of CDFT */
+ putdata2d(n1, n2, a);
+ cdft2d(n1, n2, 1, a, NULL, ip, w);
+ cdft2d(n1, n2, -1, a, NULL, ip, w);
+ err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+ printf("cdft2d err= %g \n", err);
+
+ /* check of RDFT */
+ putdata2d(n1, n2, a);
+ rdft2d(n1, n2, 1, a, NULL, ip, w);
+ rdft2d(n1, n2, -1, a, NULL, ip, w);
+ err = errorcheck2d(n1, n2, 2.0 / n1 / n2, a);
+ printf("rdft2d err= %g \n", err);
+
+ /* check of DDCT */
+ putdata2d(n1, n2, a);
+ ddct2d(n1, n2, 1, a, NULL, ip, w);
+ ddct2d(n1, n2, -1, a, NULL, ip, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ a[i][0] *= 0.5;
+ }
+ for (i = 0; i <= n2 - 1; i++) {
+ a[0][i] *= 0.5;
+ }
+ err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+ printf("ddct2d err= %g \n", err);
+
+ /* check of DDST */
+ putdata2d(n1, n2, a);
+ ddst2d(n1, n2, 1, a, NULL, ip, w);
+ ddst2d(n1, n2, -1, a, NULL, ip, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ a[i][0] *= 0.5;
+ }
+ for (i = 0; i <= n2 - 1; i++) {
+ a[0][i] *= 0.5;
+ }
+ err = errorcheck2d(n1, n2, 4.0 / n1 / n2, a);
+ printf("ddst2d err= %g \n", err);
+
+ free_1d_double(w);
+ free_1d_int(ip);
+ free_2d_double(a);
+ return 0;
+}
+
+
+void putdata2d(int n1, int n2, double **a)
+{
+ int j1, j2, seed = 0;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] = RND(&seed);
+ }
+ }
+}
+
+
+double errorcheck2d(int n1, int n2, double scale, double **a)
+{
+ int j1, j2, seed = 0;
+ double err = 0, e;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ e = RND(&seed) - a[j1][j2] * scale;
+ err = MAX(err, fabs(e));
+ }
+ }
+ return err;
+}
+
diff --git a/src/fft2d/fft2d/sample2d/fftsg2dt.f b/src/fft2d/fft2d/sample2d/fftsg2dt.f
new file mode 100644
index 0000000..a375c05
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/fftsg2dt.f
@@ -0,0 +1,94 @@
+! test of fftsg2d.f
+!
+ program main
+ integer nmax, nmaxsqrt
+ parameter (nmax = 1024)
+ parameter (nmaxsqrt = 32)
+ integer ip(0 : nmaxsqrt + 1), n1, n2, i
+ real*8 a(0 : nmax - 1, 0 : nmax - 1), t(0 : 8 * nmax - 1),
+ & w(0 : nmax * 3 / 2 - 1), err, errorcheck2d
+!
+ write (*, *) 'data length n1=? (n1 = power of 2) '
+ read (*, *) n1
+ write (*, *) 'data length n2=? (n2 = power of 2) '
+ read (*, *) n2
+ ip(0) = 0
+!
+! check of CDFT
+ call putdata2d(nmax, n1, n2, a)
+ call cdft2d(nmax, n1, n2, 1, a, t, ip, w)
+ call cdft2d(nmax, n1, n2, -1, a, t, ip, w)
+ err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+ write (*, *) 'cdft2d err= ', err
+!
+! check of RDFT
+ call putdata2d(nmax, n1, n2, a)
+ call rdft2d(nmax, n1, n2, 1, a, t, ip, w)
+ call rdft2d(nmax, n1, n2, -1, a, t, ip, w)
+ err = errorcheck2d(nmax, n1, n2, 2.0d0 / n1 / n2, a)
+ write (*, *) 'rdft2d err= ', err
+!
+! check of DDCT
+ call putdata2d(nmax, n1, n2, a)
+ call ddct2d(nmax, n1, n2, 1, a, t, ip, w)
+ call ddct2d(nmax, n1, n2, -1, a, t, ip, w)
+ do i = 0, n1 - 1
+ a(i, 0) = a(i, 0) * 0.5d0
+ end do
+ do i = 0, n2 - 1
+ a(0, i) = a(0, i) * 0.5d0
+ end do
+ err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+ write (*, *) 'ddct2d err= ', err
+!
+! check of DDST
+ call putdata2d(nmax, n1, n2, a)
+ call ddst2d(nmax, n1, n2, 1, a, t, ip, w)
+ call ddst2d(nmax, n1, n2, -1, a, t, ip, w)
+ do i = 0, n1 - 1
+ a(i, 0) = a(i, 0) * 0.5d0
+ end do
+ do i = 0, n2 - 1
+ a(0, i) = a(0, i) * 0.5d0
+ end do
+ err = errorcheck2d(nmax, n1, n2, 4.0d0 / n1 / n2, a)
+ write (*, *) 'ddst2d err= ', err
+!
+ end
+!
+!
+ subroutine putdata2d(n1max, n1, n2, a)
+ integer n1max, n1, n2, j1, j2, seed
+ real*8 a(0 : n1max - 1, 0 : *), drnd
+ seed = 0
+ do j2 = 0, n2 - 1
+ do j1 = 0, n1 - 1
+ a(j1, j2) = drnd(seed)
+ end do
+ end do
+ end
+!
+!
+ function errorcheck2d(n1max, n1, n2, scale, a)
+ integer n1max, n1, n2, j1, j2, seed
+ real*8 scale, a(0 : n1max - 1, 0 : *), drnd, err, e,
+ & errorcheck2d
+ err = 0
+ seed = 0
+ do j2 = 0, n2 - 1
+ do j1 = 0, n1 - 1
+ e = drnd(seed) - a(j1, j2) * scale
+ err = max(err, abs(e))
+ end do
+ end do
+ errorcheck2d = err
+ end
+!
+!
+! random number generator, 0 <= drnd < 1
+ real*8 function drnd(seed)
+ integer seed
+ seed = mod(seed * 7141 + 54773, 259200)
+ drnd = seed * (1.0d0 / 259200)
+ end
+!
diff --git a/src/fft2d/fft2d/sample2d/fftsg3dt.c b/src/fft2d/fft2d/sample2d/fftsg3dt.c
new file mode 100644
index 0000000..87879e6
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/fftsg3dt.c
@@ -0,0 +1,128 @@
+/* test of fftsg3d.c */
+
+#include <math.h>
+#include <stdio.h>
+#include "alloc.h"
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+
+int main()
+{
+ void cdft3d(int, int, int, int, double ***, double *, int *, double *);
+ void rdft3d(int, int, int, int, double ***, double *, int *, double *);
+ void ddct3d(int, int, int, int, double ***, double *, int *, double *);
+ void ddst3d(int, int, int, int, double ***, double *, int *, double *);
+ void putdata3d(int n1, int n2, int n3, double ***a);
+ double errorcheck3d(int n1, int n2, int n3, double scale, double ***a);
+ int *ip, n1, n2, n3, n, nt, i, j;
+ double ***a, *w, err;
+
+ printf("data length n1=? (n1 = power of 2) \n");
+ scanf("%d", &n1);
+ printf("data length n2=? (n2 = power of 2) \n");
+ scanf("%d", &n2);
+ printf("data length n3=? (n3 = power of 2) \n");
+ scanf("%d", &n3);
+
+ a = alloc_3d_double(n1, n2, n3);
+ nt = MAX(n1, n2);
+ n = MAX(nt, n3 / 2);
+ ip = alloc_1d_int(2 + (int) sqrt(n + 0.5));
+ n = MAX(nt, n3) * 3 / 2;
+ w = alloc_1d_double(n);
+ ip[0] = 0;
+
+ /* check of CDFT */
+ putdata3d(n1, n2, n3, a);
+ cdft3d(n1, n2, n3, 1, a, NULL, ip, w);
+ cdft3d(n1, n2, n3, -1, a, NULL, ip, w);
+ err = errorcheck3d(n1, n2, n3, 2.0 / n1 / n2 / n3, a);
+ printf("cdft3d err= %g \n", err);
+
+ /* check of RDFT */
+ putdata3d(n1, n2, n3, a);
+ rdft3d(n1, n2, n3, 1, a, NULL, ip, w);
+ rdft3d(n1, n2, n3, -1, a, NULL, ip, w);
+ err = errorcheck3d(n1, n2, n3, 2.0 / n1 / n2 / n3, a);
+ printf("rdft3d err= %g \n", err);
+
+ /* check of DDCT */
+ putdata3d(n1, n2, n3, a);
+ ddct3d(n1, n2, n3, 1, a, NULL, ip, w);
+ ddct3d(n1, n2, n3, -1, a, NULL, ip, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ for (j = 0; j <= n2 - 1; j++) {
+ a[i][j][0] *= 0.5;
+ }
+ for (j = 0; j <= n3 - 1; j++) {
+ a[i][0][j] *= 0.5;
+ }
+ }
+ for (i = 0; i <= n2 - 1; i++) {
+ for (j = 0; j <= n3 - 1; j++) {
+ a[0][i][j] *= 0.5;
+ }
+ }
+ err = errorcheck3d(n1, n2, n3, 8.0 / n1 / n2 / n3, a);
+ printf("ddct3d err= %g \n", err);
+
+ /* check of DDST */
+ putdata3d(n1, n2, n3, a);
+ ddst3d(n1, n2, n3, 1, a, NULL, ip, w);
+ ddst3d(n1, n2, n3, -1, a, NULL, ip, w);
+ for (i = 0; i <= n1 - 1; i++) {
+ for (j = 0; j <= n2 - 1; j++) {
+ a[i][j][0] *= 0.5;
+ }
+ for (j = 0; j <= n3 - 1; j++) {
+ a[i][0][j] *= 0.5;
+ }
+ }
+ for (i = 0; i <= n2 - 1; i++) {
+ for (j = 0; j <= n3 - 1; j++) {
+ a[0][i][j] *= 0.5;
+ }
+ }
+ err = errorcheck3d(n1, n2, n3, 8.0 / n1 / n2 / n3, a);
+ printf("ddst3d err= %g \n", err);
+
+ free_1d_double(w);
+ free_1d_int(ip);
+ free_3d_double(a);
+ return 0;
+}
+
+
+void putdata3d(int n1, int n2, int n3, double ***a)
+{
+ int j1, j2, j3, seed = 0;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ a[j1][j2][j3] = RND(&seed);
+ }
+ }
+ }
+}
+
+
+double errorcheck3d(int n1, int n2, int n3, double scale, double ***a)
+{
+ int j1, j2, j3, seed = 0;
+ double err = 0, e;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ for (j3 = 0; j3 <= n3 - 1; j3++) {
+ e = RND(&seed) - a[j1][j2][j3] * scale;
+ err = MAX(err, fabs(e));
+ }
+ }
+ }
+ return err;
+}
+
diff --git a/src/fft2d/fft2d/sample2d/fftsg3dt.f b/src/fft2d/fft2d/sample2d/fftsg3dt.f
new file mode 100644
index 0000000..bbada0d
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/fftsg3dt.f
@@ -0,0 +1,119 @@
+! test of fftsg3d.f
+!
+ program main
+ integer nmax, nmaxsqrt
+ parameter (nmax = 128)
+ parameter (nmaxsqrt = 16)
+ integer ip(0 : nmaxsqrt + 1), n1, n2, n3, i, j
+ real*8 a(0 : nmax - 1, 0 : nmax - 1, 0 : nmax - 1),
+ & t(0 : 8 * nmax - 1),
+ & w(0 : nmax * 3 / 2 - 1), err, errorcheck3d
+!
+ write (*, *) 'data length n1=? (n1 = power of 2) '
+ read (*, *) n1
+ write (*, *) 'data length n2=? (n2 = power of 2) '
+ read (*, *) n2
+ write (*, *) 'data length n3=? (n3 = power of 2) '
+ read (*, *) n3
+ ip(0) = 0
+!
+! check of CDFT
+ call putdata3d(nmax, nmax, n1, n2, n3, a)
+ call cdft3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+ call cdft3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+ err = errorcheck3d(nmax, nmax, n1, n2, n3,
+ & 2.0d0 / n1 / n2 / n3, a)
+ write (*, *) 'cdft3d err= ', err
+!
+! check of RDFT
+ call putdata3d(nmax, nmax, n1, n2, n3, a)
+ call rdft3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+ call rdft3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+ err = errorcheck3d(nmax, nmax, n1, n2, n3,
+ & 2.0d0 / n1 / n2 / n3, a)
+ write (*, *) 'rdft3d err= ', err
+!
+! check of DDCT
+ call putdata3d(nmax, nmax, n1, n2, n3, a)
+ call ddct3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+ call ddct3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+ do j = 0, n2 - 1
+ do i = 0, n1 - 1
+ a(i, j, 0) = a(i, j, 0) * 0.5d0
+ end do
+ end do
+ do j = 0, n3 - 1
+ do i = 0, n1 - 1
+ a(i, 0, j) = a(i, 0, j) * 0.5d0
+ end do
+ do i = 0, n2 - 1
+ a(0, i, j) = a(0, i, j) * 0.5d0
+ end do
+ end do
+ err = errorcheck3d(nmax, nmax, n1, n2, n3,
+ & 8.0d0 / n1 / n2 / n3, a)
+ write (*, *) 'ddct3d err= ', err
+!
+! check of DDST
+ call putdata3d(nmax, nmax, n1, n2, n3, a)
+ call ddst3d(nmax, nmax, n1, n2, n3, 1, a, t, ip, w)
+ call ddst3d(nmax, nmax, n1, n2, n3, -1, a, t, ip, w)
+ do j = 0, n2 - 1
+ do i = 0, n1 - 1
+ a(i, j, 0) = a(i, j, 0) * 0.5d0
+ end do
+ end do
+ do j = 0, n3 - 1
+ do i = 0, n1 - 1
+ a(i, 0, j) = a(i, 0, j) * 0.5d0
+ end do
+ do i = 0, n2 - 1
+ a(0, i, j) = a(0, i, j) * 0.5d0
+ end do
+ end do
+ err = errorcheck3d(nmax, nmax, n1, n2, n3,
+ & 8.0d0 / n1 / n2 / n3, a)
+ write (*, *) 'ddst3d err= ', err
+!
+ end
+!
+!
+ subroutine putdata3d(n1max, n2max, n1, n2, n3, a)
+ integer n1max, n2max, n1, n2, n3, j1, j2, j3, seed
+ real*8 a(0 : n1max - 1, 0 : n2max - 1, 0 : *), drnd
+ seed = 0
+ do j3 = 0, n3 - 1
+ do j2 = 0, n2 - 1
+ do j1 = 0, n1 - 1
+ a(j1, j2, j3) = drnd(seed)
+ end do
+ end do
+ end do
+ end
+!
+!
+ function errorcheck3d(n1max, n2max, n1, n2, n3, scale, a)
+ integer n1max, n2max, n1, n2, n3, j1, j2, j3, seed
+ real*8 scale, a(0 : n1max - 1, 0 : n2max - 1, 0 : *),
+ & drnd, err, e, errorcheck3d
+ err = 0
+ seed = 0
+ do j3 = 0, n3 - 1
+ do j2 = 0, n2 - 1
+ do j1 = 0, n1 - 1
+ e = drnd(seed) - a(j1, j2, j3) * scale
+ err = max(err, abs(e))
+ end do
+ end do
+ end do
+ errorcheck3d = err
+ end
+!
+!
+! random number generator, 0 <= drnd < 1
+ real*8 function drnd(seed)
+ integer seed
+ seed = mod(seed * 7141 + 54773, 259200)
+ drnd = seed * (1.0d0 / 259200)
+ end
+!
diff --git a/src/fft2d/fft2d/sample2d/shrtdctt.c b/src/fft2d/fft2d/sample2d/shrtdctt.c
new file mode 100644
index 0000000..70d665f
--- /dev/null
+++ b/src/fft2d/fft2d/sample2d/shrtdctt.c
@@ -0,0 +1,68 @@
+/* test of shrtdct.c */
+
+#include <math.h>
+#include <stdio.h>
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+/* random number generator, 0 <= RND < 1 */
+#define RND(p) ((*(p) = (*(p) * 7141 + 54773) % 259200) * (1.0 / 259200))
+
+#define NMAX 16
+
+int main()
+{
+ void ddct8x8s(int isgn, double **a);
+ void ddct16x16s(int isgn, double **a);
+ void putdata2d(int n1, int n2, double **a);
+ double errorcheck2d(int n1, int n2, double scale, double **a);
+ double err;
+
+ int i;
+ double aarr[NMAX][NMAX], *a[NMAX], barr[NMAX][NMAX], *b[NMAX];
+ for (i = 0; i < NMAX; i++) a[i] = aarr[i];
+ for (i = 0; i < NMAX; i++) b[i] = barr[i];
+
+ /* check of 8x8 DCT */
+ putdata2d(8, 8, a);
+ ddct8x8s(-1, a);
+ ddct8x8s(1, a);
+ err = errorcheck2d(8, 8, 1.0, a);
+ printf("ddct8x8s err= %g\n", err);
+
+ /* check of 16x16 DCT */
+ putdata2d(16, 16, a);
+ ddct16x16s(-1, a);
+ ddct16x16s(1, a);
+ err = errorcheck2d(16, 16, 1.0, a);
+ printf("ddct16x16s err= %g\n", err);
+
+ return 0;
+}
+
+
+void putdata2d(int n1, int n2, double **a)
+{
+ int j1, j2, seed = 0;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ a[j1][j2] = RND(&seed);
+ }
+ }
+}
+
+
+double errorcheck2d(int n1, int n2, double scale, double **a)
+{
+ int j1, j2, seed = 0;
+ double err = 0, e;
+
+ for (j1 = 0; j1 <= n1 - 1; j1++) {
+ for (j2 = 0; j2 <= n2 - 1; j2++) {
+ e = RND(&seed) - a[j1][j2] * scale;
+ err = MAX(err, fabs(e));
+ }
+ }
+ return err;
+}
+
diff --git a/src/fft2d/fft2d/shrtdct.c b/src/fft2d/fft2d/shrtdct.c
new file mode 100644
index 0000000..455cb4c
--- /dev/null
+++ b/src/fft2d/fft2d/shrtdct.c
@@ -0,0 +1,538 @@
+/*
+Short Discrete Cosine Transform
+ data length :8x8, 16x16
+ method :row-column, radix 4 FFT
+functions
+ ddct8x8s : 8x8 DCT
+ ddct16x16s: 16x16 DCT
+function prototypes
+ void ddct8x8s(int isgn, double **a);
+ void ddct16x16s(int isgn, double **a);
+*/
+
+
+/*
+-------- 8x8 DCT (Discrete Cosine Transform) / Inverse of DCT --------
+ [definition]
+ <case1> Normalized 8x8 IDCT
+ C[k1][k2] = (1/4) * sum_j1=0^7 sum_j2=0^7
+ a[j1][j2] * s[j1] * s[j2] *
+ cos(pi*j1*(k1+1/2)/8) *
+ cos(pi*j2*(k2+1/2)/8), 0<=k1<8, 0<=k2<8
+ (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+ <case2> Normalized 8x8 DCT
+ C[k1][k2] = (1/4) * s[k1] * s[k2] * sum_j1=0^7 sum_j2=0^7
+ a[j1][j2] *
+ cos(pi*(j1+1/2)*k1/8) *
+ cos(pi*(j2+1/2)*k2/8), 0<=k1<8, 0<=k2<8
+ (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+ [usage]
+ <case1>
+ ddct8x8s(1, a);
+ <case2>
+ ddct8x8s(-1, a);
+ [parameters]
+ a[0...7][0...7] :input/output data (double **)
+ output data
+ a[k1][k2] = C[k1][k2], 0<=k1<8, 0<=k2<8
+*/
+
+
+/* Cn_kR = sqrt(2.0/n) * cos(pi/2*k/n) */
+/* Cn_kI = sqrt(2.0/n) * sin(pi/2*k/n) */
+/* Wn_kR = cos(pi/2*k/n) */
+/* Wn_kI = sin(pi/2*k/n) */
+#define C8_1R 0.49039264020161522456
+#define C8_1I 0.09754516100806413392
+#define C8_2R 0.46193976625564337806
+#define C8_2I 0.19134171618254488586
+#define C8_3R 0.41573480615127261854
+#define C8_3I 0.27778511650980111237
+#define C8_4R 0.35355339059327376220
+#define W8_4R 0.70710678118654752440
+
+
+void ddct8x8s(int isgn, double **a)
+{
+ int j;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+ double xr, xi;
+
+ if (isgn < 0) {
+ for (j = 0; j <= 7; j++) {
+ x0r = a[0][j] + a[7][j];
+ x1r = a[0][j] - a[7][j];
+ x0i = a[2][j] + a[5][j];
+ x1i = a[2][j] - a[5][j];
+ x2r = a[4][j] + a[3][j];
+ x3r = a[4][j] - a[3][j];
+ x2i = a[6][j] + a[1][j];
+ x3i = a[6][j] - a[1][j];
+ xr = x0r + x2r;
+ xi = x0i + x2i;
+ a[0][j] = C8_4R * (xr + xi);
+ a[4][j] = C8_4R * (xr - xi);
+ xr = x0r - x2r;
+ xi = x0i - x2i;
+ a[2][j] = C8_2R * xr - C8_2I * xi;
+ a[6][j] = C8_2R * xi + C8_2I * xr;
+ xr = W8_4R * (x1i - x3i);
+ x1i = W8_4R * (x1i + x3i);
+ x3i = x1i - x3r;
+ x1i += x3r;
+ x3r = x1r - xr;
+ x1r += xr;
+ a[1][j] = C8_1R * x1r - C8_1I * x1i;
+ a[7][j] = C8_1R * x1i + C8_1I * x1r;
+ a[3][j] = C8_3R * x3r - C8_3I * x3i;
+ a[5][j] = C8_3R * x3i + C8_3I * x3r;
+ }
+ for (j = 0; j <= 7; j++) {
+ x0r = a[j][0] + a[j][7];
+ x1r = a[j][0] - a[j][7];
+ x0i = a[j][2] + a[j][5];
+ x1i = a[j][2] - a[j][5];
+ x2r = a[j][4] + a[j][3];
+ x3r = a[j][4] - a[j][3];
+ x2i = a[j][6] + a[j][1];
+ x3i = a[j][6] - a[j][1];
+ xr = x0r + x2r;
+ xi = x0i + x2i;
+ a[j][0] = C8_4R * (xr + xi);
+ a[j][4] = C8_4R * (xr - xi);
+ xr = x0r - x2r;
+ xi = x0i - x2i;
+ a[j][2] = C8_2R * xr - C8_2I * xi;
+ a[j][6] = C8_2R * xi + C8_2I * xr;
+ xr = W8_4R * (x1i - x3i);
+ x1i = W8_4R * (x1i + x3i);
+ x3i = x1i - x3r;
+ x1i += x3r;
+ x3r = x1r - xr;
+ x1r += xr;
+ a[j][1] = C8_1R * x1r - C8_1I * x1i;
+ a[j][7] = C8_1R * x1i + C8_1I * x1r;
+ a[j][3] = C8_3R * x3r - C8_3I * x3i;
+ a[j][5] = C8_3R * x3i + C8_3I * x3r;
+ }
+ } else {
+ for (j = 0; j <= 7; j++) {
+ x1r = C8_1R * a[1][j] + C8_1I * a[7][j];
+ x1i = C8_1R * a[7][j] - C8_1I * a[1][j];
+ x3r = C8_3R * a[3][j] + C8_3I * a[5][j];
+ x3i = C8_3R * a[5][j] - C8_3I * a[3][j];
+ xr = x1r - x3r;
+ xi = x1i + x3i;
+ x1r += x3r;
+ x3i -= x1i;
+ x1i = W8_4R * (xr + xi);
+ x3r = W8_4R * (xr - xi);
+ xr = C8_2R * a[2][j] + C8_2I * a[6][j];
+ xi = C8_2R * a[6][j] - C8_2I * a[2][j];
+ x0r = C8_4R * (a[0][j] + a[4][j]);
+ x0i = C8_4R * (a[0][j] - a[4][j]);
+ x2r = x0r - xr;
+ x2i = x0i - xi;
+ x0r += xr;
+ x0i += xi;
+ a[0][j] = x0r + x1r;
+ a[7][j] = x0r - x1r;
+ a[2][j] = x0i + x1i;
+ a[5][j] = x0i - x1i;
+ a[4][j] = x2r - x3i;
+ a[3][j] = x2r + x3i;
+ a[6][j] = x2i - x3r;
+ a[1][j] = x2i + x3r;
+ }
+ for (j = 0; j <= 7; j++) {
+ x1r = C8_1R * a[j][1] + C8_1I * a[j][7];
+ x1i = C8_1R * a[j][7] - C8_1I * a[j][1];
+ x3r = C8_3R * a[j][3] + C8_3I * a[j][5];
+ x3i = C8_3R * a[j][5] - C8_3I * a[j][3];
+ xr = x1r - x3r;
+ xi = x1i + x3i;
+ x1r += x3r;
+ x3i -= x1i;
+ x1i = W8_4R * (xr + xi);
+ x3r = W8_4R * (xr - xi);
+ xr = C8_2R * a[j][2] + C8_2I * a[j][6];
+ xi = C8_2R * a[j][6] - C8_2I * a[j][2];
+ x0r = C8_4R * (a[j][0] + a[j][4]);
+ x0i = C8_4R * (a[j][0] - a[j][4]);
+ x2r = x0r - xr;
+ x2i = x0i - xi;
+ x0r += xr;
+ x0i += xi;
+ a[j][0] = x0r + x1r;
+ a[j][7] = x0r - x1r;
+ a[j][2] = x0i + x1i;
+ a[j][5] = x0i - x1i;
+ a[j][4] = x2r - x3i;
+ a[j][3] = x2r + x3i;
+ a[j][6] = x2i - x3r;
+ a[j][1] = x2i + x3r;
+ }
+ }
+}
+
+
+
+/*
+-------- 16x16 DCT (Discrete Cosine Transform) / Inverse of DCT --------
+ [definition]
+ <case1> Normalized 16x16 IDCT
+ C[k1][k2] = (1/8) * sum_j1=0^15 sum_j2=0^15
+ a[j1][j2] * s[j1] * s[j2] *
+ cos(pi*j1*(k1+1/2)/16) *
+ cos(pi*j2*(k2+1/2)/16), 0<=k1<16, 0<=k2<16
+ (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+ <case2> Normalized 16x16 DCT
+ C[k1][k2] = (1/8) * s[k1] * s[k2] * sum_j1=0^15 sum_j2=0^15
+ a[j1][j2] *
+ cos(pi*(j1+1/2)*k1/16) *
+ cos(pi*(j2+1/2)*k2/16), 0<=k1<16, 0<=k2<16
+ (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
+ [usage]
+ <case1>
+ ddct16x16s(1, a);
+ <case2>
+ ddct16x16s(-1, a);
+ [parameters]
+ a[0...15][0...15] :input/output data (double **)
+ output data
+ a[k1][k2] = C[k1][k2], 0<=k1<16, 0<=k2<16
+*/
+
+
+/* Cn_kR = sqrt(2.0/n) * cos(pi/2*k/n) */
+/* Cn_kI = sqrt(2.0/n) * sin(pi/2*k/n) */
+/* Wn_kR = cos(pi/2*k/n) */
+/* Wn_kI = sin(pi/2*k/n) */
+#define C16_1R 0.35185093438159561476
+#define C16_1I 0.03465429229977286565
+#define C16_2R 0.34675996133053686546
+#define C16_2I 0.06897484482073575308
+#define C16_3R 0.33832950029358816957
+#define C16_3I 0.10263113188058934529
+#define C16_4R 0.32664074121909413196
+#define C16_4I 0.13529902503654924610
+#define C16_5R 0.31180625324666780814
+#define C16_5I 0.16666391461943662432
+#define C16_6R 0.29396890060483967924
+#define C16_6I 0.19642373959677554532
+#define C16_7R 0.27330046675043937206
+#define C16_7I 0.22429189658565907106
+#define C16_8R 0.25
+#define W16_4R 0.92387953251128675613
+#define W16_4I 0.38268343236508977173
+#define W16_8R 0.70710678118654752440
+
+
+void ddct16x16s(int isgn, double **a)
+{
+ int j;
+ double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+ double x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i;
+ double xr, xi;
+
+ if (isgn < 0) {
+ for (j = 0; j <= 15; j++) {
+ x4r = a[0][j] - a[15][j];
+ xr = a[0][j] + a[15][j];
+ x4i = a[8][j] - a[7][j];
+ xi = a[8][j] + a[7][j];
+ x0r = xr + xi;
+ x0i = xr - xi;
+ x5r = a[2][j] - a[13][j];
+ xr = a[2][j] + a[13][j];
+ x5i = a[10][j] - a[5][j];
+ xi = a[10][j] + a[5][j];
+ x1r = xr + xi;
+ x1i = xr - xi;
+ x6r = a[4][j] - a[11][j];
+ xr = a[4][j] + a[11][j];
+ x6i = a[12][j] - a[3][j];
+ xi = a[12][j] + a[3][j];
+ x2r = xr + xi;
+ x2i = xr - xi;
+ x7r = a[6][j] - a[9][j];
+ xr = a[6][j] + a[9][j];
+ x7i = a[14][j] - a[1][j];
+ xi = a[14][j] + a[1][j];
+ x3r = xr + xi;
+ x3i = xr - xi;
+ xr = x0r + x2r;
+ xi = x1r + x3r;
+ a[0][j] = C16_8R * (xr + xi);
+ a[8][j] = C16_8R * (xr - xi);
+ xr = x0r - x2r;
+ xi = x1r - x3r;
+ a[4][j] = C16_4R * xr - C16_4I * xi;
+ a[12][j] = C16_4R * xi + C16_4I * xr;
+ x0r = W16_8R * (x1i - x3i);
+ x2r = W16_8R * (x1i + x3i);
+ xr = x0i + x0r;
+ xi = x2r + x2i;
+ a[2][j] = C16_2R * xr - C16_2I * xi;
+ a[14][j] = C16_2R * xi + C16_2I * xr;
+ xr = x0i - x0r;
+ xi = x2r - x2i;
+ a[6][j] = C16_6R * xr - C16_6I * xi;
+ a[10][j] = C16_6R * xi + C16_6I * xr;
+ xr = W16_8R * (x6r - x6i);
+ xi = W16_8R * (x6i + x6r);
+ x6r = x4r - xr;
+ x6i = x4i - xi;
+ x4r += xr;
+ x4i += xi;
+ xr = W16_4I * x7r - W16_4R * x7i;
+ xi = W16_4I * x7i + W16_4R * x7r;
+ x7r = W16_4R * x5r - W16_4I * x5i;
+ x7i = W16_4R * x5i + W16_4I * x5r;
+ x5r = x7r + xr;
+ x5i = x7i + xi;
+ x7r -= xr;
+ x7i -= xi;
+ xr = x4r + x5r;
+ xi = x5i + x4i;
+ a[1][j] = C16_1R * xr - C16_1I * xi;
+ a[15][j] = C16_1R * xi + C16_1I * xr;
+ xr = x4r - x5r;
+ xi = x5i - x4i;
+ a[7][j] = C16_7R * xr - C16_7I * xi;
+ a[9][j] = C16_7R * xi + C16_7I * xr;
+ xr = x6r - x7i;
+ xi = x7r + x6i;
+ a[5][j] = C16_5R * xr - C16_5I * xi;
+ a[11][j] = C16_5R * xi + C16_5I * xr;
+ xr = x6r + x7i;
+ xi = x7r - x6i;
+ a[3][j] = C16_3R * xr - C16_3I * xi;
+ a[13][j] = C16_3R * xi + C16_3I * xr;
+ }
+ for (j = 0; j <= 15; j++) {
+ x4r = a[j][0] - a[j][15];
+ xr = a[j][0] + a[j][15];
+ x4i = a[j][8] - a[j][7];
+ xi = a[j][8] + a[j][7];
+ x0r = xr + xi;
+ x0i = xr - xi;
+ x5r = a[j][2] - a[j][13];
+ xr = a[j][2] + a[j][13];
+ x5i = a[j][10] - a[j][5];
+ xi = a[j][10] + a[j][5];
+ x1r = xr + xi;
+ x1i = xr - xi;
+ x6r = a[j][4] - a[j][11];
+ xr = a[j][4] + a[j][11];
+ x6i = a[j][12] - a[j][3];
+ xi = a[j][12] + a[j][3];
+ x2r = xr + xi;
+ x2i = xr - xi;
+ x7r = a[j][6] - a[j][9];
+ xr = a[j][6] + a[j][9];
+ x7i = a[j][14] - a[j][1];
+ xi = a[j][14] + a[j][1];
+ x3r = xr + xi;
+ x3i = xr - xi;
+ xr = x0r + x2r;
+ xi = x1r + x3r;
+ a[j][0] = C16_8R * (xr + xi);
+ a[j][8] = C16_8R * (xr - xi);
+ xr = x0r - x2r;
+ xi = x1r - x3r;
+ a[j][4] = C16_4R * xr - C16_4I * xi;
+ a[j][12] = C16_4R * xi + C16_4I * xr;
+ x0r = W16_8R * (x1i - x3i);
+ x2r = W16_8R * (x1i + x3i);
+ xr = x0i + x0r;
+ xi = x2r + x2i;
+ a[j][2] = C16_2R * xr - C16_2I * xi;
+ a[j][14] = C16_2R * xi + C16_2I * xr;
+ xr = x0i - x0r;
+ xi = x2r - x2i;
+ a[j][6] = C16_6R * xr - C16_6I * xi;
+ a[j][10] = C16_6R * xi + C16_6I * xr;
+ xr = W16_8R * (x6r - x6i);
+ xi = W16_8R * (x6i + x6r);
+ x6r = x4r - xr;
+ x6i = x4i - xi;
+ x4r += xr;
+ x4i += xi;
+ xr = W16_4I * x7r - W16_4R * x7i;
+ xi = W16_4I * x7i + W16_4R * x7r;
+ x7r = W16_4R * x5r - W16_4I * x5i;
+ x7i = W16_4R * x5i + W16_4I * x5r;
+ x5r = x7r + xr;
+ x5i = x7i + xi;
+ x7r -= xr;
+ x7i -= xi;
+ xr = x4r + x5r;
+ xi = x5i + x4i;
+ a[j][1] = C16_1R * xr - C16_1I * xi;
+ a[j][15] = C16_1R * xi + C16_1I * xr;
+ xr = x4r - x5r;
+ xi = x5i - x4i;
+ a[j][7] = C16_7R * xr - C16_7I * xi;
+ a[j][9] = C16_7R * xi + C16_7I * xr;
+ xr = x6r - x7i;
+ xi = x7r + x6i;
+ a[j][5] = C16_5R * xr - C16_5I * xi;
+ a[j][11] = C16_5R * xi + C16_5I * xr;
+ xr = x6r + x7i;
+ xi = x7r - x6i;
+ a[j][3] = C16_3R * xr - C16_3I * xi;
+ a[j][13] = C16_3R * xi + C16_3I * xr;
+ }
+ } else {
+ for (j = 0; j <= 15; j++) {
+ x5r = C16_1R * a[1][j] + C16_1I * a[15][j];
+ x5i = C16_1R * a[15][j] - C16_1I * a[1][j];
+ xr = C16_7R * a[7][j] + C16_7I * a[9][j];
+ xi = C16_7R * a[9][j] - C16_7I * a[7][j];
+ x4r = x5r + xr;
+ x4i = x5i - xi;
+ x5r -= xr;
+ x5i += xi;
+ x7r = C16_5R * a[5][j] + C16_5I * a[11][j];
+ x7i = C16_5R * a[11][j] - C16_5I * a[5][j];
+ xr = C16_3R * a[3][j] + C16_3I * a[13][j];
+ xi = C16_3R * a[13][j] - C16_3I * a[3][j];
+ x6r = x7r + xr;
+ x6i = x7i - xi;
+ x7r -= xr;
+ x7i += xi;
+ xr = x4r - x6r;
+ xi = x4i - x6i;
+ x4r += x6r;
+ x4i += x6i;
+ x6r = W16_8R * (xi + xr);
+ x6i = W16_8R * (xi - xr);
+ xr = x5r + x7i;
+ xi = x5i - x7r;
+ x5r -= x7i;
+ x5i += x7r;
+ x7r = W16_4I * x5r + W16_4R * x5i;
+ x7i = W16_4I * x5i - W16_4R * x5r;
+ x5r = W16_4R * xr + W16_4I * xi;
+ x5i = W16_4R * xi - W16_4I * xr;
+ xr = C16_4R * a[4][j] + C16_4I * a[12][j];
+ xi = C16_4R * a[12][j] - C16_4I * a[4][j];
+ x2r = C16_8R * (a[0][j] + a[8][j]);
+ x3r = C16_8R * (a[0][j] - a[8][j]);
+ x0r = x2r + xr;
+ x1r = x3r + xi;
+ x2r -= xr;
+ x3r -= xi;
+ x0i = C16_2R * a[2][j] + C16_2I * a[14][j];
+ x2i = C16_2R * a[14][j] - C16_2I * a[2][j];
+ x1i = C16_6R * a[6][j] + C16_6I * a[10][j];
+ x3i = C16_6R * a[10][j] - C16_6I * a[6][j];
+ xr = x0i - x1i;
+ xi = x2i + x3i;
+ x0i += x1i;
+ x2i -= x3i;
+ x1i = W16_8R * (xi + xr);
+ x3i = W16_8R * (xi - xr);
+ xr = x0r + x0i;
+ xi = x0r - x0i;
+ a[0][j] = xr + x4r;
+ a[15][j] = xr - x4r;
+ a[8][j] = xi + x4i;
+ a[7][j] = xi - x4i;
+ xr = x1r + x1i;
+ xi = x1r - x1i;
+ a[2][j] = xr + x5r;
+ a[13][j] = xr - x5r;
+ a[10][j] = xi + x5i;
+ a[5][j] = xi - x5i;
+ xr = x2r + x2i;
+ xi = x2r - x2i;
+ a[4][j] = xr + x6r;
+ a[11][j] = xr - x6r;
+ a[12][j] = xi + x6i;
+ a[3][j] = xi - x6i;
+ xr = x3r + x3i;
+ xi = x3r - x3i;
+ a[6][j] = xr + x7r;
+ a[9][j] = xr - x7r;
+ a[14][j] = xi + x7i;
+ a[1][j] = xi - x7i;
+ }
+ for (j = 0; j <= 15; j++) {
+ x5r = C16_1R * a[j][1] + C16_1I * a[j][15];
+ x5i = C16_1R * a[j][15] - C16_1I * a[j][1];
+ xr = C16_7R * a[j][7] + C16_7I * a[j][9];
+ xi = C16_7R * a[j][9] - C16_7I * a[j][7];
+ x4r = x5r + xr;
+ x4i = x5i - xi;
+ x5r -= xr;
+ x5i += xi;
+ x7r = C16_5R * a[j][5] + C16_5I * a[j][11];
+ x7i = C16_5R * a[j][11] - C16_5I * a[j][5];
+ xr = C16_3R * a[j][3] + C16_3I * a[j][13];
+ xi = C16_3R * a[j][13] - C16_3I * a[j][3];
+ x6r = x7r + xr;
+ x6i = x7i - xi;
+ x7r -= xr;
+ x7i += xi;
+ xr = x4r - x6r;
+ xi = x4i - x6i;
+ x4r += x6r;
+ x4i += x6i;
+ x6r = W16_8R * (xi + xr);
+ x6i = W16_8R * (xi - xr);
+ xr = x5r + x7i;
+ xi = x5i - x7r;
+ x5r -= x7i;
+ x5i += x7r;
+ x7r = W16_4I * x5r + W16_4R * x5i;
+ x7i = W16_4I * x5i - W16_4R * x5r;
+ x5r = W16_4R * xr + W16_4I * xi;
+ x5i = W16_4R * xi - W16_4I * xr;
+ xr = C16_4R * a[j][4] + C16_4I * a[j][12];
+ xi = C16_4R * a[j][12] - C16_4I * a[j][4];
+ x2r = C16_8R * (a[j][0] + a[j][8]);
+ x3r = C16_8R * (a[j][0] - a[j][8]);
+ x0r = x2r + xr;
+ x1r = x3r + xi;
+ x2r -= xr;
+ x3r -= xi;
+ x0i = C16_2R * a[j][2] + C16_2I * a[j][14];
+ x2i = C16_2R * a[j][14] - C16_2I * a[j][2];
+ x1i = C16_6R * a[j][6] + C16_6I * a[j][10];
+ x3i = C16_6R * a[j][10] - C16_6I * a[j][6];
+ xr = x0i - x1i;
+ xi = x2i + x3i;
+ x0i += x1i;
+ x2i -= x3i;
+ x1i = W16_8R * (xi + xr);
+ x3i = W16_8R * (xi - xr);
+ xr = x0r + x0i;
+ xi = x0r - x0i;
+ a[j][0] = xr + x4r;
+ a[j][15] = xr - x4r;
+ a[j][8] = xi + x4i;
+ a[j][7] = xi - x4i;
+ xr = x1r + x1i;
+ xi = x1r - x1i;
+ a[j][2] = xr + x5r;
+ a[j][13] = xr - x5r;
+ a[j][10] = xi + x5i;
+ a[j][5] = xi - x5i;
+ xr = x2r + x2i;
+ xi = x2r - x2i;
+ a[j][4] = xr + x6r;
+ a[j][11] = xr - x6r;
+ a[j][12] = xi + x6i;
+ a[j][3] = xi - x6i;
+ xr = x3r + x3i;
+ xi = x3r - x3i;
+ a[j][6] = xr + x7r;
+ a[j][9] = xr - x7r;
+ a[j][14] = xi + x7i;
+ a[j][1] = xi - x7i;
+ }
+ }
+}
+