Jamoma API  0.6.0.a19
fftsg2d.cpp
1 /*
2 Fast Fourier/Cosine/Sine Transform
3  dimension :two
4  data length :power of 2
5  decimation :frequency
7  data :inplace
8  table :use
9 functions
10  cdft2d: Complex Discrete Fourier Transform
11  rdft2d: Real Discrete Fourier Transform
12  ddct2d: Discrete Cosine Transform
13  ddst2d: Discrete Sine Transform
14 function prototypes
15  void cdft2d(int, int, int, double **, double *, int *, double *);
16  void rdft2d(int, int, int, double **, double *, int *, double *);
17  void rdft2dsort(int, int, int, double **);
18  void ddct2d(int, int, int, double **, double *, int *, double *);
19  void ddst2d(int, int, int, double **, double *, int *, double *);
20 necessary package
21  fftsg.c : 1D-FFT package
22 macro definitions
24  FFT2D_MAX_THREADS : must be 2^N, default=4
27  FFT2D_MAX_THREADS : must be 2^N, default=4
29
30
31 -------- Complex DFT (Discrete Fourier Transform) --------
32  [definition]
33  <case1>
34  X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
35  exp(2*pi*i*j1*k1/n1) *
36  exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
37  <case2>
38  X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
39  exp(-2*pi*i*j1*k1/n1) *
40  exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
41  (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
42  [usage]
43  <case1>
44  ip[0] = 0; // first time only
45  cdft2d(n1, 2*n2, 1, a, t, ip, w);
46  <case2>
47  ip[0] = 0; // first time only
48  cdft2d(n1, 2*n2, -1, a, t, ip, w);
49  [parameters]
50  n1 :data length (int)
51  n1 >= 1, n1 = power of 2
52  2*n2 :data length (int)
53  n2 >= 1, n2 = power of 2
54  a[0...n1-1][0...2*n2-1]
55  :input/output data (double **)
56  input data
57  a[j1][2*j2] = Re(x[j1][j2]),
58  a[j1][2*j2+1] = Im(x[j1][j2]),
59  0<=j1<n1, 0<=j2<n2
60  output data
61  a[k1][2*k2] = Re(X[k1][k2]),
62  a[k1][2*k2+1] = Im(X[k1][k2]),
63  0<=k1<n1, 0<=k2<n2
64  t[0...*]
65  :work area (double *)
66  length of t >= 8*n1, if single thread,
68  t is dynamically allocated, if t == NULL.
69  ip[0...*]
70  :work area for bit reversal (int *)
71  length of ip >= 2+sqrt(n)
72  (n = max(n1, n2))
73  ip[0],ip[1] are pointers of the cos/sin table.
74  w[0...*]
75  :cos/sin table (double *)
76  length of w >= max(n1/2, n2/2)
77  w[],ip[] are initialized if ip[0] == 0.
78  [remark]
79  Inverse of
80  cdft2d(n1, 2*n2, -1, a, t, ip, w);
81  is
82  cdft2d(n1, 2*n2, 1, a, t, ip, w);
83  for (j1 = 0; j1 <= n1 - 1; j1++) {
84  for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
85  a[j1][j2] *= 1.0 / n1 / n2;
86  }
87  }
88  .
89
90
91 -------- Real DFT / Inverse of Real DFT --------
92  [definition]
93  <case1> RDFT
94  R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
95  cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
96  0<=k1<n1, 0<=k2<n2
97  I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
98  sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
99  0<=k1<n1, 0<=k2<n2
100  <case2> IRDFT (excluding scale)
101  a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
102  (R[j1][j2] *
103  cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) +
104  I[j1][j2] *
105  sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)),
106  0<=k1<n1, 0<=k2<n2
107  (notes: R[n1-k1][n2-k2] = R[k1][k2],
108  I[n1-k1][n2-k2] = -I[k1][k2],
109  R[n1-k1][0] = R[k1][0],
110  I[n1-k1][0] = -I[k1][0],
111  R[0][n2-k2] = R[0][k2],
112  I[0][n2-k2] = -I[0][k2],
113  0<k1<n1, 0<k2<n2)
114  [usage]
115  <case1>
116  ip[0] = 0; // first time only
117  rdft2d(n1, n2, 1, a, t, ip, w);
118  <case2>
119  ip[0] = 0; // first time only
120  rdft2d(n1, n2, -1, a, t, ip, w);
121  [parameters]
122  n1 :data length (int)
123  n1 >= 2, n1 = power of 2
124  n2 :data length (int)
125  n2 >= 2, n2 = power of 2
126  a[0...n1-1][0...n2-1]
127  :input/output data (double **)
128  <case1>
129  output data
130  a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2],
131  a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2],
132  0<k1<n1, 0<k2<n2/2,
133  a[0][2*k2] = R[0][k2] = R[0][n2-k2],
134  a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2],
135  0<k2<n2/2,
136  a[k1][0] = R[k1][0] = R[n1-k1][0],
137  a[k1][1] = I[k1][0] = -I[n1-k1][0],
138  a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2],
139  a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2],
140  0<k1<n1/2,
141  a[0][0] = R[0][0],
142  a[0][1] = R[0][n2/2],
143  a[n1/2][0] = R[n1/2][0],
144  a[n1/2][1] = R[n1/2][n2/2]
145  <case2>
146  input data
147  a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2],
148  a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2],
149  0<j1<n1, 0<j2<n2/2,
150  a[0][2*j2] = R[0][j2] = R[0][n2-j2],
151  a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2],
152  0<j2<n2/2,
153  a[j1][0] = R[j1][0] = R[n1-j1][0],
154  a[j1][1] = I[j1][0] = -I[n1-j1][0],
155  a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2],
156  a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2],
157  0<j1<n1/2,
158  a[0][0] = R[0][0],
159  a[0][1] = R[0][n2/2],
160  a[n1/2][0] = R[n1/2][0],
161  a[n1/2][1] = R[n1/2][n2/2]
162  ---- output ordering ----
163  rdft2d(n1, n2, 1, a, t, ip, w);
164  rdft2dsort(n1, n2, 1, a);
165  // stored data is a[0...n1-1][0...n2+1]:
166  // a[k1][2*k2] = R[k1][k2],
167  // a[k1][2*k2+1] = I[k1][k2],
168  // 0<=k1<n1, 0<=k2<=n2/2.
169  // the stored data is larger than the input data!
170  ---- input ordering ----
171  rdft2dsort(n1, n2, -1, a);
172  rdft2d(n1, n2, -1, a, t, ip, w);
173  t[0...*]
174  :work area (double *)
175  length of t >= 8*n1, if single thread,
177  t is dynamically allocated, if t == NULL.
178  ip[0...*]
179  :work area for bit reversal (int *)
180  length of ip >= 2+sqrt(n)
181  (n = max(n1, n2/2))
182  ip[0],ip[1] are pointers of the cos/sin table.
183  w[0...*]
184  :cos/sin table (double *)
185  length of w >= max(n1/2, n2/4) + n2/4
186  w[],ip[] are initialized if ip[0] == 0.
187  [remark]
188  Inverse of
189  rdft2d(n1, n2, 1, a, t, ip, w);
190  is
191  rdft2d(n1, n2, -1, a, t, ip, w);
192  for (j1 = 0; j1 <= n1 - 1; j1++) {
193  for (j2 = 0; j2 <= n2 - 1; j2++) {
194  a[j1][j2] *= 2.0 / n1 / n2;
195  }
196  }
197  .
198
199
200 -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
201  [definition]
202  <case1> IDCT (excluding scale)
203  C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
204  cos(pi*j1*(k1+1/2)/n1) *
205  cos(pi*j2*(k2+1/2)/n2),
206  0<=k1<n1, 0<=k2<n2
207  <case2> DCT
208  C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
209  cos(pi*(j1+1/2)*k1/n1) *
210  cos(pi*(j2+1/2)*k2/n2),
211  0<=k1<n1, 0<=k2<n2
212  [usage]
213  <case1>
214  ip[0] = 0; // first time only
215  ddct2d(n1, n2, 1, a, t, ip, w);
216  <case2>
217  ip[0] = 0; // first time only
218  ddct2d(n1, n2, -1, a, t, ip, w);
219  [parameters]
220  n1 :data length (int)
221  n1 >= 2, n1 = power of 2
222  n2 :data length (int)
223  n2 >= 2, n2 = power of 2
224  a[0...n1-1][0...n2-1]
225  :input/output data (double **)
226  output data
227  a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
228  t[0...*]
229  :work area (double *)
230  length of t >= 4*n1, if single thread,
232  t is dynamically allocated, if t == NULL.
233  ip[0...*]
234  :work area for bit reversal (int *)
235  length of ip >= 2+sqrt(n)
236  (n = max(n1/2, n2/2))
237  ip[0],ip[1] are pointers of the cos/sin table.
238  w[0...*]
239  :cos/sin table (double *)
240  length of w >= max(n1*3/2, n2*3/2)
241  w[],ip[] are initialized if ip[0] == 0.
242  [remark]
243  Inverse of
244  ddct2d(n1, n2, -1, a, t, ip, w);
245  is
246  for (j1 = 0; j1 <= n1 - 1; j1++) {
247  a[j1][0] *= 0.5;
248  }
249  for (j2 = 0; j2 <= n2 - 1; j2++) {
250  a[0][j2] *= 0.5;
251  }
252  ddct2d(n1, n2, 1, a, t, ip, w);
253  for (j1 = 0; j1 <= n1 - 1; j1++) {
254  for (j2 = 0; j2 <= n2 - 1; j2++) {
255  a[j1][j2] *= 4.0 / n1 / n2;
256  }
257  }
258  .
259
260
261 -------- DST (Discrete Sine Transform) / Inverse of DST --------
262  [definition]
263  <case1> IDST (excluding scale)
264  S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] *
265  sin(pi*j1*(k1+1/2)/n1) *
266  sin(pi*j2*(k2+1/2)/n2),
267  0<=k1<n1, 0<=k2<n2
268  <case2> DST
269  S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
270  sin(pi*(j1+1/2)*k1/n1) *
271  sin(pi*(j2+1/2)*k2/n2),
272  0<k1<=n1, 0<k2<=n2
273  [usage]
274  <case1>
275  ip[0] = 0; // first time only
276  ddst2d(n1, n2, 1, a, t, ip, w);
277  <case2>
278  ip[0] = 0; // first time only
279  ddst2d(n1, n2, -1, a, t, ip, w);
280  [parameters]
281  n1 :data length (int)
282  n1 >= 2, n1 = power of 2
283  n2 :data length (int)
284  n2 >= 2, n2 = power of 2
285  a[0...n1-1][0...n2-1]
286  :input/output data (double **)
287  <case1>
288  input data
289  a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2,
290  a[j1][0] = A[j1][n2], 0<j1<n1,
291  a[0][j2] = A[n1][j2], 0<j2<n2,
292  a[0][0] = A[n1][n2]
293  (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
294  output data
295  a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
296  <case2>
297  output data
298  a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2,
299  a[k1][0] = S[k1][n2], 0<k1<n1,
300  a[0][k2] = S[n1][k2], 0<k2<n2,
301  a[0][0] = S[n1][n2]
302  (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
303  t[0...*]
304  :work area (double *)
305  length of t >= 4*n1, if single thread,
307  t is dynamically allocated, if t == NULL.
308  ip[0...*]
309  :work area for bit reversal (int *)
310  length of ip >= 2+sqrt(n)
311  (n = max(n1/2, n2/2))
312  ip[0],ip[1] are pointers of the cos/sin table.
313  w[0...*]
314  :cos/sin table (double *)
315  length of w >= max(n1*3/2, n2*3/2)
316  w[],ip[] are initialized if ip[0] == 0.
317  [remark]
318  Inverse of
319  ddst2d(n1, n2, -1, a, t, ip, w);
320  is
321  for (j1 = 0; j1 <= n1 - 1; j1++) {
322  a[j1][0] *= 0.5;
323  }
324  for (j2 = 0; j2 <= n2 - 1; j2++) {
325  a[0][j2] *= 0.5;
326  }
327  ddst2d(n1, n2, 1, a, t, ip, w);
328  for (j1 = 0; j1 <= n1 - 1; j1++) {
329  for (j2 = 0; j2 <= n2 - 1; j2++) {
330  a[j1][j2] *= 4.0 / n1 / n2;
331  }
332  }
333  .
334 */
335
336
337 #include <stdio.h>
338 #include <stdlib.h>
339 #define fft2d_alloc_error_check(p) { \
340  if ((p) == NULL) { \
341  fprintf(stderr, "fft2d memory allocation error\n"); \
342  exit(1); \
343  } \
344 }
345
346
351 #endif
354 #endif
358  if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
359  fprintf(stderr, "fft2d thread error\n"); \
360  exit(1); \
361  } \
362 }
364  if (pthread_join(th, NULL) != 0) { \
365  fprintf(stderr, "fft2d thread error\n"); \
366  exit(1); \
367  } \
368 }
370
371
376 #endif
379 #endif
380 #include <windows.h>
383  DWORD thid; \
384  *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
385  if (*(thp) == 0) { \
386  fprintf(stderr, "fft2d thread error\n"); \
387  exit(1); \
388  } \
389 }
391  WaitForSingleObject(th, INFINITE); \
392  CloseHandle(th); \
393 }
395
396
397 void cdft2d(int n1, int n2, int isgn, double **a, double *t,
398  int *ip, double *w)
399 {
400  void makewt(int nw, int *ip, double *w);
401  void cdft(int n, int isgn, double *a, int *ip, double *w);
402  void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t,
403  int *ip, double *w);
405  void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a,
406  int *ip, double *w);
407  void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t,
408  int *ip, double *w);
410  int n, itnull, nthread, nt, i;
411
412  n = n1 << 1;
413  if (n < n2) {
414  n = n2;
415  }
416  if (n > (ip[0] << 2)) {
417  makewt(n >> 2, ip, w);
418  }
419  itnull = 0;
420  if (t == NULL) {
421  itnull = 1;
426  nt = 8 * nthread * n1;
427  if (n2 == 4 * nthread) {
428  nt >>= 1;
429  } else if (n2 < 4 * nthread) {
430  nt >>= 2;
431  }
432  t = (double *) malloc(sizeof(double) * nt);
433  fft2d_alloc_error_check(t);
434  }
436  if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
437  xdft2d0_subth(n1, n2, 0, isgn, a, ip, w);
438  cdft2d_subth(n1, n2, isgn, a, t, ip, w);
439  } else
441  {
442  for (i = 0; i < n1; i++) {
443  cdft(n2, isgn, a[i], ip, w);
444  }
445  cdft2d_sub(n1, n2, isgn, a, t, ip, w);
446  }
447  if (itnull != 0) {
448  free(t);
449  }
450 }
451
452
453 void rdft2d(int n1, int n2, int isgn, double **a, double *t,
454  int *ip, double *w)
455 {
456  void makewt(int nw, int *ip, double *w);
457  void makect(int nc, int *ip, double *c);
458  void rdft(int n, int isgn, double *a, int *ip, double *w);
459  void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t,
460  int *ip, double *w);
461  void rdft2d_sub(int n1, int n2, int isgn, double **a);
463  void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a,
464  int *ip, double *w);
465  void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t,
466  int *ip, double *w);
468  int n, nw, nc, itnull, nthread, nt, i;
469
470  n = n1 << 1;
471  if (n < n2) {
472  n = n2;
473  }
474  nw = ip[0];
475  if (n > (nw << 2)) {
476  nw = n >> 2;
477  makewt(nw, ip, w);
478  }
479  nc = ip[1];
480  if (n2 > (nc << 2)) {
481  nc = n2 >> 2;
482  makect(nc, ip, w + nw);
483  }
484  itnull = 0;
485  if (t == NULL) {
486  itnull = 1;
491  nt = 8 * nthread * n1;
492  if (n2 == 4 * nthread) {
493  nt >>= 1;
494  } else if (n2 < 4 * nthread) {
495  nt >>= 2;
496  }
497  t = (double *) malloc(sizeof(double) * nt);
498  fft2d_alloc_error_check(t);
499  }
501  if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
502  if (isgn < 0) {
503  rdft2d_sub(n1, n2, isgn, a);
504  cdft2d_subth(n1, n2, isgn, a, t, ip, w);
505  }
506  xdft2d0_subth(n1, n2, 1, isgn, a, ip, w);
507  if (isgn >= 0) {
508  cdft2d_subth(n1, n2, isgn, a, t, ip, w);
509  rdft2d_sub(n1, n2, isgn, a);
510  }
511  } else
513  {
514  if (isgn < 0) {
515  rdft2d_sub(n1, n2, isgn, a);
516  cdft2d_sub(n1, n2, isgn, a, t, ip, w);
517  }
518  for (i = 0; i < n1; i++) {
519  rdft(n2, isgn, a[i], ip, w);
520  }
521  if (isgn >= 0) {
522  cdft2d_sub(n1, n2, isgn, a, t, ip, w);
523  rdft2d_sub(n1, n2, isgn, a);
524  }
525  }
526  if (itnull != 0) {
527  free(t);
528  }
529 }
530
531
532 void rdft2dsort(int n1, int n2, int isgn, double **a)
533 {
534  int n1h, i;
535  double x, y;
536
537  n1h = n1 >> 1;
538  if (isgn < 0) {
539  for (i = n1h + 1; i < n1; i++) {
540  a[i][0] = a[i][n2 + 1];
541  a[i][1] = a[i][n2];
542  }
543  a[0][1] = a[0][n2];
544  a[n1h][1] = a[n1h][n2];
545  } else {
546  for (i = n1h + 1; i < n1; i++) {
547  y = a[i][0];
548  x = a[i][1];
549  a[i][n2] = x;
550  a[i][n2 + 1] = y;
551  a[n1 - i][n2] = x;
552  a[n1 - i][n2 + 1] = -y;
553  a[i][0] = a[n1 - i][0];
554  a[i][1] = -a[n1 - i][1];
555  }
556  a[0][n2] = a[0][1];
557  a[0][n2 + 1] = 0;
558  a[0][1] = 0;
559  a[n1h][n2] = a[n1h][1];
560  a[n1h][n2 + 1] = 0;
561  a[n1h][1] = 0;
562  }
563 }
564
565
566 void ddct2d(int n1, int n2, int isgn, double **a, double *t,
567  int *ip, double *w)
568 {
569  void makewt(int nw, int *ip, double *w);
570  void makect(int nc, int *ip, double *c);
571  void ddct(int n, int isgn, double *a, int *ip, double *w);
572  void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a,
573  double *t, int *ip, double *w);
575  void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a,
576  int *ip, double *w);
577  void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a,
578  double *t, int *ip, double *w);
580  int n, nw, nc, itnull, nthread, nt, i;
581
582  n = n1;
583  if (n < n2) {
584  n = n2;
585  }
586  nw = ip[0];
587  if (n > (nw << 2)) {
588  nw = n >> 2;
589  makewt(nw, ip, w);
590  }
591  nc = ip[1];
592  if (n > nc) {
593  nc = n;
594  makect(nc, ip, w + nw);
595  }
596  itnull = 0;
597  if (t == NULL) {
598  itnull = 1;
603  nt = 4 * nthread * n1;
604  if (n2 == 2 * nthread) {
605  nt >>= 1;
606  } else if (n2 < 2 * nthread) {
607  nt >>= 2;
608  }
609  t = (double *) malloc(sizeof(double) * nt);
610  fft2d_alloc_error_check(t);
611  }
613  if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
614  ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w);
615  ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w);
616  } else
618  {
619  for (i = 0; i < n1; i++) {
620  ddct(n2, isgn, a[i], ip, w);
621  }
622  ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w);
623  }
624  if (itnull != 0) {
625  free(t);
626  }
627 }
628
629
630 void ddst2d(int n1, int n2, int isgn, double **a, double *t,
631  int *ip, double *w)
632 {
633  void makewt(int nw, int *ip, double *w);
634  void makect(int nc, int *ip, double *c);
635  void ddst(int n, int isgn, double *a, int *ip, double *w);
636  void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a,
637  double *t, int *ip, double *w);
639  void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a,
640  int *ip, double *w);
641  void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a,
642  double *t, int *ip, double *w);
644  int n, nw, nc, itnull, nthread, nt, i;
645
646  n = n1;
647  if (n < n2) {
648  n = n2;
649  }
650  nw = ip[0];
651  if (n > (nw << 2)) {
652  nw = n >> 2;
653  makewt(nw, ip, w);
654  }
655  nc = ip[1];
656  if (n > nc) {
657  nc = n;
658  makect(nc, ip, w + nw);
659  }
660  itnull = 0;
661  if (t == NULL) {
662  itnull = 1;
667  nt = 4 * nthread * n1;
668  if (n2 == 2 * nthread) {
669  nt >>= 1;
670  } else if (n2 < 2 * nthread) {
671  nt >>= 2;
672  }
673  t = (double *) malloc(sizeof(double) * nt);
674  fft2d_alloc_error_check(t);
675  }
677  if ((double) n1 * n2 >= (double) FFT2D_THREADS_BEGIN_N) {
678  ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w);
679  ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w);
680  } else
682  {
683  for (i = 0; i < n1; i++) {
684  ddst(n2, isgn, a[i], ip, w);
685  }
686  ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w);
687  }
688  if (itnull != 0) {
689  free(t);
690  }
691 }
692
693
694 /* -------- child routines -------- */
695
696
697 void cdft2d_sub(int n1, int n2, int isgn, double **a, double *t,
698  int *ip, double *w)
699 {
700  void cdft(int n, int isgn, double *a, int *ip, double *w);
701  int i, j;
702
703  if (n2 > 4) {
704  for (j = 0; j < n2; j += 8) {
705  for (i = 0; i < n1; i++) {
706  t[2 * i] = a[i][j];
707  t[2 * i + 1] = a[i][j + 1];
708  t[2 * n1 + 2 * i] = a[i][j + 2];
709  t[2 * n1 + 2 * i + 1] = a[i][j + 3];
710  t[4 * n1 + 2 * i] = a[i][j + 4];
711  t[4 * n1 + 2 * i + 1] = a[i][j + 5];
712  t[6 * n1 + 2 * i] = a[i][j + 6];
713  t[6 * n1 + 2 * i + 1] = a[i][j + 7];
714  }
715  cdft(2 * n1, isgn, t, ip, w);
716  cdft(2 * n1, isgn, &t[2 * n1], ip, w);
717  cdft(2 * n1, isgn, &t[4 * n1], ip, w);
718  cdft(2 * n1, isgn, &t[6 * n1], ip, w);
719  for (i = 0; i < n1; i++) {
720  a[i][j] = t[2 * i];
721  a[i][j + 1] = t[2 * i + 1];
722  a[i][j + 2] = t[2 * n1 + 2 * i];
723  a[i][j + 3] = t[2 * n1 + 2 * i + 1];
724  a[i][j + 4] = t[4 * n1 + 2 * i];
725  a[i][j + 5] = t[4 * n1 + 2 * i + 1];
726  a[i][j + 6] = t[6 * n1 + 2 * i];
727  a[i][j + 7] = t[6 * n1 + 2 * i + 1];
728  }
729  }
730  } else if (n2 == 4) {
731  for (i = 0; i < n1; i++) {
732  t[2 * i] = a[i][0];
733  t[2 * i + 1] = a[i][1];
734  t[2 * n1 + 2 * i] = a[i][2];
735  t[2 * n1 + 2 * i + 1] = a[i][3];
736  }
737  cdft(2 * n1, isgn, t, ip, w);
738  cdft(2 * n1, isgn, &t[2 * n1], ip, w);
739  for (i = 0; i < n1; i++) {
740  a[i][0] = t[2 * i];
741  a[i][1] = t[2 * i + 1];
742  a[i][2] = t[2 * n1 + 2 * i];
743  a[i][3] = t[2 * n1 + 2 * i + 1];
744  }
745  } else if (n2 == 2) {
746  for (i = 0; i < n1; i++) {
747  t[2 * i] = a[i][0];
748  t[2 * i + 1] = a[i][1];
749  }
750  cdft(2 * n1, isgn, t, ip, w);
751  for (i = 0; i < n1; i++) {
752  a[i][0] = t[2 * i];
753  a[i][1] = t[2 * i + 1];
754  }
755  }
756 }
757
758
759 void rdft2d_sub(int n1, int n2, int isgn, double **a)
760 {
761  int n1h, i, j;
762  double xi;
763
764  n1h = n1 >> 1;
765  if (isgn < 0) {
766  for (i = 1; i < n1h; i++) {
767  j = n1 - i;
768  xi = a[i][0] - a[j][0];
769  a[i][0] += a[j][0];
770  a[j][0] = xi;
771  xi = a[j][1] - a[i][1];
772  a[i][1] += a[j][1];
773  a[j][1] = xi;
774  }
775  } else {
776  for (i = 1; i < n1h; i++) {
777  j = n1 - i;
778  a[j][0] = 0.5 * (a[i][0] - a[j][0]);
779  a[i][0] -= a[j][0];
780  a[j][1] = 0.5 * (a[i][1] + a[j][1]);
781  a[i][1] -= a[j][1];
782  }
783  }
784 }
785
786
787 void ddxt2d_sub(int n1, int n2, int ics, int isgn, double **a,
788  double *t, int *ip, double *w)
789 {
790  void ddct(int n, int isgn, double *a, int *ip, double *w);
791  void ddst(int n, int isgn, double *a, int *ip, double *w);
792  int i, j;
793
794  if (n2 > 2) {
795  for (j = 0; j < n2; j += 4) {
796  for (i = 0; i < n1; i++) {
797  t[i] = a[i][j];
798  t[n1 + i] = a[i][j + 1];
799  t[2 * n1 + i] = a[i][j + 2];
800  t[3 * n1 + i] = a[i][j + 3];
801  }
802  if (ics == 0) {
803  ddct(n1, isgn, t, ip, w);
804  ddct(n1, isgn, &t[n1], ip, w);
805  ddct(n1, isgn, &t[2 * n1], ip, w);
806  ddct(n1, isgn, &t[3 * n1], ip, w);
807  } else {
808  ddst(n1, isgn, t, ip, w);
809  ddst(n1, isgn, &t[n1], ip, w);
810  ddst(n1, isgn, &t[2 * n1], ip, w);
811  ddst(n1, isgn, &t[3 * n1], ip, w);
812  }
813  for (i = 0; i < n1; i++) {
814  a[i][j] = t[i];
815  a[i][j + 1] = t[n1 + i];
816  a[i][j + 2] = t[2 * n1 + i];
817  a[i][j + 3] = t[3 * n1 + i];
818  }
819  }
820  } else if (n2 == 2) {
821  for (i = 0; i < n1; i++) {
822  t[i] = a[i][0];
823  t[n1 + i] = a[i][1];
824  }
825  if (ics == 0) {
826  ddct(n1, isgn, t, ip, w);
827  ddct(n1, isgn, &t[n1], ip, w);
828  } else {
829  ddst(n1, isgn, t, ip, w);
830  ddst(n1, isgn, &t[n1], ip, w);
831  }
832  for (i = 0; i < n1; i++) {
833  a[i][0] = t[i];
834  a[i][1] = t[n1 + i];
835  }
836  }
837 }
838
839
841 struct fft2d_arg_st {
843  int n0;
844  int n1;
845  int n2;
846  int ic;
847  int isgn;
848  double **a;
849  double *t;
850  int *ip;
851  double *w;
852 };
853 typedef struct fft2d_arg_st fft2d_arg_t;
854
855
856 void xdft2d0_subth(int n1, int n2, int icr, int isgn, double **a,
857  int *ip, double *w)
858 {
859  void *xdft2d0_th(void *p);
863
865  if (nthread > n1) {
867  }
868  for (i = 0; i < nthread; i++) {
870  ag[i].n0 = i;
871  ag[i].n1 = n1;
872  ag[i].n2 = n2;
873  ag[i].ic = icr;
874  ag[i].isgn = isgn;
875  ag[i].a = a;
876  ag[i].ip = ip;
877  ag[i].w = w;
879  }
880  for (i = 0; i < nthread; i++) {
882  }
883 }
884
885
886 void cdft2d_subth(int n1, int n2, int isgn, double **a, double *t,
887  int *ip, double *w)
888 {
889  void *cdft2d_th(void *p);
893
895  nt = 8 * n1;
896  if (n2 == 4 * FFT2D_MAX_THREADS) {
897  nt >>= 1;
898  } else if (n2 < 4 * FFT2D_MAX_THREADS) {
899  nthread = n2 >> 1;
900  nt >>= 2;
901  }
902  for (i = 0; i < nthread; i++) {
904  ag[i].n0 = i;
905  ag[i].n1 = n1;
906  ag[i].n2 = n2;
907  ag[i].isgn = isgn;
908  ag[i].a = a;
909  ag[i].t = &t[nt * i];
910  ag[i].ip = ip;
911  ag[i].w = w;
913  }
914  for (i = 0; i < nthread; i++) {
916  }
917 }
918
919
920 void ddxt2d0_subth(int n1, int n2, int ics, int isgn, double **a,
921  int *ip, double *w)
922 {
923  void *ddxt2d0_th(void *p);
927
929  if (nthread > n1) {
931  }
932  for (i = 0; i < nthread; i++) {
934  ag[i].n0 = i;
935  ag[i].n1 = n1;
936  ag[i].n2 = n2;
937  ag[i].ic = ics;
938  ag[i].isgn = isgn;
939  ag[i].a = a;
940  ag[i].ip = ip;
941  ag[i].w = w;
943  }
944  for (i = 0; i < nthread; i++) {
946  }
947 }
948
949
950 void ddxt2d_subth(int n1, int n2, int ics, int isgn, double **a,
951  double *t, int *ip, double *w)
952 {
953  void *ddxt2d_th(void *p);
957
959  nt = 4 * n1;
960  if (n2 == 2 * FFT2D_MAX_THREADS) {
961  nt >>= 1;
962  } else if (n2 < 2 * FFT2D_MAX_THREADS) {
964  nt >>= 2;
965  }
966  for (i = 0; i < nthread; i++) {
968  ag[i].n0 = i;
969  ag[i].n1 = n1;
970  ag[i].n2 = n2;
971  ag[i].ic = ics;
972  ag[i].isgn = isgn;
973  ag[i].a = a;
974  ag[i].t = &t[nt * i];
975  ag[i].ip = ip;
976  ag[i].w = w;
978  }
979  for (i = 0; i < nthread; i++) {
981  }
982 }
983
984
985 void *xdft2d0_th(void *p)
986 {
987  void cdft(int n, int isgn, double *a, int *ip, double *w);
988  void rdft(int n, int isgn, double *a, int *ip, double *w);
989  int nthread, n0, n1, n2, icr, isgn, *ip, i;
990  double **a, *w;
991
993  n0 = ((fft2d_arg_t *) p)->n0;
994  n1 = ((fft2d_arg_t *) p)->n1;
995  n2 = ((fft2d_arg_t *) p)->n2;
996  icr = ((fft2d_arg_t *) p)->ic;
997  isgn = ((fft2d_arg_t *) p)->isgn;
998  a = ((fft2d_arg_t *) p)->a;
999  ip = ((fft2d_arg_t *) p)->ip;
1000  w = ((fft2d_arg_t *) p)->w;
1001  if (icr == 0) {
1002  for (i = n0; i < n1; i += nthread) {
1003  cdft(n2, isgn, a[i], ip, w);
1004  }
1005  } else {
1006  for (i = n0; i < n1; i += nthread) {
1007  rdft(n2, isgn, a[i], ip, w);
1008  }
1009  }
1010  return (void *) 0;
1011 }
1012
1013
1014 void *cdft2d_th(void *p)
1015 {
1016  void cdft(int n, int isgn, double *a, int *ip, double *w);
1017  int nthread, n0, n1, n2, isgn, *ip, i, j;
1018  double **a, *t, *w;
1019
1021  n0 = ((fft2d_arg_t *) p)->n0;
1022  n1 = ((fft2d_arg_t *) p)->n1;
1023  n2 = ((fft2d_arg_t *) p)->n2;
1024  isgn = ((fft2d_arg_t *) p)->isgn;
1025  a = ((fft2d_arg_t *) p)->a;
1026  t = ((fft2d_arg_t *) p)->t;
1027  ip = ((fft2d_arg_t *) p)->ip;
1028  w = ((fft2d_arg_t *) p)->w;
1029  if (n2 > 4 * nthread) {
1030  for (j = 8 * n0; j < n2; j += 8 * nthread) {
1031  for (i = 0; i < n1; i++) {
1032  t[2 * i] = a[i][j];
1033  t[2 * i + 1] = a[i][j + 1];
1034  t[2 * n1 + 2 * i] = a[i][j + 2];
1035  t[2 * n1 + 2 * i + 1] = a[i][j + 3];
1036  t[4 * n1 + 2 * i] = a[i][j + 4];
1037  t[4 * n1 + 2 * i + 1] = a[i][j + 5];
1038  t[6 * n1 + 2 * i] = a[i][j + 6];
1039  t[6 * n1 + 2 * i + 1] = a[i][j + 7];
1040  }
1041  cdft(2 * n1, isgn, t, ip, w);
1042  cdft(2 * n1, isgn, &t[2 * n1], ip, w);
1043  cdft(2 * n1, isgn, &t[4 * n1], ip, w);
1044  cdft(2 * n1, isgn, &t[6 * n1], ip, w);
1045  for (i = 0; i < n1; i++) {
1046  a[i][j] = t[2 * i];
1047  a[i][j + 1] = t[2 * i + 1];
1048  a[i][j + 2] = t[2 * n1 + 2 * i];
1049  a[i][j + 3] = t[2 * n1 + 2 * i + 1];
1050  a[i][j + 4] = t[4 * n1 + 2 * i];
1051  a[i][j + 5] = t[4 * n1 + 2 * i + 1];
1052  a[i][j + 6] = t[6 * n1 + 2 * i];
1053  a[i][j + 7] = t[6 * n1 + 2 * i + 1];
1054  }
1055  }
1056  } else if (n2 == 4 * nthread) {
1057  for (i = 0; i < n1; i++) {
1058  t[2 * i] = a[i][4 * n0];
1059  t[2 * i + 1] = a[i][4 * n0 + 1];
1060  t[2 * n1 + 2 * i] = a[i][4 * n0 + 2];
1061  t[2 * n1 + 2 * i + 1] = a[i][4 * n0 + 3];
1062  }
1063  cdft(2 * n1, isgn, t, ip, w);
1064  cdft(2 * n1, isgn, &t[2 * n1], ip, w);
1065  for (i = 0; i < n1; i++) {
1066  a[i][4 * n0] = t[2 * i];
1067  a[i][4 * n0 + 1] = t[2 * i + 1];
1068  a[i][4 * n0 + 2] = t[2 * n1 + 2 * i];
1069  a[i][4 * n0 + 3] = t[2 * n1 + 2 * i + 1];
1070  }
1071  } else if (n2 == 2 * nthread) {
1072  for (i = 0; i < n1; i++) {
1073  t[2 * i] = a[i][2 * n0];
1074  t[2 * i + 1] = a[i][2 * n0 + 1];
1075  }
1076  cdft(2 * n1, isgn, t, ip, w);
1077  for (i = 0; i < n1; i++) {
1078  a[i][2 * n0] = t[2 * i];
1079  a[i][2 * n0 + 1] = t[2 * i + 1];
1080  }
1081  }
1082  return (void *) 0;
1083 }
1084
1085
1086 void *ddxt2d0_th(void *p)
1087 {
1088  void ddct(int n, int isgn, double *a, int *ip, double *w);
1089  void ddst(int n, int isgn, double *a, int *ip, double *w);
1090  int nthread, n0, n1, n2, ics, isgn, *ip, i;
1091  double **a, *w;
1092
1094  n0 = ((fft2d_arg_t *) p)->n0;
1095  n1 = ((fft2d_arg_t *) p)->n1;
1096  n2 = ((fft2d_arg_t *) p)->n2;
1097  ics = ((fft2d_arg_t *) p)->ic;
1098  isgn = ((fft2d_arg_t *) p)->isgn;
1099  a = ((fft2d_arg_t *) p)->a;
1100  ip = ((fft2d_arg_t *) p)->ip;
1101  w = ((fft2d_arg_t *) p)->w;
1102  if (ics == 0) {
1103  for (i = n0; i < n1; i += nthread) {
1104  ddct(n2, isgn, a[i], ip, w);
1105  }
1106  } else {
1107  for (i = n0; i < n1; i += nthread) {
1108  ddst(n2, isgn, a[i], ip, w);
1109  }
1110  }
1111  return (void *) 0;
1112 }
1113
1114
1115 void *ddxt2d_th(void *p)
1116 {
1117  void ddct(int n, int isgn, double *a, int *ip, double *w);
1118  void ddst(int n, int isgn, double *a, int *ip, double *w);
1119  int nthread, n0, n1, n2, ics, isgn, *ip, i, j;
1120  double **a, *t, *w;
1121
1123  n0 = ((fft2d_arg_t *) p)->n0;
1124  n1 = ((fft2d_arg_t *) p)->n1;
1125  n2 = ((fft2d_arg_t *) p)->n2;
1126  ics = ((fft2d_arg_t *) p)->ic;
1127  isgn = ((fft2d_arg_t *) p)->isgn;
1128  a = ((fft2d_arg_t *) p)->a;
1129  t = ((fft2d_arg_t *) p)->t;
1130  ip = ((fft2d_arg_t *) p)->ip;
1131  w = ((fft2d_arg_t *) p)->w;
1132  if (n2 > 2 * nthread) {
1133  for (j = 4 * n0; j < n2; j += 4 * nthread) {
1134  for (i = 0; i < n1; i++) {
1135  t[i] = a[i][j];
1136  t[n1 + i] = a[i][j + 1];
1137  t[2 * n1 + i] = a[i][j + 2];
1138  t[3 * n1 + i] = a[i][j + 3];
1139  }
1140  if (ics == 0) {
1141  ddct(n1, isgn, t, ip, w);
1142  ddct(n1, isgn, &t[n1], ip, w);
1143  ddct(n1, isgn, &t[2 * n1], ip, w);
1144  ddct(n1, isgn, &t[3 * n1], ip, w);
1145  } else {
1146  ddst(n1, isgn, t, ip, w);
1147  ddst(n1, isgn, &t[n1], ip, w);
1148  ddst(n1, isgn, &t[2 * n1], ip, w);
1149  ddst(n1, isgn, &t[3 * n1], ip, w);
1150  }
1151  for (i = 0; i < n1; i++) {
1152  a[i][j] = t[i];
1153  a[i][j + 1] = t[n1 + i];
1154  a[i][j + 2] = t[2 * n1 + i];
1155  a[i][j + 3] = t[3 * n1 + i];
1156  }
1157  }
1158  } else if (n2 == 2 * nthread) {
1159  for (i = 0; i < n1; i++) {
1160  t[i] = a[i][2 * n0];
1161  t[n1 + i] = a[i][2 * n0 + 1];
1162  }
1163  if (ics == 0) {
1164  ddct(n1, isgn, t, ip, w);
1165  ddct(n1, isgn, &t[n1], ip, w);
1166  } else {
1167  ddst(n1, isgn, t, ip, w);
1168  ddst(n1, isgn, &t[n1], ip, w);
1169  }
1170  for (i = 0; i < n1; i++) {
1171  a[i][2 * n0] = t[i];
1172  a[i][2 * n0 + 1] = t[n1 + i];
1173  }
1174  } else if (n2 == nthread) {
1175  for (i = 0; i < n1; i++) {
1176  t[i] = a[i][n0];
1177  }
1178  if (ics == 0) {
1179  ddct(n1, isgn, t, ip, w);
1180  } else {
1181  ddst(n1, isgn, t, ip, w);
1182  }
1183  for (i = 0; i < n1; i++) {
1184  a[i][n0] = t[i];
1185  }
1186  }
1187  return (void *) 0;
1188 }