00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include "config.h"
00027
00028
00029 #ifdef HAVE_COMPLEX_H
00030 #include<complex.h>
00031 #endif
00032
00033
00034 #include "nfft3util.h"
00035 #include "nfft3.h"
00036 #include "infft.h"
00037
00038 #ifdef _OPENMP
00039 #include <omp.h>
00040 #endif
00041
00042 #ifdef OMP_ASSERT
00043 #include <assert.h>
00044 #endif
00045
00046
00061 static void nfft_sort_nodes_for_better_cache_handle(int d,
00062 const int *n, int m, int local_x_num, const R *local_x, int *ar_x)
00063 {
00064 int u_j[d], i, j, help, rhigh;
00065 int *ar_x_temp;
00066 int nprod;
00067
00068 for(i = 0; i < local_x_num; i++) {
00069 ar_x[2*i] = 0;
00070 ar_x[2*i+1] = i;
00071 for(j = 0; j < d; j++) {
00072 help = (int) floor( n[j]*local_x[d*i+j] - m);
00073 u_j[j] = (help%n[j]+n[j])%n[j];
00074
00075 ar_x[2*i] += u_j[j];
00076 if (j+1 < d)
00077 ar_x[2*i] *= n[j+1];
00078 }
00079 }
00080
00081 for (j = 0, nprod = 1; j < d; j++)
00082 nprod *= n[j];
00083
00084 rhigh = (int) ceil(log2(nprod)) - 1;
00085
00086 ar_x_temp = (int *) nfft_malloc(2*local_x_num*sizeof(int));
00087 nfft_sort_node_indices_radix_lsdf(local_x_num, ar_x, ar_x_temp, rhigh);
00088 #ifdef OMP_ASSERT
00089 for (i = 1; i < local_x_num; i++)
00090 assert(ar_x[2*(i-1)] <= ar_x[2*i]);
00091 #endif
00092 nfft_free(ar_x_temp);
00093 }
00094
00103 static void nfft_sort_nodes(const nfft_plan *ths)
00104 {
00105 if (ths->nfft_flags & NFFT_SORT_NODES)
00106 nfft_sort_nodes_for_better_cache_handle(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
00107 }
00108
00130
00131 #define MACRO_ndft_init_result_trafo memset(f,0,ths->M_total*sizeof(C));
00132 #define MACRO_ndft_init_result_conjugated MACRO_ndft_init_result_trafo
00133 #define MACRO_ndft_init_result_adjoint memset(f_hat,0,ths->N_total*sizeof(C));
00134 #define MACRO_ndft_init_result_transposed MACRO_ndft_init_result_adjoint
00135
00136
00137 #define MACRO_ndft_sign_trafo K2PI*ths->x[j*ths->d+t]
00138 #define MACRO_ndft_sign_conjugated -K2PI*ths->x[j*ths->d+t]
00139 #define MACRO_ndft_sign_adjoint K2PI*ths->x[j*ths->d+t]
00140 #define MACRO_ndft_sign_transposed -K2PI*ths->x[j*ths->d+t]
00141
00142 #define MACRO_init_k_N_Omega_x(which_one) \
00143 { \
00144 for (t = 0; t < ths->d; t++) \
00145 { \
00146 k[t] = -ths->N[t]/2; \
00147 x[t] = MACRO_ndft_sign_ ## which_one; \
00148 Omega[t+1] = k[t]*x[t] + Omega[t]; \
00149 } \
00150 omega = Omega[ths->d]; \
00151 } \
00152
00153 #define MACRO_count_k_N_Omega \
00154 { \
00155 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--) \
00156 k[t]-= ths->N[t]-1; \
00157 \
00158 k[t]++; \
00159 \
00160 for (t2 = t; t2 < ths->d; t2++) \
00161 Omega[t2+1] = k[t2]*x[t2] + Omega[t2]; \
00162 \
00163 omega = Omega[ths->d]; \
00164 }
00165
00166 #define MACRO_ndft_compute_trafo f[j] += f_hat[k_L]*CEXP(-II*omega);
00167 #define MACRO_ndft_compute_conjugated MACRO_ndft_compute_trafo
00168 #define MACRO_ndft_compute_adjoint f_hat[k_L] += f[j]*CEXP(+II*omega);
00169 #define MACRO_ndft_compute_transposed MACRO_ndft_compute_adjoint
00170
00171 #define MACRO_ndft(which_one) \
00172 void nfft_ ## which_one ## _direct (nfft_plan *ths) \
00173 { \
00174 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f; \
00175 \
00176 MACRO_ndft_init_result_ ## which_one \
00177 \
00178 if (ths->d == 1) \
00179 { \
00180 \
00181 const int t = 0; \
00182 int j; \
00183 for (j = 0; j < ths->M_total; j++) \
00184 { \
00185 int k_L; \
00186 for(k_L = 0; k_L < ths->N_total; k_L++) \
00187 { \
00188 R omega = (k_L - (ths->N_total/2)) * MACRO_ndft_sign_ ## which_one; \
00189 MACRO_ndft_compute_ ## which_one; \
00190 } \
00191 } \
00192 } \
00193 else \
00194 { \
00195 \
00196 int j; \
00197 for (j = 0; j < ths->M_total; j++) \
00198 { \
00199 R x[ths->d], omega, Omega[ths->d+1]; \
00200 int t, t2, k_L, k[ths->d]; \
00201 Omega[0] = K(0.0); \
00202 MACRO_init_k_N_Omega_x(which_one); \
00203 for(k_L = 0; k_L < ths->N_total; k_L++) \
00204 { \
00205 MACRO_ndft_compute_ ## which_one; \
00206 MACRO_count_k_N_Omega; \
00207 } \
00208 } \
00209 } \
00210 }
00211
00212
00213
00214 void nfft_trafo_direct (nfft_plan *ths)
00215 {
00216 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
00217
00218 memset(f,0,ths->M_total*sizeof(C));
00219
00220 if (ths->d == 1)
00221 {
00222
00223 int j;
00224 #pragma omp parallel for default(shared) private(j)
00225 for (j = 0; j < ths->M_total; j++)
00226 {
00227 int k_L;
00228 for(k_L = 0; k_L < ths->N_total; k_L++)
00229 {
00230 R omega = (k_L - ths->N_total/2) * K2PI * ths->x[j];
00231 f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
00232 }
00233 }
00234 }
00235 else
00236 {
00237
00238 int j;
00239 #pragma omp parallel for default(shared) private(j)
00240 for (j = 0; j < ths->M_total; j++)
00241 {
00242 R x[ths->d], omega, Omega[ths->d+1];
00243 int t, t2, k_L, k[ths->d];
00244 Omega[0] = ((R) 0.0);
00245 for (t = 0; t < ths->d; t++)
00246 {
00247 k[t] = -ths->N[t]/2;
00248 x[t] = K2PI*ths->x[j*ths->d+t];
00249 Omega[t+1] = k[t]*x[t] + Omega[t];
00250 }
00251 omega = Omega[ths->d];
00252
00253 for(k_L = 0; k_L < ths->N_total; k_L++)
00254 {
00255 f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
00256 {
00257 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
00258 k[t]-= ths->N[t]-1;
00259
00260 k[t]++;
00261
00262 for (t2 = t; t2 < ths->d; t2++)
00263 Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
00264
00265 omega = Omega[ths->d];
00266 }
00267 }
00268 }
00269 }
00270 }
00271
00272
00273
00274
00275 void nfft_adjoint_direct (nfft_plan *ths)
00276 {
00277 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
00278
00279 memset(f_hat,0,ths->N_total*sizeof(C));
00280
00281 if (ths->d == 1)
00282 {
00283
00284 #ifdef _OPENMP
00285 int k_L;
00286 #pragma omp parallel for default(shared) private(k_L)
00287 for(k_L = 0; k_L < ths->N_total; k_L++)
00288 {
00289 int j;
00290 for (j = 0; j < ths->M_total; j++)
00291 {
00292 R omega = (k_L - (ths->N_total/2)) * K2PI * ths->x[j];
00293 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00294 }
00295 }
00296 #else
00297 int j;
00298 for (j = 0; j < ths->M_total; j++)
00299 {
00300 int k_L;
00301 for(k_L = 0; k_L < ths->N_total; k_L++)
00302 {
00303 R omega = (k_L - (ths->N_total/2)) * K2PI * ths->x[j];
00304 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00305 }
00306 }
00307 #endif
00308 }
00309 else
00310 {
00311
00312 int j, k_L;
00313 #ifdef _OPENMP
00314 #pragma omp parallel for default(shared) private(j, k_L)
00315 for(k_L = 0; k_L < ths->N_total; k_L++)
00316 {
00317 int k[ths->d], k_temp, t;
00318
00319 k_temp = k_L;
00320
00321 for (t = ths->d-1; t >= 0; t--)
00322 {
00323 k[t] = k_temp % ths->N[t] - ths->N[t]/2;
00324 k_temp /= ths->N[t];
00325 }
00326
00327 for (j = 0; j < ths->M_total; j++)
00328 {
00329 R omega = 0.0;
00330 for (t = 0; t < ths->d; t++)
00331 omega += k[t] * K2PI * ths->x[j*ths->d+t];
00332 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00333 }
00334 }
00335 #else
00336 for (j = 0; j < ths->M_total; j++)
00337 {
00338 R x[ths->d], omega, Omega[ths->d+1];
00339 int t, t2, k[ths->d];
00340 Omega[0] = ((R) 0.0);
00341 for (t = 0; t < ths->d; t++)
00342 {
00343 k[t] = -ths->N[t]/2;
00344 x[t] = K2PI * ths->x[j*ths->d+t];
00345 Omega[t+1] = k[t]*x[t] + Omega[t];
00346 }
00347 omega = Omega[ths->d];
00348 for(k_L = 0; k_L < ths->N_total; k_L++)
00349 {
00350 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00351
00352 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
00353 k[t]-= ths->N[t]-1;
00354
00355 k[t]++;
00356
00357 for (t2 = t; t2 < ths->d; t2++)
00358 Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
00359
00360 omega = Omega[ths->d];
00361 }
00362 }
00363 #endif
00364 }
00365 }
00366
00392 static inline void nfft_uo(const nfft_plan *ths, const int j, int *up, int *op,
00393 const int act_dim)
00394 {
00395 const R xj = ths->x[j * ths->d + act_dim];
00396 int c = LRINT(FLOOR(xj * ths->n[act_dim]));
00397
00398 (*up) = c - (ths->m);
00399 (*op) = c + 1 + (ths->m);
00400 }
00401
00402 static inline void nfft_uo2(int *u, int *o, const R x, const int n, const int m)
00403 {
00404 int c = LRINT(FLOOR(x * n));
00405
00406 *u = (c - m + n) % n;
00407 *o = (c + 1 + m + n) % n;
00408 }
00409
00410 #define MACRO_nfft_D_compute_A \
00411 { \
00412 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
00413 }
00414
00415 #define MACRO_nfft_D_compute_T \
00416 { \
00417 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
00418 }
00419
00420 #define MACRO_nfft_D_init_result_A memset(g_hat,0,ths->n_total*sizeof(C));
00421
00422 #define MACRO_nfft_D_init_result_T memset(f_hat,0,ths->N_total*sizeof(C));
00423
00424 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
00425
00426 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ks[t2]-(ths->N[t2]/2),t2));
00427
00428 #define MACRO_init_k_ks \
00429 { \
00430 for (t = ths->d-1; 0 <= t; t--) \
00431 { \
00432 kp[t] = k[t] = 0; \
00433 ks[t] = ths->N[t]/2; \
00434 } \
00435 t++; \
00436 }
00437
00438 #define MACRO_update_c_phi_inv_k(which_one) \
00439 { \
00440 for (t2 = t; t2 < ths->d; t2++) \
00441 { \
00442 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
00443 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
00444 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
00445 } \
00446 }
00447
00448 #define MACRO_count_k_ks \
00449 { \
00450 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
00451 { \
00452 kp[t] = k[t] = 0; \
00453 ks[t]= ths->N[t]/2; \
00454 } \
00455 \
00456 kp[t]++; k[t]++; ks[t]++; \
00457 if(kp[t] == ths->N[t]/2) \
00458 { \
00459 k[t] = ths->n[t] - ths->N[t]/2; \
00460 ks[t] = 0; \
00461 } \
00462 } \
00463
00464
00468 #define MACRO_nfft_D(which_one) \
00469 static inline void nfft_D_serial_ ## which_one (nfft_plan *ths) \
00470 { \
00471 C *f_hat, *g_hat; \
00472 R c_phi_inv_k[ths->d+1]; \
00473 int t, t2; \
00474 int k_L; \
00475 int kp[ths->d]; \
00476 int k[ths->d]; \
00477 int ks[ths->d]; \
00478 int k_plain[ths->d+1]; \
00479 int ks_plain[ths->d+1]; \
00480 \
00481 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
00482 MACRO_nfft_D_init_result_ ## which_one; \
00483 \
00484 c_phi_inv_k[0] = K(1.0); \
00485 k_plain[0] = 0; \
00486 ks_plain[0] = 0; \
00487 \
00488 if (ths->nfft_flags & PRE_PHI_HUT) \
00489 { \
00490 MACRO_init_k_ks; \
00491 \
00492 for (k_L = 0; k_L < ths->N_total; k_L++) \
00493 { \
00494 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
00495 MACRO_nfft_D_compute_ ## which_one; \
00496 MACRO_count_k_ks; \
00497 } \
00498 } \
00499 else \
00500 { \
00501 MACRO_init_k_ks; \
00502 for (k_L = 0; k_L < ths->N_total; k_L++) \
00503 { \
00504 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
00505 MACRO_nfft_D_compute_ ## which_one; \
00506 MACRO_count_k_ks; \
00507 } \
00508 } \
00509 }
00510
00511 #ifdef _OPENMP
00512 static void nfft_D_openmp_A(nfft_plan *ths)
00513 {
00514 C *f_hat, *g_hat;
00515 int k_L;
00517 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
00518 memset(g_hat,0,ths->n_total*sizeof(C));
00519
00520 if (ths->nfft_flags & PRE_PHI_HUT)
00521 {
00522 #pragma omp parallel for default(shared) private(k_L)
00523 for (k_L = 0; k_L < ths->N_total; k_L++)
00524 {
00525 int kp[ths->d];
00526 int k[ths->d];
00527 int ks[ths->d];
00528 R c_phi_inv_k_val = K(1.0);
00529 int k_plain_val = 0;
00530 int ks_plain_val = 0;
00531 int t;
00532 int k_temp = k_L;
00533
00534 for (t = ths->d-1; t >= 0; t--)
00535 {
00536 kp[t] = k_temp % ths->N[t];
00537 if (kp[t] >= ths->N[t]/2)
00538 k[t] = ths->n[t] - ths->N[t] + kp[t];
00539 else
00540 k[t] = kp[t];
00541 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00542 k_temp /= ths->N[t];
00543 }
00544
00545 for (t = 0; t < ths->d; t++)
00546 {
00547 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
00548 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00549 k_plain_val = k_plain_val*ths->n[t] + k[t];
00550 }
00551
00552 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
00553 }
00554 }
00555 else
00556 {
00557 #pragma omp parallel for default(shared) private(k_L)
00558 for (k_L = 0; k_L < ths->N_total; k_L++)
00559 {
00560 int kp[ths->d];
00561 int k[ths->d];
00562 int ks[ths->d];
00563 R c_phi_inv_k_val = K(1.0);
00564 int k_plain_val = 0;
00565 int ks_plain_val = 0;
00566 int t;
00567 int k_temp = k_L;
00568
00569 for (t = ths->d-1; t >= 0; t--)
00570 {
00571 kp[t] = k_temp % ths->N[t];
00572 if (kp[t] >= ths->N[t]/2)
00573 k[t] = ths->n[t] - ths->N[t] + kp[t];
00574 else
00575 k[t] = kp[t];
00576 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00577 k_temp /= ths->N[t];
00578 }
00579
00580 for (t = 0; t < ths->d; t++)
00581 {
00582 c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->N[t]/2),t));
00583 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00584 k_plain_val = k_plain_val*ths->n[t] + k[t];
00585 }
00586
00587 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
00588 }
00589 }
00590 }
00591 #endif
00592
00593 #ifndef _OPENMP
00594 MACRO_nfft_D(A)
00595 #endif
00596
00597 static void nfft_D_A(nfft_plan *ths)
00598 {
00599 #ifdef _OPENMP
00600 nfft_D_openmp_A(ths);
00601 #else
00602 nfft_D_serial_A(ths);
00603 #endif
00604 }
00605
00606 #ifdef _OPENMP
00607 static void nfft_D_openmp_T(nfft_plan *ths)
00608 {
00609 C *f_hat, *g_hat;
00610 int k_L;
00612 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
00613 memset(f_hat,0,ths->N_total*sizeof(C));
00614
00615 if (ths->nfft_flags & PRE_PHI_HUT)
00616 {
00617 #pragma omp parallel for default(shared) private(k_L)
00618 for (k_L = 0; k_L < ths->N_total; k_L++)
00619 {
00620 int kp[ths->d];
00621 int k[ths->d];
00622 int ks[ths->d];
00623 R c_phi_inv_k_val = K(1.0);
00624 int k_plain_val = 0;
00625 int ks_plain_val = 0;
00626 int t;
00627 int k_temp = k_L;
00628
00629 for (t = ths->d-1; t >= 0; t--)
00630 {
00631 kp[t] = k_temp % ths->N[t];
00632 if (kp[t] >= ths->N[t]/2)
00633 k[t] = ths->n[t] - ths->N[t] + kp[t];
00634 else
00635 k[t] = kp[t];
00636 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00637 k_temp /= ths->N[t];
00638 }
00639
00640 for (t = 0; t < ths->d; t++)
00641 {
00642 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
00643 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00644 k_plain_val = k_plain_val*ths->n[t] + k[t];
00645 }
00646
00647 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
00648 }
00649 }
00650 else
00651 {
00652 #pragma omp parallel for default(shared) private(k_L)
00653 for (k_L = 0; k_L < ths->N_total; k_L++)
00654 {
00655 int kp[ths->d];
00656 int k[ths->d];
00657 int ks[ths->d];
00658 R c_phi_inv_k_val = K(1.0);
00659 int k_plain_val = 0;
00660 int ks_plain_val = 0;
00661 int t;
00662 int k_temp = k_L;
00663
00664 for (t = ths->d-1; t >= 0; t--)
00665 {
00666 kp[t] = k_temp % ths->N[t];
00667 if (kp[t] >= ths->N[t]/2)
00668 k[t] = ths->n[t] - ths->N[t] + kp[t];
00669 else
00670 k[t] = kp[t];
00671 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00672 k_temp /= ths->N[t];
00673 }
00674
00675 for (t = 0; t < ths->d; t++)
00676 {
00677 c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->N[t]/2),t));
00678 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00679 k_plain_val = k_plain_val*ths->n[t] + k[t];
00680 }
00681
00682 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
00683 }
00684 }
00685 }
00686 #endif
00687
00688 #ifndef _OPENMP
00689 MACRO_nfft_D(T)
00690 #endif
00691
00692 static void nfft_D_T(nfft_plan *ths)
00693 {
00694 #ifdef _OPENMP
00695 nfft_D_openmp_T(ths);
00696 #else
00697 nfft_D_serial_T(ths);
00698 #endif
00699 }
00700
00704 #define MACRO_nfft_B_init_result_A memset(f,0,ths->M_total*sizeof(C));
00705 #define MACRO_nfft_B_init_result_T memset(g,0,ths->n_total*sizeof(C));
00706
00707 #define MACRO_nfft_B_PRE_FULL_PSI_compute_A \
00708 { \
00709 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
00710 }
00711
00712 #define MACRO_nfft_B_PRE_FULL_PSI_compute_T \
00713 { \
00714 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
00715 }
00716
00717 #define MACRO_nfft_B_compute_A \
00718 { \
00719 (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
00720 }
00721
00722 #define MACRO_nfft_B_compute_T \
00723 { \
00724 g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
00725 }
00726
00727 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
00728
00729 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
00730
00731 #define MACRO_without_PRE_PSI PHI(ths->x[j*ths->d+t2] \
00732 - ((R)l[t2])/((R)ths->n[t2]), t2)
00733
00734 #define MACRO_init_uo_l_lj_t \
00735 { \
00736 for (t = ths->d-1; t >= 0; t--) \
00737 { \
00738 nfft_uo(ths,j,&u[t],&o[t],t); \
00739 l[t] = u[t]; \
00740 lj[t] = 0; \
00741 } \
00742 t++; \
00743 }
00744
00745 #define MACRO_update_phi_prod_ll_plain(which_one) { \
00746 for(t2=t; t2<ths->d; t2++) \
00747 { \
00748 phi_prod[t2+1] = phi_prod[t2]* MACRO_ ## which_one; \
00749 ll_plain[t2+1] = ll_plain[t2]*ths->n[t2] +(l[t2]+ths->n[t2])%ths->n[t2];\
00750 } \
00751 }
00752
00753 #define MACRO_count_uo_l_lj_t { \
00754 for(t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
00755 { \
00756 l[t] = u[t]; \
00757 lj[t] = 0; \
00758 } \
00759 \
00760 l[t]++; \
00761 lj[t]++; \
00762 }
00763
00764 #define MACRO_nfft_B(which_one) \
00765 static inline void nfft_B_serial_ ## which_one (nfft_plan *ths) \
00766 { \
00767 int lprod; \
00768 int u[ths->d], o[ths->d]; \
00769 int t, t2; \
00770 int j; \
00771 int l_L, ix; \
00772 int l[ths->d]; \
00773 int lj[ths->d]; \
00774 int ll_plain[ths->d+1]; \
00775 R phi_prod[ths->d+1]; \
00776 C *f, *g; \
00777 C *fj; \
00778 R y[ths->d]; \
00779 R fg_psi[ths->d][2*ths->m+2]; \
00780 R fg_exp_l[ths->d][2*ths->m+2]; \
00781 int l_fg,lj_fg; \
00782 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
00783 R ip_w; \
00784 int ip_u; \
00785 int ip_s = ths->K/(ths->m+2); \
00786 \
00787 f = (C*)ths->f; g = (C*)ths->g; \
00788 \
00789 MACRO_nfft_B_init_result_ ## which_one; \
00790 \
00791 if (ths->nfft_flags & PRE_FULL_PSI) \
00792 { \
00793 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
00794 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
00795 MACRO_nfft_B_PRE_FULL_PSI_compute_ ## which_one; \
00796 return; \
00797 } \
00798 \
00799 phi_prod[0] = K(1.0); \
00800 ll_plain[0] = 0; \
00801 \
00802 for (t = 0, lprod = 1; t < ths->d; t++) \
00803 lprod *= (2*ths->m+2); \
00804 \
00805 if (ths->nfft_flags & PRE_PSI) \
00806 { \
00807 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00808 { \
00809 MACRO_init_uo_l_lj_t; \
00810 \
00811 for (l_L = 0; l_L < lprod; l_L++) \
00812 { \
00813 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
00814 \
00815 MACRO_nfft_B_compute_ ## which_one; \
00816 \
00817 MACRO_count_uo_l_lj_t; \
00818 } \
00819 } \
00820 return; \
00821 } \
00822 \
00823 if (ths->nfft_flags & PRE_FG_PSI) \
00824 { \
00825 for(t2 = 0; t2 < ths->d; t2++) \
00826 { \
00827 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
00828 tmpEXP2sq = tmpEXP2*tmpEXP2; \
00829 tmp2 = K(1.0); \
00830 tmp3 = K(1.0); \
00831 fg_exp_l[t2][0] = K(1.0); \
00832 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
00833 { \
00834 tmp3 = tmp2*tmpEXP2; \
00835 tmp2 *= tmpEXP2sq; \
00836 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
00837 } \
00838 } \
00839 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00840 { \
00841 MACRO_init_uo_l_lj_t; \
00842 \
00843 for (t2 = 0; t2 < ths->d; t2++) \
00844 { \
00845 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
00846 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
00847 tmp1 = K(1.0); \
00848 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
00849 { \
00850 tmp1 *= tmpEXP1; \
00851 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
00852 } \
00853 } \
00854 \
00855 for (l_L= 0; l_L < lprod; l_L++) \
00856 { \
00857 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
00858 \
00859 MACRO_nfft_B_compute_ ## which_one; \
00860 \
00861 MACRO_count_uo_l_lj_t; \
00862 } \
00863 } \
00864 return; \
00865 } \
00866 \
00867 if (ths->nfft_flags & FG_PSI) \
00868 { \
00869 for (t2 = 0; t2 < ths->d; t2++) \
00870 { \
00871 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
00872 tmpEXP2sq = tmpEXP2*tmpEXP2; \
00873 tmp2 = K(1.0); \
00874 tmp3 = K(1.0); \
00875 fg_exp_l[t2][0] = K(1.0); \
00876 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
00877 { \
00878 tmp3 = tmp2*tmpEXP2; \
00879 tmp2 *= tmpEXP2sq; \
00880 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
00881 } \
00882 } \
00883 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00884 { \
00885 MACRO_init_uo_l_lj_t; \
00886 \
00887 for (t2 = 0; t2 < ths->d; t2++) \
00888 { \
00889 fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));\
00890 \
00891 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) \
00892 /ths->b[t2]); \
00893 tmp1 = K(1.0); \
00894 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
00895 { \
00896 tmp1 *= tmpEXP1; \
00897 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
00898 } \
00899 } \
00900 \
00901 for (l_L = 0; l_L < lprod; l_L++) \
00902 { \
00903 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
00904 \
00905 MACRO_nfft_B_compute_ ## which_one; \
00906 \
00907 MACRO_count_uo_l_lj_t; \
00908 } \
00909 } \
00910 return; \
00911 } \
00912 \
00913 if (ths->nfft_flags & PRE_LIN_PSI) \
00914 { \
00915 for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
00916 { \
00917 MACRO_init_uo_l_lj_t; \
00918 \
00919 for (t2 = 0; t2 < ths->d; t2++) \
00920 { \
00921 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) \
00922 * ((R)ths->K))/(ths->m+2); \
00923 ip_u = LRINT(FLOOR(y[t2])); \
00924 ip_w = y[t2]-ip_u; \
00925 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
00926 { \
00927 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
00928 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
00929 * (ip_w); \
00930 } \
00931 } \
00932 \
00933 for (l_L = 0; l_L < lprod; l_L++) \
00934 { \
00935 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
00936 \
00937 MACRO_nfft_B_compute_ ## which_one; \
00938 \
00939 MACRO_count_uo_l_lj_t; \
00940 } \
00941 } \
00942 return; \
00943 } \
00944 \
00945 \
00946 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00947 { \
00948 MACRO_init_uo_l_lj_t; \
00949 \
00950 for (l_L = 0; l_L < lprod; l_L++) \
00951 { \
00952 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
00953 \
00954 MACRO_nfft_B_compute_ ## which_one; \
00955 \
00956 MACRO_count_uo_l_lj_t; \
00957 } \
00958 } \
00959 } \
00960
00961 #ifndef _OPENMP
00962 MACRO_nfft_B(A)
00963 #endif
00964
00965 #ifdef _OPENMP
00966 static inline void nfft_B_openmp_A (nfft_plan *ths)
00967 {
00968 int lprod;
00969 int k;
00970
00971 memset(ths->f,0,ths->M_total*sizeof(C));
00972
00973 for (k = 0, lprod = 1; k < ths->d; k++)
00974 lprod *= (2*ths->m+2);
00975
00976 if (ths->nfft_flags & PRE_FULL_PSI)
00977 {
00978 #pragma omp parallel for default(shared) private(k)
00979 for (k = 0; k < ths->M_total; k++)
00980 {
00981 int l;
00982 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
00983 ths->f[j] = K(0.0);
00984 for (l = 0; l < lprod; l++)
00985 ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
00986 }
00987 return;
00988 }
00989
00990 if (ths->nfft_flags & PRE_PSI)
00991 {
00992 #pragma omp parallel for default(shared) private(k)
00993 for (k = 0; k < ths->M_total; k++)
00994 {
00995 int u[ths->d], o[ths->d];
00996 int t, t2;
00997 int l_L;
00998 int l[ths->d];
00999 int lj[ths->d];
01000 int ll_plain[ths->d+1];
01001 R phi_prod[ths->d+1];
01002 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01003
01004 phi_prod[0] = K(1.0);
01005 ll_plain[0] = 0;
01006
01007 MACRO_init_uo_l_lj_t;
01008
01009 for (l_L = 0; l_L < lprod; l_L++)
01010 {
01011 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
01012
01013 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01014
01015 MACRO_count_uo_l_lj_t;
01016 }
01017 }
01018 return;
01019 }
01020
01021 if (ths->nfft_flags & PRE_FG_PSI)
01022 {
01023 int t, t2;
01024 R fg_exp_l[ths->d][2*ths->m+2];
01025
01026 for(t2 = 0; t2 < ths->d; t2++)
01027 {
01028 int lj_fg;
01029 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01030 R tmpEXP2sq = tmpEXP2*tmpEXP2;
01031 R tmp2 = K(1.0);
01032 R tmp3 = K(1.0);
01033 fg_exp_l[t2][0] = K(1.0);
01034 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01035 {
01036 tmp3 = tmp2*tmpEXP2;
01037 tmp2 *= tmpEXP2sq;
01038 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01039 }
01040 }
01041
01042 #pragma omp parallel for default(shared) private(k,t,t2)
01043 for (k = 0; k < ths->M_total; k++)
01044 {
01045 int ll_plain[ths->d+1];
01046 R phi_prod[ths->d+1];
01047 int u[ths->d], o[ths->d];
01048 int l[ths->d];
01049 int lj[ths->d];
01050 R fg_psi[ths->d][2*ths->m+2];
01051 R tmpEXP1, tmp1;
01052 int l_fg,lj_fg;
01053 int l_L;
01054 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01055
01056 phi_prod[0] = K(1.0);
01057 ll_plain[0] = 0;
01058
01059 MACRO_init_uo_l_lj_t;
01060
01061 for (t2 = 0; t2 < ths->d; t2++)
01062 {
01063 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
01064 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
01065 tmp1 = K(1.0);
01066 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01067 {
01068 tmp1 *= tmpEXP1;
01069 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01070 }
01071 }
01072
01073 for (l_L= 0; l_L < lprod; l_L++)
01074 {
01075 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01076
01077 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01078
01079 MACRO_count_uo_l_lj_t;
01080 }
01081 }
01082 return;
01083 }
01084
01085 if (ths->nfft_flags & FG_PSI)
01086 {
01087 int t, t2;
01088 R fg_exp_l[ths->d][2*ths->m+2];
01089
01090 nfft_sort_nodes(ths);
01091
01092 for (t2 = 0; t2 < ths->d; t2++)
01093 {
01094 int lj_fg;
01095 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01096 R tmpEXP2sq = tmpEXP2*tmpEXP2;
01097 R tmp2 = K(1.0);
01098 R tmp3 = K(1.0);
01099 fg_exp_l[t2][0] = K(1.0);
01100 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01101 {
01102 tmp3 = tmp2*tmpEXP2;
01103 tmp2 *= tmpEXP2sq;
01104 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01105 }
01106 }
01107
01108 #pragma omp parallel for default(shared) private(k,t,t2)
01109 for (k = 0; k < ths->M_total; k++)
01110 {
01111 int ll_plain[ths->d+1];
01112 R phi_prod[ths->d+1];
01113 int u[ths->d], o[ths->d];
01114 int l[ths->d];
01115 int lj[ths->d];
01116 R fg_psi[ths->d][2*ths->m+2];
01117 R tmpEXP1, tmp1;
01118 int l_fg,lj_fg;
01119 int l_L;
01120 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01121
01122 phi_prod[0] = K(1.0);
01123 ll_plain[0] = 0;
01124
01125 MACRO_init_uo_l_lj_t;
01126
01127 for (t2 = 0; t2 < ths->d; t2++)
01128 {
01129 fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
01130
01131 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
01132 /ths->b[t2]);
01133 tmp1 = K(1.0);
01134 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01135 {
01136 tmp1 *= tmpEXP1;
01137 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01138 }
01139 }
01140
01141 for (l_L = 0; l_L < lprod; l_L++)
01142 {
01143 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01144
01145 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01146
01147 MACRO_count_uo_l_lj_t;
01148 }
01149 }
01150 return;
01151 }
01152
01153 if (ths->nfft_flags & PRE_LIN_PSI)
01154 {
01155 nfft_sort_nodes(ths);
01156
01157 #pragma omp parallel for default(shared) private(k)
01158 for (k = 0; k<ths->M_total; k++)
01159 {
01160 int u[ths->d], o[ths->d];
01161 int t, t2;
01162 int l_L;
01163 int l[ths->d];
01164 int lj[ths->d];
01165 int ll_plain[ths->d+1];
01166 R phi_prod[ths->d+1];
01167 R y[ths->d];
01168 R fg_psi[ths->d][2*ths->m+2];
01169 int l_fg,lj_fg;
01170 R ip_w;
01171 int ip_u;
01172 int ip_s = ths->K/(ths->m+2);
01173 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01174
01175 phi_prod[0] = K(1.0);
01176 ll_plain[0] = 0;
01177
01178 MACRO_init_uo_l_lj_t;
01179
01180 for (t2 = 0; t2 < ths->d; t2++)
01181 {
01182 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
01183 * ((R)ths->K))/(ths->m+2);
01184 ip_u = LRINT(FLOOR(y[t2]));
01185 ip_w = y[t2]-ip_u;
01186 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
01187 {
01188 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
01189 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
01190 * (ip_w);
01191 }
01192 }
01193
01194 for (l_L = 0; l_L < lprod; l_L++)
01195 {
01196 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01197
01198 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01199
01200 MACRO_count_uo_l_lj_t;
01201 }
01202 }
01203 return;
01204 }
01205
01206
01207 nfft_sort_nodes(ths);
01208
01209 #pragma omp parallel for default(shared) private(k)
01210 for (k = 0; k < ths->M_total; k++)
01211 {
01212 int u[ths->d], o[ths->d];
01213 int t, t2;
01214 int l_L;
01215 int l[ths->d];
01216 int lj[ths->d];
01217 int ll_plain[ths->d+1];
01218 R phi_prod[ths->d+1];
01219 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01220
01221 phi_prod[0] = K(1.0);
01222 ll_plain[0] = 0;
01223
01224 MACRO_init_uo_l_lj_t;
01225
01226 for (l_L = 0; l_L < lprod; l_L++)
01227 {
01228 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
01229
01230 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01231
01232 MACRO_count_uo_l_lj_t;
01233 }
01234 }
01235 }
01236 #endif
01237
01238 static void nfft_B_A(nfft_plan *ths)
01239 {
01240 #ifdef _OPENMP
01241 nfft_B_openmp_A(ths);
01242 #else
01243 nfft_B_serial_A(ths);
01244 #endif
01245 }
01246
01247 #ifdef _OPENMP
01248
01263 static inline int index_x_binary_search(const int *ar_x, const int len, const int key)
01264 {
01265 int left = 0, right = len - 1;
01266
01267 if (len == 1)
01268 return 0;
01269
01270 while (left < right - 1)
01271 {
01272 int i = (left + right) / 2;
01273 if (ar_x[2*i] >= key)
01274 right = i;
01275 else if (ar_x[2*i] < key)
01276 left = i;
01277 }
01278
01279 if (ar_x[2*left] < key && left != len-1)
01280 return left+1;
01281
01282 return left;
01283 }
01284 #endif
01285
01286 #ifdef _OPENMP
01287
01302 static void nfft_adjoint_B_omp_blockwise_init(int *my_u0, int *my_o0, int *min_u_a, int *max_u_a, int *min_u_b, int *max_u_b, const int d, const int *n, const int m)
01303 {
01304 const int n0 = n[0];
01305 int k;
01306 int nthreads = omp_get_num_threads();
01307 int nthreads_used = MIN(nthreads, n0);
01308 int size_per_thread = n0 / nthreads_used;
01309 int size_left = n0 - size_per_thread * nthreads_used;
01310 int size_g[nthreads_used];
01311 int offset_g[nthreads_used];
01312 int my_id = omp_get_thread_num();
01313 int n_prod_rest = 1;
01314
01315 for (k = 1; k < d; k++)
01316 n_prod_rest *= n[k];
01317
01318 *min_u_a = -1;
01319 *max_u_a = -1;
01320 *min_u_b = -1;
01321 *max_u_b = -1;
01322 *my_u0 = -1;
01323 *my_o0 = -1;
01324
01325 if (my_id < nthreads_used)
01326 {
01327 const int m22 = 2 * m + 2;
01328
01329 offset_g[0] = 0;
01330 for (k = 0; k < nthreads_used; k++)
01331 {
01332 if (k > 0)
01333 offset_g[k] = offset_g[k-1] + size_g[k-1];
01334 size_g[k] = size_per_thread;
01335 if (size_left > 0)
01336 {
01337 size_g[k]++;
01338 size_left--;
01339 }
01340 }
01341
01342 *my_u0 = offset_g[my_id];
01343 *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
01344
01345 if (nthreads_used > 1)
01346 {
01347 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
01348 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
01349 }
01350 else
01351 {
01352 *min_u_a = 0;
01353 *max_u_a = n_prod_rest * n0 - 1;
01354 }
01355
01356 if (*min_u_a < 0)
01357 {
01358 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
01359 *max_u_b = n_prod_rest * n0 - 1;
01360 *min_u_a = 0;
01361 }
01362
01363 if (*min_u_b != -1 && *min_u_b <= *max_u_a)
01364 {
01365 *max_u_a = *max_u_b;
01366 *min_u_b = -1;
01367 *max_u_b = -1;
01368 }
01369 #ifdef OMP_ASSERT
01370 assert(*min_u_a <= *max_u_a);
01371 assert(*min_u_b <= *max_u_b);
01372 assert(*min_u_b == -1 || *max_u_a < *min_u_b);
01373 #endif
01374 }
01375 }
01376 #endif
01377
01386 static void nfft_adjoint_B_compute_full_psi(
01387 C *g, const int *psi_index_g, const R *psi, const C *f,
01388 const int M, const int d, const int *n, const int m, const int nfft_flags, const int *index_x)
01389 {
01390 int k;
01391 int lprod, lprod_m1;
01392 {
01393 int t;
01394 for(t = 0, lprod = 1; t < d; t++)
01395 lprod *= 2 * m + 2;
01396 }
01397 lprod_m1 = lprod / (2 * m + 2);
01398
01399 #ifdef _OPENMP
01400 if (nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
01401 {
01402 #pragma omp parallel private(k)
01403 {
01404 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
01405 const int *ar_x = index_x;
01406 int n_prod_rest = 1;
01407
01408 for (k = 1; k < d; k++)
01409 n_prod_rest *= n[k];
01410
01411 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
01412
01413 if (min_u_a != -1)
01414 {
01415 k = index_x_binary_search(ar_x, M, min_u_a);
01416 #ifdef OMP_ASSERT
01417 assert(ar_x[2*k] >= min_u_a || k == M-1);
01418 if (k > 0)
01419 assert(ar_x[2*k-2] < min_u_a);
01420 #endif
01421 while (k < M)
01422 {
01423 int l0, lrest;
01424 int u_prod = ar_x[2*k];
01425 int j = ar_x[2*k+1];
01426
01427 if (u_prod < min_u_a || u_prod > max_u_a)
01428 break;
01429
01430 for (l0 = 0; l0 < 2 * m + 2; l0++)
01431 {
01432 const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
01433
01434 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
01435 continue;
01436
01437 for (lrest = 0; lrest < lprod_m1; lrest++)
01438 {
01439 const int l = l0 * lprod_m1 + lrest;
01440 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01441 }
01442 }
01443
01444 k++;
01445 }
01446 }
01447
01448 if (min_u_b != -1)
01449 {
01450 k = index_x_binary_search(ar_x, M, min_u_b);
01451 #ifdef OMP_ASSERT
01452 assert(ar_x[2*k] >= min_u_b || k == M-1);
01453 if (k > 0)
01454 assert(ar_x[2*k-2] < min_u_b);
01455 #endif
01456 while (k < M)
01457 {
01458 int l0, lrest;
01459 int u_prod = ar_x[2*k];
01460 int j = ar_x[2*k+1];
01461
01462 if (u_prod < min_u_b || u_prod > max_u_b)
01463 break;
01464
01465 for (l0 = 0; l0 < 2 * m + 2; l0++)
01466 {
01467 const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
01468
01469 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
01470 continue;
01471 for (lrest = 0; lrest < lprod_m1; lrest++)
01472 {
01473 const int l = l0 * lprod_m1 + lrest;
01474 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01475 }
01476 }
01477
01478 k++;
01479 }
01480 }
01481 }
01482 return;
01483 }
01484 #endif
01485
01486 #pragma omp parallel for default(shared) private(k)
01487 for (k = 0; k < M; k++)
01488 {
01489 int l;
01490 int j = (nfft_flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
01491
01492 for (l = 0; l < lprod; l++)
01493 {
01494 #ifdef _OPENMP
01495 C val = psi[j * lprod + l] * f[j];
01496 C *gref = g + psi_index_g[j * lprod + l];
01497 R *gref_real = (R*) gref;
01498
01499 #pragma omp atomic
01500 gref_real[0] += creal(val);
01501
01502 #pragma omp atomic
01503 gref_real[1] += cimag(val);
01504 #else
01505 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01506 #endif
01507 }
01508 }
01509 }
01510
01511 #ifndef _OPENMP
01512 MACRO_nfft_B(T)
01513 #endif
01514
01515 #ifdef _OPENMP
01516 static inline void nfft_B_openmp_T(nfft_plan *ths)
01517 {
01518 int lprod;
01519 int k;
01520
01521 memset(ths->g,0,ths->n_total*sizeof(C));
01522
01523 for (k = 0, lprod = 1; k < ths->d; k++)
01524 lprod *= (2*ths->m+2);
01525
01526 if (ths->nfft_flags & PRE_FULL_PSI)
01527 {
01528 nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f, ths->M_total,
01529 ths->d, ths->n, ths->m, ths->nfft_flags, ths->index_x);
01530 return;
01531 }
01532
01533 if (ths->nfft_flags & PRE_PSI)
01534 {
01535 #pragma omp parallel for default(shared) private(k)
01536 for (k = 0; k < ths->M_total; k++)
01537 {
01538 int u[ths->d], o[ths->d];
01539 int t, t2;
01540 int l_L;
01541 int l[ths->d];
01542 int lj[ths->d];
01543 int ll_plain[ths->d+1];
01544 R phi_prod[ths->d+1];
01545 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01546
01547 phi_prod[0] = K(1.0);
01548 ll_plain[0] = 0;
01549
01550 MACRO_init_uo_l_lj_t;
01551
01552 for (l_L = 0; l_L < lprod; l_L++)
01553 {
01554 C *lhs;
01555 R *lhs_real;
01556 C val;
01557
01558 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
01559
01560 lhs = ths->g + ll_plain[ths->d];
01561 lhs_real = (R*)lhs;
01562 val = phi_prod[ths->d] * ths->f[j];
01563
01564 #pragma omp atomic
01565 lhs_real[0] += creal(val);
01566
01567 #pragma omp atomic
01568 lhs_real[1] += cimag(val);
01569
01570 MACRO_count_uo_l_lj_t;
01571 }
01572 }
01573 return;
01574 }
01575
01576 if (ths->nfft_flags & PRE_FG_PSI)
01577 {
01578 int t, t2;
01579 R fg_exp_l[ths->d][2*ths->m+2];
01580 for(t2 = 0; t2 < ths->d; t2++)
01581 {
01582 int lj_fg;
01583 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01584 R tmpEXP2sq = tmpEXP2*tmpEXP2;
01585 R tmp2 = K(1.0);
01586 R tmp3 = K(1.0);
01587 fg_exp_l[t2][0] = K(1.0);
01588 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01589 {
01590 tmp3 = tmp2*tmpEXP2;
01591 tmp2 *= tmpEXP2sq;
01592 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01593 }
01594 }
01595
01596 #pragma omp parallel for default(shared) private(k,t,t2)
01597 for (k = 0; k < ths->M_total; k++)
01598 {
01599 int ll_plain[ths->d+1];
01600 R phi_prod[ths->d+1];
01601 int u[ths->d], o[ths->d];
01602 int l[ths->d];
01603 int lj[ths->d];
01604 R fg_psi[ths->d][2*ths->m+2];
01605 R tmpEXP1, tmp1;
01606 int l_fg,lj_fg;
01607 int l_L;
01608 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01609
01610 phi_prod[0] = K(1.0);
01611 ll_plain[0] = 0;
01612
01613 MACRO_init_uo_l_lj_t;
01614
01615 for (t2 = 0; t2 < ths->d; t2++)
01616 {
01617 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
01618 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
01619 tmp1 = K(1.0);
01620 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01621 {
01622 tmp1 *= tmpEXP1;
01623 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01624 }
01625 }
01626
01627 for (l_L= 0; l_L < lprod; l_L++)
01628 {
01629 C *lhs;
01630 R *lhs_real;
01631 C val;
01632
01633 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01634
01635 lhs = ths->g + ll_plain[ths->d];
01636 lhs_real = (R*)lhs;
01637 val = phi_prod[ths->d] * ths->f[j];
01638
01639 #pragma omp atomic
01640 lhs_real[0] += creal(val);
01641
01642 #pragma omp atomic
01643 lhs_real[1] += cimag(val);
01644
01645 MACRO_count_uo_l_lj_t;
01646 }
01647 }
01648 return;
01649 }
01650
01651 if (ths->nfft_flags & FG_PSI)
01652 {
01653 int t, t2;
01654 R fg_exp_l[ths->d][2*ths->m+2];
01655
01656 nfft_sort_nodes(ths);
01657
01658 for (t2 = 0; t2 < ths->d; t2++)
01659 {
01660 int lj_fg;
01661 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01662 R tmpEXP2sq = tmpEXP2*tmpEXP2;
01663 R tmp2 = K(1.0);
01664 R tmp3 = K(1.0);
01665 fg_exp_l[t2][0] = K(1.0);
01666 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01667 {
01668 tmp3 = tmp2*tmpEXP2;
01669 tmp2 *= tmpEXP2sq;
01670 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01671 }
01672 }
01673
01674 #pragma omp parallel for default(shared) private(k,t,t2)
01675 for (k = 0; k < ths->M_total; k++)
01676 {
01677 int ll_plain[ths->d+1];
01678 R phi_prod[ths->d+1];
01679 int u[ths->d], o[ths->d];
01680 int l[ths->d];
01681 int lj[ths->d];
01682 R fg_psi[ths->d][2*ths->m+2];
01683 R tmpEXP1, tmp1;
01684 int l_fg,lj_fg;
01685 int l_L;
01686 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01687
01688 phi_prod[0] = K(1.0);
01689 ll_plain[0] = 0;
01690
01691 MACRO_init_uo_l_lj_t;
01692
01693 for (t2 = 0; t2 < ths->d; t2++)
01694 {
01695 fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
01696
01697 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
01698 /ths->b[t2]);
01699 tmp1 = K(1.0);
01700 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01701 {
01702 tmp1 *= tmpEXP1;
01703 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01704 }
01705 }
01706
01707 for (l_L = 0; l_L < lprod; l_L++)
01708 {
01709 C *lhs;
01710 R *lhs_real;
01711 C val;
01712
01713 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01714
01715 lhs = ths->g + ll_plain[ths->d];
01716 lhs_real = (R*)lhs;
01717 val = phi_prod[ths->d] * ths->f[j];
01718
01719 #pragma omp atomic
01720 lhs_real[0] += creal(val);
01721
01722 #pragma omp atomic
01723 lhs_real[1] += cimag(val);
01724
01725 MACRO_count_uo_l_lj_t;
01726 }
01727 }
01728 return;
01729 }
01730
01731 if (ths->nfft_flags & PRE_LIN_PSI)
01732 {
01733 nfft_sort_nodes(ths);
01734
01735 #pragma omp parallel for default(shared) private(k)
01736 for (k = 0; k<ths->M_total; k++)
01737 {
01738 int u[ths->d], o[ths->d];
01739 int t, t2;
01740 int l_L;
01741 int l[ths->d];
01742 int lj[ths->d];
01743 int ll_plain[ths->d+1];
01744 R phi_prod[ths->d+1];
01745 R y[ths->d];
01746 R fg_psi[ths->d][2*ths->m+2];
01747 int l_fg,lj_fg;
01748 R ip_w;
01749 int ip_u;
01750 int ip_s = ths->K/(ths->m+2);
01751 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01752
01753 phi_prod[0] = K(1.0);
01754 ll_plain[0] = 0;
01755
01756 MACRO_init_uo_l_lj_t;
01757
01758 for (t2 = 0; t2 < ths->d; t2++)
01759 {
01760 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
01761 * ((R)ths->K))/(ths->m+2);
01762 ip_u = LRINT(FLOOR(y[t2]));
01763 ip_w = y[t2]-ip_u;
01764 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
01765 {
01766 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
01767 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
01768 * (ip_w);
01769 }
01770 }
01771
01772 for (l_L = 0; l_L < lprod; l_L++)
01773 {
01774 C *lhs;
01775 R *lhs_real;
01776 C val;
01777
01778 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01779
01780 lhs = ths->g + ll_plain[ths->d];
01781 lhs_real = (R*)lhs;
01782 val = phi_prod[ths->d] * ths->f[j];
01783
01784 #pragma omp atomic
01785 lhs_real[0] += creal(val);
01786
01787 #pragma omp atomic
01788 lhs_real[1] += cimag(val);
01789
01790 MACRO_count_uo_l_lj_t;
01791 }
01792 }
01793 return;
01794 }
01795
01796
01797 nfft_sort_nodes(ths);
01798
01799 #pragma omp parallel for default(shared) private(k)
01800 for (k = 0; k < ths->M_total; k++)
01801 {
01802 int u[ths->d], o[ths->d];
01803 int t, t2;
01804 int l_L;
01805 int l[ths->d];
01806 int lj[ths->d];
01807 int ll_plain[ths->d+1];
01808 R phi_prod[ths->d+1];
01809 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01810
01811 phi_prod[0] = K(1.0);
01812 ll_plain[0] = 0;
01813
01814 MACRO_init_uo_l_lj_t;
01815
01816 for (l_L = 0; l_L < lprod; l_L++)
01817 {
01818 C *lhs;
01819 R *lhs_real;
01820 C val;
01821
01822 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
01823
01824 lhs = ths->g + ll_plain[ths->d];
01825 lhs_real = (R*)lhs;
01826 val = phi_prod[ths->d] * ths->f[j];
01827
01828 #pragma omp atomic
01829 lhs_real[0] += creal(val);
01830
01831 #pragma omp atomic
01832 lhs_real[1] += cimag(val);
01833
01834 MACRO_count_uo_l_lj_t;
01835 }
01836 }
01837 }
01838 #endif
01839
01840 static void nfft_B_T(nfft_plan *ths)
01841 {
01842 #ifdef _OPENMP
01843 nfft_B_openmp_T(ths);
01844 #else
01845 nfft_B_serial_T(ths);
01846 #endif
01847 }
01848
01849
01850
01851 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
01852 {
01853 const int tmp2 = 2*m+2;
01854 int l;
01855 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
01856
01857 fg_exp_b0 = EXP(K(-1.0)/b);
01858 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
01859 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
01860
01861 for (l = 1; l < tmp2; l++)
01862 {
01863 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
01864 fg_exp_b1 *= fg_exp_b0_sq;
01865 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
01866 }
01867 }
01868
01869
01870 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
01871 const R *xj, const int n, const int m)
01872 {
01873 int u, o, l;
01874 const C *gj;
01875 const R *psij;
01876 psij = psij_const;
01877
01878 nfft_uo2(&u, &o, *xj, n, m);
01879
01880 if (u < o)
01881 {
01882 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
01883 (*fj) += (*psij++) * (*gj++);
01884 }
01885 else
01886 {
01887 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
01888 (*fj) += (*psij++) * (*gj++);
01889 for (l = 0, gj = g; l <= o; l++)
01890 (*fj) += (*psij++) * (*gj++);
01891 }
01892 }
01893
01894 #ifndef _OPENMP
01895 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,const R *psij_const,
01896 const R *xj, const int n, const int m)
01897 {
01898 int u,o,l;
01899 C *gj;
01900 const R *psij;
01901 psij=psij_const;
01902
01903 nfft_uo2(&u,&o,*xj, n, m);
01904
01905 if(u<o)
01906 {
01907 for (l = 0, gj = g+u; l <= 2*m+1; l++)
01908 (*gj++) += (*psij++) * (*fj);
01909 }
01910 else
01911 {
01912 for (l = 0, gj = g+u; l < 2*m+1-o; l++)
01913 (*gj++) += (*psij++) * (*fj);
01914 for (l = 0, gj = g; l <= o; l++)
01915 (*gj++) += (*psij++) * (*fj);
01916 }
01917 }
01918 #endif
01919
01920 #ifdef _OPENMP
01921
01922 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,const R *psij_const,
01923 const R *xj, const int n, const int m)
01924 {
01925 int u,o,l;
01926 C *gj;
01927 int index_temp[2*m+2];
01928
01929 nfft_uo2(&u,&o,*xj, n, m);
01930
01931 for (l=0; l<=2*m+1; l++)
01932 index_temp[l] = (l+u)%n;
01933
01934 for (l = 0, gj = g+u; l <= 2*m+1; l++)
01935 {
01936 int i = index_temp[l];
01937 C *lhs = g+i;
01938 R *lhs_real = (R*)lhs;
01939 C val = psij_const[l] * f;
01940 #pragma omp atomic
01941 lhs_real[0] += creal(val);
01942
01943 #pragma omp atomic
01944 lhs_real[1] += cimag(val);
01945 }
01946 }
01947 #endif
01948
01949 #ifdef _OPENMP
01950
01965 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,const R *psij_const,
01966 const R *xj, const int n, const int m, const int my_u0, const int my_o0)
01967 {
01968 int ar_u,ar_o,l;
01969
01970 nfft_uo2(&ar_u,&ar_o,*xj, n, m);
01971
01972 if(ar_u<ar_o)
01973 {
01974 int u = MAX(my_u0,ar_u);
01975 int o = MIN(my_o0,ar_o);
01976 int offset_psij = u-ar_u;
01977 #ifdef OMP_ASSERT
01978 assert(offset_psij >= 0);
01979 assert(o-u <= 2*m+1);
01980 assert(offset_psij+o-u <= 2*m+1);
01981 #endif
01982
01983 for (l = 0; l <= o-u; l++)
01984 g[u+l] += psij_const[offset_psij+l] * f;
01985 }
01986 else
01987 {
01988 int u = MAX(my_u0,ar_u);
01989 int o = my_o0;
01990 int offset_psij = u-ar_u;
01991 #ifdef OMP_ASSERT
01992 assert(offset_psij >= 0);
01993 assert(o-u <= 2*m+1);
01994 assert(offset_psij+o-u <= 2*m+1);
01995 #endif
01996
01997 for (l = 0; l <= o-u; l++)
01998 g[u+l] += psij_const[offset_psij+l] * f;
01999
02000 u = my_u0;
02001 o = MIN(my_o0,ar_o);
02002 offset_psij += my_u0-ar_u+n;
02003
02004 #ifdef OMP_ASSERT
02005 if (u<=o)
02006 {
02007 assert(o-u <= 2*m+1);
02008 if (offset_psij+o-u > 2*m+1)
02009 {
02010 fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
02011 }
02012 assert(offset_psij+o-u <= 2*m+1);
02013 }
02014 #endif
02015 for (l = 0; l <= o-u; l++)
02016 g[u+l] += psij_const[offset_psij+l] * f;
02017 }
02018 }
02019 #endif
02020
02021 static void nfft_trafo_1d_B(nfft_plan *ths)
02022 {
02023 const int n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
02024 const C *g = (C*)ths->g;
02025
02026 if (ths->nfft_flags & PRE_FULL_PSI)
02027 {
02028 int k;
02029 #pragma omp parallel for default(shared) private(k)
02030 for (k = 0; k < M; k++)
02031 {
02032 int l;
02033 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02034 ths->f[j] = K(0.0);
02035 for (l = 0; l < m2p2; l++)
02036 ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
02037 }
02038 return;
02039 }
02040
02041 if (ths->nfft_flags & PRE_PSI)
02042 {
02043 int k;
02044 #pragma omp parallel for default(shared) private(k)
02045 for (k = 0; k < M; k++)
02046 {
02047 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02048 nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
02049 &ths->x[j], n, m);
02050 }
02051 return;
02052 }
02053
02054 if (ths->nfft_flags & PRE_FG_PSI)
02055 {
02056 int k;
02057 R fg_exp_l[m2p2];
02058
02059 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02060
02061 #pragma omp parallel for default(shared) private(k)
02062 for (k = 0; k < M; k++)
02063 {
02064 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02065 const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
02066 R fg_psij2 = K(1.0);
02067 R psij_const[m2p2];
02068 int l;
02069
02070 psij_const[0] = fg_psij0;
02071
02072 for (l = 1; l < m2p2; l++)
02073 {
02074 fg_psij2 *= fg_psij1;
02075 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02076 }
02077
02078 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02079 }
02080
02081 return;
02082 }
02083
02084 if (ths->nfft_flags & FG_PSI)
02085 {
02086 int k;
02087 R fg_exp_l[m2p2];
02088
02089 nfft_sort_nodes(ths);
02090
02091 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02092
02093 #pragma omp parallel for default(shared) private(k)
02094 for (k = 0; k < M; k++)
02095 {
02096 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02097 int u, o, l;
02098 R fg_psij0, fg_psij1, fg_psij2;
02099 R psij_const[m2p2];
02100
02101 nfft_uo(ths, (int)j, &u, &o, 0);
02102 fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0));
02103 fg_psij1 = EXP(K(2.0) * (n * ths->x[j] - u) / ths->b[0]);
02104 fg_psij2 = K(1.0);
02105
02106 psij_const[0] = fg_psij0;
02107
02108 for (l = 1; l < m2p2; l++)
02109 {
02110 fg_psij2 *= fg_psij1;
02111 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02112 }
02113
02114 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02115 }
02116 return;
02117 }
02118
02119 if (ths->nfft_flags & PRE_LIN_PSI)
02120 {
02121 const int K = ths->K, ip_s = K / (m + 2);
02122 int k;
02123
02124 nfft_sort_nodes(ths);
02125
02126 #pragma omp parallel for default(shared) private(k)
02127 for (k = 0; k < M; k++)
02128 {
02129 int u, o, l;
02130 R ip_y, ip_w;
02131 int ip_u;
02132 R psij_const[m2p2];
02133 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02134
02135 nfft_uo(ths, (int)j, &u, &o, 0);
02136
02137 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s);
02138 ip_u = LRINT(FLOOR(ip_y));
02139 ip_w = ip_y - ip_u;
02140
02141 for (l = 0; l < m2p2; l++)
02142 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
02143 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
02144
02145 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02146 }
02147 return;
02148 }
02149 else
02150 {
02151
02152 int k;
02153
02154 nfft_sort_nodes(ths);
02155
02156 #pragma omp parallel for default(shared) private(k)
02157 for (k = 0; k < M; k++)
02158 {
02159 R psij_const[m2p2];
02160 int u, o, l;
02161 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02162
02163 nfft_uo(ths, (int)j, &u, &o, 0);
02164
02165 for (l = 0; l < m2p2; l++)
02166 psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0));
02167
02168 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02169 }
02170 }
02171 }
02172
02173
02174 #ifdef OMP_ASSERT
02175 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
02176 { \
02177 assert(ar_x[2*k] >= min_u_a || k == M-1); \
02178 if (k > 0) \
02179 assert(ar_x[2*k-2] < min_u_a); \
02180 }
02181 #else
02182 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
02183 #endif
02184
02185 #ifdef OMP_ASSERT
02186 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
02187 { \
02188 assert(ar_x[2*k] >= min_u_b || k == M-1); \
02189 if (k > 0) \
02190 assert(ar_x[2*k-2] < min_u_b); \
02191 }
02192 #else
02193 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
02194 #endif
02195
02196 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
02197 { \
02198 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
02199 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
02200 }
02201
02202 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
02203 { \
02204 R psij_const[2 * m + 2]; \
02205 int u, o, l; \
02206 R fg_psij0 = ths->psi[2 * j]; \
02207 R fg_psij1 = ths->psi[2 * j + 1]; \
02208 R fg_psij2 = K(1.0); \
02209 \
02210 psij_const[0] = fg_psij0; \
02211 for (l = 1; l <= 2 * m + 1; l++) \
02212 { \
02213 fg_psij2 *= fg_psij1; \
02214 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
02215 } \
02216 \
02217 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02218 ths->x + j, n, m, my_u0, my_o0); \
02219 }
02220
02221 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
02222 { \
02223 R psij_const[2 * m + 2]; \
02224 R fg_psij0, fg_psij1, fg_psij2; \
02225 int u, o, l; \
02226 \
02227 nfft_uo(ths, j, &u, &o, 0); \
02228 fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0)); \
02229 fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
02230 fg_psij2 = K(1.0); \
02231 psij_const[0] = fg_psij0; \
02232 for (l = 1; l <= 2 * m + 1; l++) \
02233 { \
02234 fg_psij2 *= fg_psij1; \
02235 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
02236 } \
02237 \
02238 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02239 ths->x + j, n, m, my_u0, my_o0); \
02240 }
02241
02242 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
02243 { \
02244 R psij_const[2 * m + 2]; \
02245 int ip_u; \
02246 R ip_y, ip_w; \
02247 int u, o, l; \
02248 \
02249 nfft_uo(ths, j, &u, &o, 0); \
02250 \
02251 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
02252 ip_u = LRINT(FLOOR(ip_y)); \
02253 ip_w = ip_y - ip_u; \
02254 for (l = 0; l < 2 * m + 2; l++) \
02255 psij_const[l] \
02256 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
02257 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
02258 \
02259 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02260 ths->x + j, n, m, my_u0, my_o0); \
02261 }
02262
02263 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
02264 { \
02265 R psij_const[2 * m + 2]; \
02266 int u, o, l; \
02267 \
02268 nfft_uo(ths, j, &u, &o, 0); \
02269 \
02270 for (l = 0; l <= 2 * m + 1; l++) \
02271 psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0)); \
02272 \
02273 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02274 ths->x + j, n, m, my_u0, my_o0); \
02275 }
02276
02277 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
02278 { \
02279 if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
02280 { \
02281 _Pragma("omp parallel private(k)") \
02282 { \
02283 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
02284 int *ar_x = ths->index_x; \
02285 \
02286 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
02287 &min_u_b, &max_u_b, 1, &n, m); \
02288 \
02289 if (min_u_a != -1) \
02290 { \
02291 k = index_x_binary_search(ar_x, M, min_u_a); \
02292 \
02293 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
02294 \
02295 while (k < M) \
02296 { \
02297 int u_prod = ar_x[2*k]; \
02298 int j = ar_x[2*k+1]; \
02299 \
02300 if (u_prod < min_u_a || u_prod > max_u_a) \
02301 break; \
02302 \
02303 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
02304 \
02305 k++; \
02306 } \
02307 } \
02308 \
02309 if (min_u_b != -1) \
02310 { \
02311 k = index_x_binary_search(ar_x, M, min_u_b); \
02312 \
02313 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
02314 \
02315 while (k < M) \
02316 { \
02317 int u_prod = ar_x[2*k]; \
02318 int j = ar_x[2*k+1]; \
02319 \
02320 if (u_prod < min_u_b || u_prod > max_u_b) \
02321 break; \
02322 \
02323 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
02324 \
02325 k++; \
02326 } \
02327 } \
02328 } \
02329 return; \
02330 } \
02331 }
02332
02333 static void nfft_adjoint_1d_B(nfft_plan *ths)
02334 {
02335 const int n = ths->n[0], M = ths->M_total, m = ths->m;
02336 int k;
02337 C *g = (C*)ths->g;
02338
02339 memset(g,0,ths->n_total*sizeof(C));
02340
02341 if (ths->nfft_flags & PRE_FULL_PSI)
02342 {
02343 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
02344 1, ths->n, m, ths->nfft_flags, ths->index_x);
02345 return;
02346 }
02347
02348 if (ths->nfft_flags & PRE_PSI)
02349 {
02350 #ifdef _OPENMP
02351 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
02352 #endif
02353
02354 #pragma omp parallel for default(shared) private(k)
02355 for (k = 0; k < M; k++)
02356 {
02357 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02358 #ifdef _OPENMP
02359 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
02360 #else
02361 nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
02362 #endif
02363 }
02364
02365 return;
02366 }
02367
02368 if (ths->nfft_flags & PRE_FG_PSI)
02369 {
02370 R fg_exp_l[2 * m + 2];
02371
02372 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02373
02374 #ifdef _OPENMP
02375 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
02376 #endif
02377
02378
02379 #pragma omp parallel for default(shared) private(k)
02380 for (k = 0; k < M; k++)
02381 {
02382 R psij_const[2 * m + 2];
02383 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02384 int l;
02385 R fg_psij0 = ths->psi[2 * j];
02386 R fg_psij1 = ths->psi[2 * j + 1];
02387 R fg_psij2 = K(1.0);
02388
02389 psij_const[0] = fg_psij0;
02390 for (l = 1; l <= 2 * m + 1; l++)
02391 {
02392 fg_psij2 *= fg_psij1;
02393 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02394 }
02395
02396 #ifdef _OPENMP
02397 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02398 #else
02399 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02400 #endif
02401 }
02402
02403 return;
02404 }
02405
02406 if (ths->nfft_flags & FG_PSI)
02407 {
02408 R fg_exp_l[2 * m + 2];
02409
02410 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02411
02412 nfft_sort_nodes(ths);
02413
02414 #ifdef _OPENMP
02415 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
02416 #endif
02417
02418 #pragma omp parallel for default(shared) private(k)
02419 for (k = 0; k < M; k++)
02420 {
02421 int u,o,l;
02422 R psij_const[2 * m + 2];
02423 R fg_psij0, fg_psij1, fg_psij2;
02424 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02425
02426 nfft_uo(ths, j, &u, &o, 0);
02427 fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0));
02428 fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]);
02429 fg_psij2 = K(1.0);
02430 psij_const[0] = fg_psij0;
02431 for (l = 1; l <= 2 * m + 1; l++)
02432 {
02433 fg_psij2 *= fg_psij1;
02434 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02435 }
02436
02437 #ifdef _OPENMP
02438 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02439 #else
02440 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02441 #endif
02442 }
02443
02444 return;
02445 }
02446
02447 if (ths->nfft_flags & PRE_LIN_PSI)
02448 {
02449 const int K = ths->K;
02450 const int ip_s = K / (m + 2);
02451
02452 nfft_sort_nodes(ths);
02453
02454 #ifdef _OPENMP
02455 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
02456 #endif
02457
02458 #pragma openmp parallel for default(shared) private(k)
02459 for (k = 0; k < M; k++)
02460 {
02461 int u,o,l;
02462 int ip_u;
02463 R ip_y, ip_w;
02464 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02465 R psij_const[2 * m + 2];
02466
02467 nfft_uo(ths, j, &u, &o, 0);
02468
02469 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s);
02470 ip_u = LRINT(FLOOR(ip_y));
02471 ip_w = ip_y - ip_u;
02472 for (l = 0; l < 2 * m + 2; l++)
02473 psij_const[l]
02474 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
02475 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
02476
02477 #ifdef _OPENMP
02478 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02479 #else
02480 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02481 #endif
02482 }
02483 return;
02484 }
02485
02486
02487 nfft_sort_nodes(ths);
02488
02489 #ifdef _OPENMP
02490 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
02491 #endif
02492
02493 #pragma omp parallel for default(shared) private(k)
02494 for (k = 0; k < M; k++)
02495 {
02496 int u,o,l;
02497 R psij_const[2 * m + 2];
02498 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02499
02500 nfft_uo(ths, j, &u, &o, 0);
02501
02502 for (l = 0; l <= 2 * m + 1; l++)
02503 psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0));
02504
02505 #ifdef _OPENMP
02506 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02507 #else
02508 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02509 #endif
02510 }
02511 }
02512
02513 void nfft_trafo_1d(nfft_plan *ths)
02514 {
02515 const int N = ths->N[0], N2 = N/2, n = ths->n[0];
02516 C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
02517
02518 ths->g_hat = ths->g1;
02519 ths->g = ths->g2;
02520
02521 {
02522 C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
02523 R *c_phi_inv1, *c_phi_inv2;
02524
02525 TIC(0)
02526 #ifdef _OPENMP
02527 {
02528 int k;
02529 #pragma omp parallel for default(shared) private(k)
02530 for (k = 0; k < ths->n_total; k++)
02531 ths->g_hat[k] = 0.0;
02532 }
02533 #else
02534 memset(ths->g_hat, 0, ths->n_total*sizeof(C));
02535 #endif
02536 if(ths->nfft_flags & PRE_PHI_HUT)
02537 {
02538 int k;
02539 c_phi_inv1 = ths->c_phi_inv[0];
02540 c_phi_inv2 = &ths->c_phi_inv[0][N2];
02541
02542 #pragma omp parallel for default(shared) private(k)
02543 for (k = 0; k < N2; k++)
02544 {
02545 g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
02546 g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
02547 }
02548 }
02549 else
02550 {
02551 int k;
02552 #pragma omp parallel for default(shared) private(k)
02553 for (k = 0; k < N2; k++)
02554 {
02555 g_hat1[k] = f_hat1[k] / (PHI_HUT(k-N2,0));
02556 g_hat2[k] = f_hat2[k] / (PHI_HUT(k,0));
02557 }
02558 }
02559 TOC(0)
02560
02561 TIC_FFTW(1)
02562 fftw_execute(ths->my_fftw_plan1);
02563 TOC_FFTW(1);
02564
02565 TIC(2);
02566 nfft_trafo_1d_B(ths);
02567 TOC(2);
02568 }
02569 }
02570
02571 void nfft_adjoint_1d(nfft_plan *ths)
02572 {
02573 int n,N;
02574 C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
02575 R *c_phi_inv1, *c_phi_inv2;
02576
02577 N=ths->N[0];
02578 n=ths->n[0];
02579
02580 ths->g_hat=ths->g1;
02581 ths->g=ths->g2;
02582
02583 f_hat1=(C*)ths->f_hat;
02584 f_hat2=(C*)&ths->f_hat[N/2];
02585 g_hat1=(C*)&ths->g_hat[n-N/2];
02586 g_hat2=(C*)ths->g_hat;
02587
02588 TIC(2)
02589 nfft_adjoint_1d_B(ths);
02590 TOC(2)
02591
02592 TIC_FFTW(1)
02593 fftw_execute(ths->my_fftw_plan2);
02594 TOC_FFTW(1);
02595
02596 TIC(0)
02597 if(ths->nfft_flags & PRE_PHI_HUT)
02598 {
02599 int k;
02600 c_phi_inv1=ths->c_phi_inv[0];
02601 c_phi_inv2=&ths->c_phi_inv[0][N/2];
02602
02603 #pragma omp parallel for default(shared) private(k)
02604 for (k = 0; k < N/2; k++)
02605 {
02606 f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
02607 f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
02608 }
02609 }
02610 else
02611 {
02612 int k;
02613
02614 #pragma omp parallel for default(shared) private(k)
02615 for (k = 0; k < N/2; k++)
02616 {
02617 f_hat1[k] = g_hat1[k] / (PHI_HUT(k-N/2,0));
02618 f_hat2[k] = g_hat2[k] / (PHI_HUT(k,0));
02619 }
02620 }
02621 TOC(0)
02622 }
02623
02624
02625
02626
02627 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
02628 {
02629 int l;
02630 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
02631
02632 fg_exp_b0 = EXP(K(-1.0)/b);
02633 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
02634 fg_exp_b1 = K(1.0);
02635 fg_exp_b2 = K(1.0);
02636 fg_exp_l[0] = K(1.0);
02637 for(l=1; l <= 2*m+1; l++)
02638 {
02639 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
02640 fg_exp_b1 *= fg_exp_b0_sq;
02641 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
02642 }
02643 }
02644
02645 static void nfft_trafo_2d_compute(C *fj, const C *g,
02646 const R *psij_const0, const R *psij_const1,
02647 const R *xj0, const R *xj1,
02648 const int n0, const int n1, const int m)
02649 {
02650 int u0,o0,l0,u1,o1,l1;
02651 const C *gj;
02652 const R *psij0,*psij1;
02653
02654 psij0=psij_const0;
02655 psij1=psij_const1;
02656
02657 nfft_uo2(&u0,&o0,*xj0, n0, m);
02658 nfft_uo2(&u1,&o1,*xj1, n1, m);
02659
02660 *fj=0;
02661
02662 if(u0<o0)
02663 if(u1<o1)
02664 for(l0=0; l0<=2*m+1; l0++,psij0++)
02665 {
02666 psij1=psij_const1;
02667 gj=g+(u0+l0)*n1+u1;
02668 for(l1=0; l1<=2*m+1; l1++)
02669 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02670 }
02671 else
02672 for(l0=0; l0<=2*m+1; l0++,psij0++)
02673 {
02674 psij1=psij_const1;
02675 gj=g+(u0+l0)*n1+u1;
02676 for(l1=0; l1<2*m+1-o1; l1++)
02677 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02678 gj=g+(u0+l0)*n1;
02679 for(l1=0; l1<=o1; l1++)
02680 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02681 }
02682 else
02683 if(u1<o1)
02684 {
02685 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02686 {
02687 psij1=psij_const1;
02688 gj=g+(u0+l0)*n1+u1;
02689 for(l1=0; l1<=2*m+1; l1++)
02690 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02691 }
02692 for(l0=0; l0<=o0; l0++,psij0++)
02693 {
02694 psij1=psij_const1;
02695 gj=g+l0*n1+u1;
02696 for(l1=0; l1<=2*m+1; l1++)
02697 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02698 }
02699 }
02700 else
02701 {
02702 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02703 {
02704 psij1=psij_const1;
02705 gj=g+(u0+l0)*n1+u1;
02706 for(l1=0; l1<2*m+1-o1; l1++)
02707 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02708 gj=g+(u0+l0)*n1;
02709 for(l1=0; l1<=o1; l1++)
02710 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02711 }
02712 for(l0=0; l0<=o0; l0++,psij0++)
02713 {
02714 psij1=psij_const1;
02715 gj=g+l0*n1+u1;
02716 for(l1=0; l1<2*m+1-o1; l1++)
02717 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02718 gj=g+l0*n1;
02719 for(l1=0; l1<=o1; l1++)
02720 (*fj) += (*psij0) * (*psij1++) * (*gj++);
02721 }
02722 }
02723 }
02724
02725 #ifdef _OPENMP
02726
02727 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
02728 const R *psij_const0, const R *psij_const1,
02729 const R *xj0, const R *xj1,
02730 const int n0, const int n1, const int m)
02731 {
02732 int u0,o0,l0,u1,o1,l1;
02733 const int lprod = (2*m+2) * (2*m+2);
02734
02735 unsigned long int index_temp0[2*m+2];
02736 unsigned long int index_temp1[2*m+2];
02737
02738 nfft_uo2(&u0,&o0,*xj0, n0, m);
02739 nfft_uo2(&u1,&o1,*xj1, n1, m);
02740
02741 for (l0=0; l0<=2*m+1; l0++)
02742 index_temp0[l0] = (u0+l0)%n0;
02743
02744 for (l1=0; l1<=2*m+1; l1++)
02745 index_temp1[l1] = (u1+l1)%n1;
02746
02747 for(l0=0; l0<=2*m+1; l0++)
02748 {
02749 for(l1=0; l1<=2*m+1; l1++)
02750 {
02751 unsigned long int i = index_temp0[l0] * n1 + index_temp1[l1];
02752 C *lhs = g+i;
02753 R *lhs_real = (R*)lhs;
02754 C val = psij_const0[l0] * psij_const1[l1] * f;
02755
02756 #pragma omp atomic
02757 lhs_real[0] += creal(val);
02758
02759 #pragma omp atomic
02760 lhs_real[1] += cimag(val);
02761 }
02762 }
02763 }
02764 #endif
02765
02766 #ifdef _OPENMP
02767
02785 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
02786 const R *psij_const0, const R *psij_const1,
02787 const R *xj0, const R *xj1,
02788 const int n0, const int n1, const int m,
02789 const int my_u0, const int my_o0)
02790 {
02791 int ar_u0,ar_o0,l0,u1,o1,l1;
02792 const int lprod = (2*m+2) * (2*m+2);
02793 unsigned long int index_temp1[2*m+2];
02794
02795 nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
02796 nfft_uo2(&u1,&o1,*xj1, n1, m);
02797
02798 for (l1=0; l1<=2*m+1; l1++)
02799 index_temp1[l1] = (u1+l1)%n1;
02800
02801 if(ar_u0<ar_o0)
02802 {
02803 int u0 = MAX(my_u0,ar_u0);
02804 int o0 = MIN(my_o0,ar_o0);
02805 int offset_psij = u0-ar_u0;
02806 #ifdef OMP_ASSERT
02807 assert(offset_psij >= 0);
02808 assert(o0-u0 <= 2*m+1);
02809 assert(offset_psij+o0-u0 <= 2*m+1);
02810 #endif
02811
02812 for (l0 = 0; l0 <= o0-u0; l0++)
02813 {
02814 unsigned long int i0 = (u0+l0) * n1;
02815 const C val0 = psij_const0[offset_psij+l0];
02816
02817 for(l1=0; l1<=2*m+1; l1++)
02818 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02819 }
02820 }
02821 else
02822 {
02823 int u0 = MAX(my_u0,ar_u0);
02824 int o0 = my_o0;
02825 int offset_psij = u0-ar_u0;
02826 #ifdef OMP_ASSERT
02827 assert(offset_psij >= 0);
02828 assert(o0-u0 <= 2*m+1);
02829 assert(offset_psij+o0-u0 <= 2*m+1);
02830 #endif
02831
02832 for (l0 = 0; l0 <= o0-u0; l0++)
02833 {
02834 unsigned long int i0 = (u0+l0) * n1;
02835 const C val0 = psij_const0[offset_psij+l0];
02836
02837 for(l1=0; l1<=2*m+1; l1++)
02838 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02839 }
02840
02841 u0 = my_u0;
02842 o0 = MIN(my_o0,ar_o0);
02843 offset_psij += my_u0-ar_u0+n0;
02844
02845 #ifdef OMP_ASSERT
02846 if (u0<=o0)
02847 {
02848 assert(o0-u0 <= 2*m+1);
02849 assert(offset_psij+o0-u0 <= 2*m+1);
02850 }
02851 #endif
02852
02853 for (l0 = 0; l0 <= o0-u0; l0++)
02854 {
02855 unsigned long int i0 = (u0+l0) * n1;
02856 const C val0 = psij_const0[offset_psij+l0];
02857
02858 for(l1=0; l1<=2*m+1; l1++)
02859 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02860 }
02861 }
02862 }
02863 #endif
02864
02865 #ifndef _OPENMP
02866 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
02867 const R *psij_const0, const R *psij_const1,
02868 const R *xj0, const R *xj1,
02869 const int n0, const int n1, const int m)
02870 {
02871 int u0,o0,l0,u1,o1,l1;
02872 C *gj;
02873 const R *psij0,*psij1;
02874
02875 psij0=psij_const0;
02876 psij1=psij_const1;
02877
02878 nfft_uo2(&u0,&o0,*xj0, n0, m);
02879 nfft_uo2(&u1,&o1,*xj1, n1, m);
02880
02881 if(u0<o0)
02882 if(u1<o1)
02883 for(l0=0; l0<=2*m+1; l0++,psij0++)
02884 {
02885 psij1=psij_const1;
02886 gj=g+(u0+l0)*n1+u1;
02887 for(l1=0; l1<=2*m+1; l1++)
02888 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02889 }
02890 else
02891 for(l0=0; l0<=2*m+1; l0++,psij0++)
02892 {
02893 psij1=psij_const1;
02894 gj=g+(u0+l0)*n1+u1;
02895 for(l1=0; l1<2*m+1-o1; l1++)
02896 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02897 gj=g+(u0+l0)*n1;
02898 for(l1=0; l1<=o1; l1++)
02899 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02900 }
02901 else
02902 if(u1<o1)
02903 {
02904 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02905 {
02906 psij1=psij_const1;
02907 gj=g+(u0+l0)*n1+u1;
02908 for(l1=0; l1<=2*m+1; l1++)
02909 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02910 }
02911 for(l0=0; l0<=o0; l0++,psij0++)
02912 {
02913 psij1=psij_const1;
02914 gj=g+l0*n1+u1;
02915 for(l1=0; l1<=2*m+1; l1++)
02916 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02917 }
02918 }
02919 else
02920 {
02921 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02922 {
02923 psij1=psij_const1;
02924 gj=g+(u0+l0)*n1+u1;
02925 for(l1=0; l1<2*m+1-o1; l1++)
02926 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02927 gj=g+(u0+l0)*n1;
02928 for(l1=0; l1<=o1; l1++)
02929 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02930 }
02931 for(l0=0; l0<=o0; l0++,psij0++)
02932 {
02933 psij1=psij_const1;
02934 gj=g+l0*n1+u1;
02935 for(l1=0; l1<2*m+1-o1; l1++)
02936 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02937 gj=g+l0*n1;
02938 for(l1=0; l1<=o1; l1++)
02939 (*gj++) += (*psij0) * (*psij1++) * (*fj);
02940 }
02941 }
02942 }
02943 #endif
02944
02945 static void nfft_trafo_2d_B(nfft_plan *ths)
02946 {
02947 const C *g = (C*)ths->g;
02948 const int N0 = ths->N[0];
02949 const int n0 = ths->n[0];
02950 const int N1 = ths->N[1];
02951 const int n1 = ths->n[1];
02952 const int M = ths->M_total;
02953 const int m = ths->m;
02954
02955 int k;
02956
02957 if(ths->nfft_flags & PRE_FULL_PSI)
02958 {
02959 const int lprod = (2*m+2) * (2*m+2);
02960 #pragma omp parallel for default(shared) private(k)
02961 for (k = 0; k < M; k++)
02962 {
02963 int l;
02964 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02965 ths->f[j] = K(0.0);
02966 for (l = 0; l < lprod; l++)
02967 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
02968 }
02969 return;
02970 }
02971
02972 if(ths->nfft_flags & PRE_PSI)
02973 {
02974 #pragma omp parallel for default(shared) private(k)
02975 for (k = 0; k < M; k++)
02976 {
02977 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02978 nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
02979 }
02980
02981 return;
02982 }
02983
02984 if(ths->nfft_flags & PRE_FG_PSI)
02985 {
02986 R fg_exp_l[2*(2*m+2)];
02987
02988 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02989 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
02990
02991 #pragma omp parallel for default(shared) private(k)
02992 for (k = 0; k < M; k++)
02993 {
02994 R psij_const[2*(2*m+2)];
02995 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02996 int l;
02997 R fg_psij0 = ths->psi[2*j*2];
02998 R fg_psij1 = ths->psi[2*j*2+1];
02999 R fg_psij2 = K(1.0);
03000
03001 psij_const[0] = fg_psij0;
03002 for(l=1; l<=2*m+1; l++)
03003 {
03004 fg_psij2 *= fg_psij1;
03005 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03006 }
03007
03008 fg_psij0 = ths->psi[2*(j*2+1)];
03009 fg_psij1 = ths->psi[2*(j*2+1)+1];
03010 fg_psij2 = K(1.0);
03011 psij_const[2*m+2] = fg_psij0;
03012 for(l=1; l<=2*m+1; l++)
03013 {
03014 fg_psij2 *= fg_psij1;
03015 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03016 }
03017
03018 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03019 }
03020
03021 return;
03022 }
03023
03024 if(ths->nfft_flags & FG_PSI)
03025 {
03026 R fg_exp_l[2*(2*m+2)];
03027
03028 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03029 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03030
03031 nfft_sort_nodes(ths);
03032
03033 #pragma omp parallel for default(shared) private(k)
03034 for (k = 0; k < M; k++)
03035 {
03036 int u, o, l;
03037 R fg_psij0, fg_psij1, fg_psij2;
03038 R psij_const[2*(2*m+2)];
03039 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03040
03041 nfft_uo(ths,j,&u,&o,0);
03042 fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0));
03043 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]);
03044 fg_psij2 = K(1.0);
03045 psij_const[0] = fg_psij0;
03046 for(l=1; l<=2*m+1; l++)
03047 {
03048 fg_psij2 *= fg_psij1;
03049 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03050 }
03051
03052 nfft_uo(ths,j,&u,&o,1);
03053 fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1));
03054 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]);
03055 fg_psij2 = K(1.0);
03056 psij_const[2*m+2] = fg_psij0;
03057 for(l=1; l<=2*m+1; l++)
03058 {
03059 fg_psij2 *= fg_psij1;
03060 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03061 }
03062
03063 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03064 }
03065
03066 return;
03067 }
03068
03069 if(ths->nfft_flags & PRE_LIN_PSI)
03070 {
03071 const int K = ths->K, ip_s = K / (m + 2);
03072
03073 nfft_sort_nodes(ths);
03074
03075 #pragma omp parallel for default(shared) private(k)
03076 for (k = 0; k < M; k++)
03077 {
03078 int u, o, l;
03079 R ip_y, ip_w;
03080 int ip_u;
03081 R psij_const[2*(2*m+2)];
03082 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03083
03084 nfft_uo(ths,j,&u,&o,0);
03085 ip_y = FABS(n0*ths->x[2*j] - u)*((R)ip_s);
03086 ip_u = LRINT(FLOOR(ip_y));
03087 ip_w = ip_y-ip_u;
03088 for(l=0; l < 2*m+2; l++)
03089 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
03090
03091 nfft_uo(ths,j,&u,&o,1);
03092 ip_y = FABS(n1*ths->x[2*j+1] - u)*((R)ip_s);
03093 ip_u = LRINT(FLOOR(ip_y));
03094 ip_w = ip_y-ip_u;
03095 for(l=0; l < 2*m+2; l++)
03096 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
03097
03098 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03099 }
03100 return;
03101 }
03102
03103
03104
03105 nfft_sort_nodes(ths);
03106
03107 #pragma omp parallel for default(shared) private(k)
03108 for (k = 0; k < M; k++)
03109 {
03110 R psij_const[2*(2*m+2)];
03111 int u, o, l;
03112 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03113
03114 nfft_uo(ths,j,&u,&o,0);
03115 for(l=0;l<=2*m+1;l++)
03116 psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0));
03117
03118 nfft_uo(ths,j,&u,&o,1);
03119 for(l=0;l<=2*m+1;l++)
03120 psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1));
03121
03122 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03123 }
03124 }
03125
03126 #ifdef OMP_ASSERT
03127 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
03128 { \
03129 assert(ar_x[2*k] >= min_u_a || k == M-1); \
03130 if (k > 0) \
03131 assert(ar_x[2*k-2] < min_u_a); \
03132 }
03133 #else
03134 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
03135 #endif
03136
03137 #ifdef OMP_ASSERT
03138 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
03139 { \
03140 assert(ar_x[2*k] >= min_u_b || k == M-1); \
03141 if (k > 0) \
03142 assert(ar_x[2*k-2] < min_u_b); \
03143 }
03144 #else
03145 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
03146 #endif
03147
03148 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
03149 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03150 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
03151 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
03152
03153 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
03154 { \
03155 R psij_const[2*(2*m+2)]; \
03156 int u, o, l; \
03157 R fg_psij0 = ths->psi[2*j*2]; \
03158 R fg_psij1 = ths->psi[2*j*2+1]; \
03159 R fg_psij2 = K(1.0); \
03160 \
03161 psij_const[0] = fg_psij0; \
03162 for(l=1; l<=2*m+1; l++) \
03163 { \
03164 fg_psij2 *= fg_psij1; \
03165 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
03166 } \
03167 \
03168 fg_psij0 = ths->psi[2*(j*2+1)]; \
03169 fg_psij1 = ths->psi[2*(j*2+1)+1]; \
03170 fg_psij2 = K(1.0); \
03171 psij_const[2*m+2] = fg_psij0; \
03172 for(l=1; l<=2*m+1; l++) \
03173 { \
03174 fg_psij2 *= fg_psij1; \
03175 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
03176 } \
03177 \
03178 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03179 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03180 n0, n1, m, my_u0, my_o0); \
03181 }
03182
03183 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
03184 { \
03185 R psij_const[2*(2*m+2)]; \
03186 R fg_psij0, fg_psij1, fg_psij2; \
03187 int u, o, l; \
03188 \
03189 nfft_uo(ths,j,&u,&o,0); \
03190 fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0)); \
03191 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
03192 fg_psij2 = K(1.0); \
03193 psij_const[0] = fg_psij0; \
03194 for(l=1; l<=2*m+1; l++) \
03195 { \
03196 fg_psij2 *= fg_psij1; \
03197 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
03198 } \
03199 \
03200 nfft_uo(ths,j,&u,&o,1); \
03201 fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1)); \
03202 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
03203 fg_psij2 = K(1.0); \
03204 psij_const[2*m+2] = fg_psij0; \
03205 for(l=1; l<=2*m+1; l++) \
03206 { \
03207 fg_psij2 *= fg_psij1; \
03208 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
03209 } \
03210 \
03211 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03212 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03213 n0, n1, m, my_u0, my_o0); \
03214 }
03215
03216 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
03217 { \
03218 R psij_const[2*(2*m+2)]; \
03219 int u, o, l; \
03220 int ip_u; \
03221 R ip_y, ip_w; \
03222 \
03223 nfft_uo(ths,j,&u,&o,0); \
03224 ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
03225 ip_u = LRINT(FLOOR(ip_y)); \
03226 ip_w = ip_y-ip_u; \
03227 for(l=0; l < 2*m+2; l++) \
03228 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
03229 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
03230 \
03231 nfft_uo(ths,j,&u,&o,1); \
03232 ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
03233 ip_u = LRINT(FLOOR(ip_y)); \
03234 ip_w = ip_y-ip_u; \
03235 for(l=0; l < 2*m+2; l++) \
03236 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
03237 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
03238 \
03239 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03240 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03241 n0, n1, m, my_u0, my_o0); \
03242 }
03243
03244 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
03245 { \
03246 R psij_const[2*(2*m+2)]; \
03247 int u, o, l; \
03248 \
03249 nfft_uo(ths,j,&u,&o,0); \
03250 for(l=0;l<=2*m+1;l++) \
03251 psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0)); \
03252 \
03253 nfft_uo(ths,j,&u,&o,1); \
03254 for(l=0;l<=2*m+1;l++) \
03255 psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
03256 \
03257 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03258 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03259 n0, n1, m, my_u0, my_o0); \
03260 }
03261
03262 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
03263 { \
03264 if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
03265 { \
03266 _Pragma("omp parallel private(k)") \
03267 { \
03268 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
03269 int *ar_x = ths->index_x; \
03270 \
03271 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
03272 &min_u_b, &max_u_b, 2, ths->n, m); \
03273 \
03274 if (min_u_a != -1) \
03275 { \
03276 k = index_x_binary_search(ar_x, M, min_u_a); \
03277 \
03278 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
03279 \
03280 while (k < M) \
03281 { \
03282 int u_prod = ar_x[2*k]; \
03283 int j = ar_x[2*k+1]; \
03284 \
03285 if (u_prod < min_u_a || u_prod > max_u_a) \
03286 break; \
03287 \
03288 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
03289 \
03290 k++; \
03291 } \
03292 } \
03293 \
03294 if (min_u_b != -1) \
03295 { \
03296 int k = index_x_binary_search(ar_x, M, min_u_b); \
03297 \
03298 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
03299 \
03300 while (k < M) \
03301 { \
03302 int u_prod = ar_x[2*k]; \
03303 int j = ar_x[2*k+1]; \
03304 \
03305 if (u_prod < min_u_b || u_prod > max_u_b) \
03306 break; \
03307 \
03308 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
03309 \
03310 k++; \
03311 } \
03312 } \
03313 } \
03314 return; \
03315 } \
03316 }
03317
03318
03319 static void nfft_adjoint_2d_B(nfft_plan *ths)
03320 {
03321 const int N0 = ths->N[0];
03322 const int n0 = ths->n[0];
03323 const int N1 = ths->N[1];
03324 const int n1 = ths->n[1];
03325 const int M = ths->M_total;
03326 const int m = ths->m;
03327 C* g = (C*) ths->g;
03328 int k;
03329
03330 memset(g,0,ths->n_total*sizeof(C));
03331
03332 if(ths->nfft_flags & PRE_FULL_PSI)
03333 {
03334 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
03335 2, ths->n, m, ths->nfft_flags, ths->index_x);
03336 return;
03337 }
03338
03339 if(ths->nfft_flags & PRE_PSI)
03340 {
03341 #ifdef _OPENMP
03342 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
03343 #endif
03344
03345 #pragma omp parallel for default(shared) private(k)
03346 for (k = 0; k < M; k++)
03347 {
03348 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03349 #ifdef _OPENMP
03350 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03351 #else
03352 nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03353 #endif
03354 }
03355 return;
03356 }
03357
03358 if(ths->nfft_flags & PRE_FG_PSI)
03359 {
03360 R fg_exp_l[2*(2*m+2)];
03361
03362 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03363 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03364
03365 #ifdef _OPENMP
03366 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
03367 #endif
03368
03369
03370 #pragma omp parallel for default(shared) private(k)
03371 for (k = 0; k < M; k++)
03372 {
03373 R psij_const[2*(2*m+2)];
03374 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03375 int l;
03376 R fg_psij0 = ths->psi[2*j*2];
03377 R fg_psij1 = ths->psi[2*j*2+1];
03378 R fg_psij2 = K(1.0);
03379
03380 psij_const[0] = fg_psij0;
03381 for(l=1; l<=2*m+1; l++)
03382 {
03383 fg_psij2 *= fg_psij1;
03384 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03385 }
03386
03387 fg_psij0 = ths->psi[2*(j*2+1)];
03388 fg_psij1 = ths->psi[2*(j*2+1)+1];
03389 fg_psij2 = K(1.0);
03390 psij_const[2*m+2] = fg_psij0;
03391 for(l=1; l<=2*m+1; l++)
03392 {
03393 fg_psij2 *= fg_psij1;
03394 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03395 }
03396
03397 #ifdef _OPENMP
03398 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03399 #else
03400 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03401 #endif
03402 }
03403
03404 return;
03405 }
03406
03407 if(ths->nfft_flags & FG_PSI)
03408 {
03409 R fg_exp_l[2*(2*m+2)];
03410
03411 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03412 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03413
03414 nfft_sort_nodes(ths);
03415
03416 #ifdef _OPENMP
03417 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
03418 #endif
03419
03420 #pragma omp parallel for default(shared) private(k)
03421 for (k = 0; k < M; k++)
03422 {
03423 int u, o, l;
03424 R fg_psij0, fg_psij1, fg_psij2;
03425 R psij_const[2*(2*m+2)];
03426 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03427
03428 nfft_uo(ths,j,&u,&o,0);
03429 fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0));
03430 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]);
03431 fg_psij2 = K(1.0);
03432 psij_const[0] = fg_psij0;
03433 for(l=1; l<=2*m+1; l++)
03434 {
03435 fg_psij2 *= fg_psij1;
03436 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03437 }
03438
03439 nfft_uo(ths,j,&u,&o,1);
03440 fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1));
03441 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]);
03442 fg_psij2 = K(1.0);
03443 psij_const[2*m+2] = fg_psij0;
03444 for(l=1; l<=2*m+1; l++)
03445 {
03446 fg_psij2 *= fg_psij1;
03447 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03448 }
03449
03450 #ifdef _OPENMP
03451 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03452 #else
03453 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03454 #endif
03455 }
03456
03457 return;
03458 }
03459
03460 if(ths->nfft_flags & PRE_LIN_PSI)
03461 {
03462 const int K = ths->K;
03463 const int ip_s = K / (m + 2);
03464
03465 nfft_sort_nodes(ths);
03466
03467 #ifdef _OPENMP
03468 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
03469 #endif
03470
03471 #pragma openmp parallel for default(shared) private(k)
03472 for (k = 0; k < M; k++)
03473 {
03474 int u,o,l;
03475 int ip_u;
03476 R ip_y, ip_w;
03477 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03478 R psij_const[2*(2*m+2)];
03479
03480 nfft_uo(ths,j,&u,&o,0);
03481 ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s);
03482 ip_u = LRINT(FLOOR(ip_y));
03483 ip_w = ip_y-ip_u;
03484 for(l=0; l < 2*m+2; l++)
03485 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
03486 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
03487
03488 nfft_uo(ths,j,&u,&o,1);
03489 ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s);
03490 ip_u = LRINT(FLOOR(ip_y));
03491 ip_w = ip_y-ip_u;
03492 for(l=0; l < 2*m+2; l++)
03493 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
03494 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
03495
03496 #ifdef _OPENMP
03497 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03498 #else
03499 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03500 #endif
03501 }
03502 return;
03503 }
03504
03505
03506 nfft_sort_nodes(ths);
03507
03508 #ifdef _OPENMP
03509 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
03510 #endif
03511
03512 #pragma omp parallel for default(shared) private(k)
03513 for (k = 0; k < M; k++)
03514 {
03515 int u,o,l;
03516 R psij_const[2*(2*m+2)];
03517 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03518
03519 nfft_uo(ths,j,&u,&o,0);
03520 for(l=0;l<=2*m+1;l++)
03521 psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0));
03522
03523 nfft_uo(ths,j,&u,&o,1);
03524 for(l=0;l<=2*m+1;l++)
03525 psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1));
03526
03527 #ifdef _OPENMP
03528 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03529 #else
03530 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03531 #endif
03532 }
03533 }
03534
03535
03536 void nfft_trafo_2d(nfft_plan *ths)
03537 {
03538 int k0,k1,n0,n1,N0,N1;
03539 C *g_hat,*f_hat;
03540 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
03541 R ck01, ck02, ck11, ck12;
03542 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
03543
03544 ths->g_hat=ths->g1;
03545 ths->g=ths->g2;
03546
03547 N0=ths->N[0];
03548 N1=ths->N[1];
03549 n0=ths->n[0];
03550 n1=ths->n[1];
03551
03552 f_hat=(C*)ths->f_hat;
03553 g_hat=(C*)ths->g_hat;
03554
03555 TIC(0)
03556 #ifdef _OPENMP
03557 #pragma omp parallel for default(shared) private(k0)
03558 for (k0 = 0; k0 < ths->n_total; k0++)
03559 ths->g_hat[k0] = 0.0;
03560 #else
03561 memset(ths->g_hat,0,ths->n_total*sizeof(C));
03562 #endif
03563 if(ths->nfft_flags & PRE_PHI_HUT)
03564 {
03565 c_phi_inv01=ths->c_phi_inv[0];
03566 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
03567
03568 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
03569 for(k0=0;k0<N0/2;k0++)
03570 {
03571 ck01=c_phi_inv01[k0];
03572 ck02=c_phi_inv02[k0];
03573
03574 c_phi_inv11=ths->c_phi_inv[1];
03575 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
03576
03577 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
03578 f_hat11=f_hat + k0*N1;
03579 g_hat21=g_hat + k0*n1+n1-(N1/2);
03580 f_hat21=f_hat + ((N0/2)+k0)*N1;
03581 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
03582 f_hat12=f_hat + k0*N1+(N1/2);
03583 g_hat22=g_hat + k0*n1;
03584 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
03585
03586 for(k1=0;k1<N1/2;k1++)
03587 {
03588 ck11=c_phi_inv11[k1];
03589 ck12=c_phi_inv12[k1];
03590
03591 g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
03592 g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
03593 g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
03594 g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
03595 }
03596 }
03597 }
03598 else
03599 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
03600 for(k0=0;k0<N0/2;k0++)
03601 {
03602 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
03603 ck02=K(1.0)/(PHI_HUT(k0,0));
03604 for(k1=0;k1<N1/2;k1++)
03605 {
03606 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
03607 ck12=K(1.0)/(PHI_HUT(k1,1));
03608 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
03609 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
03610 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
03611 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
03612 }
03613 }
03614
03615 TOC(0)
03616
03617 TIC_FFTW(1)
03618 fftw_execute(ths->my_fftw_plan1);
03619 TOC_FFTW(1);
03620
03621 TIC(2);
03622 nfft_trafo_2d_B(ths);
03623 TOC(2);
03624 }
03625
03626 void nfft_adjoint_2d(nfft_plan *ths)
03627 {
03628 int k0,k1,n0,n1,N0,N1;
03629 C *g_hat,*f_hat;
03630 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
03631 R ck01, ck02, ck11, ck12;
03632 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
03633
03634 ths->g_hat=ths->g1;
03635 ths->g=ths->g2;
03636
03637 N0=ths->N[0];
03638 N1=ths->N[1];
03639 n0=ths->n[0];
03640 n1=ths->n[1];
03641
03642 f_hat=(C*)ths->f_hat;
03643 g_hat=(C*)ths->g_hat;
03644
03645 TIC(2);
03646 nfft_adjoint_2d_B(ths);
03647 TOC(2);
03648
03649 TIC_FFTW(1)
03650 fftw_execute(ths->my_fftw_plan2);
03651 TOC_FFTW(1);
03652
03653 TIC(0)
03654 if(ths->nfft_flags & PRE_PHI_HUT)
03655 {
03656 c_phi_inv01=ths->c_phi_inv[0];
03657 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
03658
03659 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
03660 for(k0=0;k0<N0/2;k0++)
03661 {
03662 ck01=c_phi_inv01[k0];
03663 ck02=c_phi_inv02[k0];
03664
03665 c_phi_inv11=ths->c_phi_inv[1];
03666 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
03667
03668 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
03669 f_hat11=f_hat + k0*N1;
03670 g_hat21=g_hat + k0*n1+n1-(N1/2);
03671 f_hat21=f_hat + ((N0/2)+k0)*N1;
03672 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
03673 f_hat12=f_hat + k0*N1+(N1/2);
03674 g_hat22=g_hat + k0*n1;
03675 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
03676
03677 for(k1=0;k1<N1/2;k1++)
03678 {
03679 ck11=c_phi_inv11[k1];
03680 ck12=c_phi_inv12[k1];
03681
03682 f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
03683 f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
03684 f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
03685 f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
03686 }
03687 }
03688 }
03689 else
03690 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
03691 for(k0=0;k0<N0/2;k0++)
03692 {
03693 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
03694 ck02=K(1.0)/(PHI_HUT(k0,0));
03695 for(k1=0;k1<N1/2;k1++)
03696 {
03697 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
03698 ck12=K(1.0)/(PHI_HUT(k1,1));
03699 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
03700 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
03701 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
03702 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
03703 }
03704 }
03705 TOC(0)
03706 }
03707
03708
03709
03710 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
03711 {
03712 int l;
03713 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
03714
03715 fg_exp_b0 = EXP(-1.0/b);
03716 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
03717 fg_exp_b1 = K(1.0);
03718 fg_exp_b2 = K(1.0);
03719 fg_exp_l[0] = K(1.0);
03720 for(l=1; l <= 2*m+1; l++)
03721 {
03722 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
03723 fg_exp_b1 *= fg_exp_b0_sq;
03724 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
03725 }
03726 }
03727
03728 static void nfft_trafo_3d_compute(C *fj, const C *g,
03729 const R *psij_const0, const R *psij_const1, const R *psij_const2,
03730 const R *xj0, const R *xj1, const R *xj2,
03731 const int n0, const int n1, const int n2, const int m)
03732 {
03733 int u0,o0,l0,u1,o1,l1,u2,o2,l2;
03734 const C *gj;
03735 const R *psij0,*psij1,*psij2;
03736
03737 psij0=psij_const0;
03738 psij1=psij_const1;
03739 psij2=psij_const2;
03740
03741 nfft_uo2(&u0,&o0,*xj0, n0, m);
03742 nfft_uo2(&u1,&o1,*xj1, n1, m);
03743 nfft_uo2(&u2,&o2,*xj2, n2, m);
03744
03745 *fj=0;
03746
03747 if(u0<o0)
03748 if(u1<o1)
03749 if(u2<o2)
03750 for(l0=0; l0<=2*m+1; l0++,psij0++)
03751 {
03752 psij1=psij_const1;
03753 for(l1=0; l1<=2*m+1; l1++,psij1++)
03754 {
03755 psij2=psij_const2;
03756 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03757 for(l2=0; l2<=2*m+1; l2++)
03758 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03759 }
03760 }
03761 else
03762 for(l0=0; l0<=2*m+1; l0++,psij0++)
03763 {
03764 psij1=psij_const1;
03765 for(l1=0; l1<=2*m+1; l1++,psij1++)
03766 {
03767 psij2=psij_const2;
03768 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03769 for(l2=0; l2<2*m+1-o2; l2++)
03770 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03771 gj=g+((u0+l0)*n1+(u1+l1))*n2;
03772 for(l2=0; l2<=o2; l2++)
03773 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03774 }
03775 }
03776 else
03777 if(u2<o2)
03778 for(l0=0; l0<=2*m+1; l0++,psij0++)
03779 {
03780 psij1=psij_const1;
03781 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03782 {
03783 psij2=psij_const2;
03784 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03785 for(l2=0; l2<=2*m+1; l2++)
03786 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03787 }
03788 for(l1=0; l1<=o1; l1++,psij1++)
03789 {
03790 psij2=psij_const2;
03791 gj=g+((u0+l0)*n1+l1)*n2+u2;
03792 for(l2=0; l2<=2*m+1; l2++)
03793 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03794 }
03795 }
03796 else
03797 {
03798 for(l0=0; l0<=2*m+1; l0++,psij0++)
03799 {
03800 psij1=psij_const1;
03801 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03802 {
03803 psij2=psij_const2;
03804 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03805 for(l2=0; l2<2*m+1-o2; l2++)
03806 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03807 gj=g+((u0+l0)*n1+(u1+l1))*n2;
03808 for(l2=0; l2<=o2; l2++)
03809 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03810 }
03811 for(l1=0; l1<=o1; l1++,psij1++)
03812 {
03813 psij2=psij_const2;
03814 gj=g+((u0+l0)*n1+l1)*n2+u2;
03815 for(l2=0; l2<2*m+1-o2; l2++)
03816 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03817 gj=g+((u0+l0)*n1+l1)*n2;
03818 for(l2=0; l2<=o2; l2++)
03819 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03820 }
03821 }
03822 }
03823 else
03824 if(u1<o1)
03825 if(u2<o2)
03826 {
03827 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03828 {
03829 psij1=psij_const1;
03830 for(l1=0; l1<=2*m+1; l1++,psij1++)
03831 {
03832 psij2=psij_const2;
03833 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03834 for(l2=0; l2<=2*m+1; l2++)
03835 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03836 }
03837 }
03838
03839 for(l0=0; l0<=o0; l0++,psij0++)
03840 {
03841 psij1=psij_const1;
03842 for(l1=0; l1<=2*m+1; l1++,psij1++)
03843 {
03844 psij2=psij_const2;
03845 gj=g+(l0*n1+(u1+l1))*n2+u2;
03846 for(l2=0; l2<=2*m+1; l2++)
03847 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03848 }
03849 }
03850 }
03851 else
03852 {
03853 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03854 {
03855 psij1=psij_const1;
03856 for(l1=0; l1<=2*m+1; l1++,psij1++)
03857 {
03858 psij2=psij_const2;
03859 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03860 for(l2=0; l2<2*m+1-o2; l2++)
03861 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03862 gj=g+((u0+l0)*n1+(u1+l1))*n2;
03863 for(l2=0; l2<=o2; l2++)
03864 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03865 }
03866 }
03867
03868 for(l0=0; l0<=o0; l0++,psij0++)
03869 {
03870 psij1=psij_const1;
03871 for(l1=0; l1<=2*m+1; l1++,psij1++)
03872 {
03873 psij2=psij_const2;
03874 gj=g+(l0*n1+(u1+l1))*n2+u2;
03875 for(l2=0; l2<2*m+1-o2; l2++)
03876 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03877 gj=g+(l0*n1+(u1+l1))*n2;
03878 for(l2=0; l2<=o2; l2++)
03879 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03880 }
03881 }
03882 }
03883 else
03884 if(u2<o2)
03885 {
03886 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03887 {
03888 psij1=psij_const1;
03889 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03890 {
03891 psij2=psij_const2;
03892 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03893 for(l2=0; l2<=2*m+1; l2++)
03894 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03895 }
03896 for(l1=0; l1<=o1; l1++,psij1++)
03897 {
03898 psij2=psij_const2;
03899 gj=g+((u0+l0)*n1+l1)*n2+u2;
03900 for(l2=0; l2<=2*m+1; l2++)
03901 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03902 }
03903 }
03904 for(l0=0; l0<=o0; l0++,psij0++)
03905 {
03906 psij1=psij_const1;
03907 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03908 {
03909 psij2=psij_const2;
03910 gj=g+(l0*n1+(u1+l1))*n2+u2;
03911 for(l2=0; l2<=2*m+1; l2++)
03912 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03913 }
03914 for(l1=0; l1<=o1; l1++,psij1++)
03915 {
03916 psij2=psij_const2;
03917 gj=g+(l0*n1+l1)*n2+u2;
03918 for(l2=0; l2<=2*m+1; l2++)
03919 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03920 }
03921 }
03922 }
03923 else
03924 {
03925 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03926 {
03927 psij1=psij_const1;
03928 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03929 {
03930 psij2=psij_const2;
03931 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03932 for(l2=0; l2<2*m+1-o2; l2++)
03933 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03934 gj=g+((u0+l0)*n1+(u1+l1))*n2;
03935 for(l2=0; l2<=o2; l2++)
03936 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03937 }
03938 for(l1=0; l1<=o1; l1++,psij1++)
03939 {
03940 psij2=psij_const2;
03941 gj=g+((u0+l0)*n1+l1)*n2+u2;
03942 for(l2=0; l2<2*m+1-o2; l2++)
03943 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03944 gj=g+((u0+l0)*n1+l1)*n2;
03945 for(l2=0; l2<=o2; l2++)
03946 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03947 }
03948 }
03949
03950 for(l0=0; l0<=o0; l0++,psij0++)
03951 {
03952 psij1=psij_const1;
03953 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03954 {
03955 psij2=psij_const2;
03956 gj=g+(l0*n1+(u1+l1))*n2+u2;
03957 for(l2=0; l2<2*m+1-o2; l2++)
03958 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03959 gj=g+(l0*n1+(u1+l1))*n2;
03960 for(l2=0; l2<=o2; l2++)
03961 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03962 }
03963 for(l1=0; l1<=o1; l1++,psij1++)
03964 {
03965 psij2=psij_const2;
03966 gj=g+(l0*n1+l1)*n2+u2;
03967 for(l2=0; l2<2*m+1-o2; l2++)
03968 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03969 gj=g+(l0*n1+l1)*n2;
03970 for(l2=0; l2<=o2; l2++)
03971 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03972 }
03973 }
03974 }
03975 }
03976
03977 #ifdef _OPENMP
03978
03999 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
04000 const R *psij_const0, const R *psij_const1, const R *psij_const2,
04001 const R *xj0, const R *xj1, const R *xj2,
04002 const int n0, const int n1, const int n2, const int m,
04003 const int my_u0, const int my_o0)
04004 {
04005 int ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
04006 const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
04007
04008 unsigned long int index_temp1[2*m+2];
04009 unsigned long int index_temp2[2*m+2];
04010
04011 nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
04012 nfft_uo2(&u1,&o1,*xj1, n1, m);
04013 nfft_uo2(&u2,&o2,*xj2, n2, m);
04014
04015 for (l1=0; l1<=2*m+1; l1++)
04016 index_temp1[l1] = (u1+l1)%n1;
04017
04018 for (l2=0; l2<=2*m+1; l2++)
04019 index_temp2[l2] = (u2+l2)%n2;
04020
04021 if(ar_u0<ar_o0)
04022 {
04023 int u0 = MAX(my_u0,ar_u0);
04024 int o0 = MIN(my_o0,ar_o0);
04025 int offset_psij = u0-ar_u0;
04026 #ifdef OMP_ASSERT
04027 assert(offset_psij >= 0);
04028 assert(o0-u0 <= 2*m+1);
04029 assert(offset_psij+o0-u0 <= 2*m+1);
04030 #endif
04031
04032 for (l0 = 0; l0 <= o0-u0; l0++)
04033 {
04034 const unsigned long int i0 = (u0+l0) * n1;
04035 const C val0 = psij_const0[offset_psij+l0];
04036
04037 for(l1=0; l1<=2*m+1; l1++)
04038 {
04039 const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
04040 const C val1 = psij_const1[l1];
04041
04042 for(l2=0; l2<=2*m+1; l2++)
04043 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04044 }
04045 }
04046 }
04047 else
04048 {
04049 int u0 = MAX(my_u0,ar_u0);
04050 int o0 = my_o0;
04051 int offset_psij = u0-ar_u0;
04052 #ifdef OMP_ASSERT
04053 assert(offset_psij >= 0);
04054 assert(o0-u0 <= 2*m+1);
04055 assert(offset_psij+o0-u0 <= 2*m+1);
04056 #endif
04057
04058 for (l0 = 0; l0 <= o0-u0; l0++)
04059 {
04060 unsigned long int i0 = (u0+l0) * n1;
04061 const C val0 = psij_const0[offset_psij+l0];
04062
04063 for(l1=0; l1<=2*m+1; l1++)
04064 {
04065 const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
04066 const C val1 = psij_const1[l1];
04067
04068 for(l2=0; l2<=2*m+1; l2++)
04069 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04070 }
04071 }
04072
04073 u0 = my_u0;
04074 o0 = MIN(my_o0,ar_o0);
04075 offset_psij += my_u0-ar_u0+n0;
04076
04077 #ifdef OMP_ASSERT
04078 if (u0<=o0)
04079 {
04080 assert(o0-u0 <= 2*m+1);
04081 assert(offset_psij+o0-u0 <= 2*m+1);
04082 }
04083 #endif
04084 for (l0 = 0; l0 <= o0-u0; l0++)
04085 {
04086 unsigned long int i0 = (u0+l0) * n1;
04087 const C val0 = psij_const0[offset_psij+l0];
04088
04089 for(l1=0; l1<=2*m+1; l1++)
04090 {
04091 const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
04092 const C val1 = psij_const1[l1];
04093
04094 for(l2=0; l2<=2*m+1; l2++)
04095 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04096 }
04097 }
04098 }
04099 }
04100 #endif
04101
04102 #ifdef _OPENMP
04103
04104 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
04105 const R *psij_const0, const R *psij_const1, const R *psij_const2,
04106 const R *xj0, const R *xj1, const R *xj2,
04107 const int n0, const int n1, const int n2, const int m)
04108 {
04109 int u0,o0,l0,u1,o1,l1,u2,o2,l2;
04110 const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
04111
04112 unsigned long int index_temp0[2*m+2];
04113 unsigned long int index_temp1[2*m+2];
04114 unsigned long int index_temp2[2*m+2];
04115
04116 nfft_uo2(&u0,&o0,*xj0, n0, m);
04117 nfft_uo2(&u1,&o1,*xj1, n1, m);
04118 nfft_uo2(&u2,&o2,*xj2, n2, m);
04119
04120 for (l0=0; l0<=2*m+1; l0++)
04121 index_temp0[l0] = (u0+l0)%n0;
04122
04123 for (l1=0; l1<=2*m+1; l1++)
04124 index_temp1[l1] = (u1+l1)%n1;
04125
04126 for (l2=0; l2<=2*m+1; l2++)
04127 index_temp2[l2] = (u2+l2)%n2;
04128
04129 for(l0=0; l0<=2*m+1; l0++)
04130 {
04131 for(l1=0; l1<=2*m+1; l1++)
04132 {
04133 for(l2=0; l2<=2*m+1; l2++)
04134 {
04135 unsigned long int i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
04136 C *lhs = g+i;
04137 R *lhs_real = (R*)lhs;
04138 C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
04139
04140 #pragma omp atomic
04141 lhs_real[0] += creal(val);
04142
04143 #pragma omp atomic
04144 lhs_real[1] += cimag(val);
04145 }
04146 }
04147 }
04148 }
04149 #endif
04150
04151 #ifndef _OPENMP
04152 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
04153 const R *psij_const0, const R *psij_const1, const R *psij_const2,
04154 const R *xj0, const R *xj1, const R *xj2,
04155 const int n0, const int n1, const int n2, const int m)
04156 {
04157 int u0,o0,l0,u1,o1,l1,u2,o2,l2;
04158 C *gj;
04159 const R *psij0,*psij1,*psij2;
04160
04161 psij0=psij_const0;
04162 psij1=psij_const1;
04163 psij2=psij_const2;
04164
04165 nfft_uo2(&u0,&o0,*xj0, n0, m);
04166 nfft_uo2(&u1,&o1,*xj1, n1, m);
04167 nfft_uo2(&u2,&o2,*xj2, n2, m);
04168
04169 if(u0<o0)
04170 if(u1<o1)
04171 if(u2<o2)
04172 for(l0=0; l0<=2*m+1; l0++,psij0++)
04173 {
04174 psij1=psij_const1;
04175 for(l1=0; l1<=2*m+1; l1++,psij1++)
04176 {
04177 psij2=psij_const2;
04178 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04179 for(l2=0; l2<=2*m+1; l2++)
04180 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04181 }
04182 }
04183 else
04184 for(l0=0; l0<=2*m+1; l0++,psij0++)
04185 {
04186 psij1=psij_const1;
04187 for(l1=0; l1<=2*m+1; l1++,psij1++)
04188 {
04189 psij2=psij_const2;
04190 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04191 for(l2=0; l2<2*m+1-o2; l2++)
04192 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04193 gj=g+((u0+l0)*n1+(u1+l1))*n2;
04194 for(l2=0; l2<=o2; l2++)
04195 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04196 }
04197 }
04198 else
04199 if(u2<o2)
04200 for(l0=0; l0<=2*m+1; l0++,psij0++)
04201 {
04202 psij1=psij_const1;
04203 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04204 {
04205 psij2=psij_const2;
04206 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04207 for(l2=0; l2<=2*m+1; l2++)
04208 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04209 }
04210 for(l1=0; l1<=o1; l1++,psij1++)
04211 {
04212 psij2=psij_const2;
04213 gj=g+((u0+l0)*n1+l1)*n2+u2;
04214 for(l2=0; l2<=2*m+1; l2++)
04215 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04216 }
04217 }
04218 else
04219 {
04220 for(l0=0; l0<=2*m+1; l0++,psij0++)
04221 {
04222 psij1=psij_const1;
04223 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04224 {
04225 psij2=psij_const2;
04226 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04227 for(l2=0; l2<2*m+1-o2; l2++)
04228 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04229 gj=g+((u0+l0)*n1+(u1+l1))*n2;
04230 for(l2=0; l2<=o2; l2++)
04231 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04232 }
04233 for(l1=0; l1<=o1; l1++,psij1++)
04234 {
04235 psij2=psij_const2;
04236 gj=g+((u0+l0)*n1+l1)*n2+u2;
04237 for(l2=0; l2<2*m+1-o2; l2++)
04238 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04239 gj=g+((u0+l0)*n1+l1)*n2;
04240 for(l2=0; l2<=o2; l2++)
04241 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04242 }
04243 }
04244 }
04245 else
04246 if(u1<o1)
04247 if(u2<o2)
04248 {
04249 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04250 {
04251 psij1=psij_const1;
04252 for(l1=0; l1<=2*m+1; l1++,psij1++)
04253 {
04254 psij2=psij_const2;
04255 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04256 for(l2=0; l2<=2*m+1; l2++)
04257 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04258 }
04259 }
04260
04261 for(l0=0; l0<=o0; l0++,psij0++)
04262 {
04263 psij1=psij_const1;
04264 for(l1=0; l1<=2*m+1; l1++,psij1++)
04265 {
04266 psij2=psij_const2;
04267 gj=g+(l0*n1+(u1+l1))*n2+u2;
04268 for(l2=0; l2<=2*m+1; l2++)
04269 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04270 }
04271 }
04272 }
04273 else
04274 {
04275 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04276 {
04277 psij1=psij_const1;
04278 for(l1=0; l1<=2*m+1; l1++,psij1++)
04279 {
04280 psij2=psij_const2;
04281 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04282 for(l2=0; l2<2*m+1-o2; l2++)
04283 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04284 gj=g+((u0+l0)*n1+(u1+l1))*n2;
04285 for(l2=0; l2<=o2; l2++)
04286 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04287 }
04288 }
04289
04290 for(l0=0; l0<=o0; l0++,psij0++)
04291 {
04292 psij1=psij_const1;
04293 for(l1=0; l1<=2*m+1; l1++,psij1++)
04294 {
04295 psij2=psij_const2;
04296 gj=g+(l0*n1+(u1+l1))*n2+u2;
04297 for(l2=0; l2<2*m+1-o2; l2++)
04298 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04299 gj=g+(l0*n1+(u1+l1))*n2;
04300 for(l2=0; l2<=o2; l2++)
04301 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04302 }
04303 }
04304 }
04305 else
04306 if(u2<o2)
04307 {
04308 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04309 {
04310 psij1=psij_const1;
04311 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04312 {
04313 psij2=psij_const2;
04314 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04315 for(l2=0; l2<=2*m+1; l2++)
04316 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04317 }
04318 for(l1=0; l1<=o1; l1++,psij1++)
04319 {
04320 psij2=psij_const2;
04321 gj=g+((u0+l0)*n1+l1)*n2+u2;
04322 for(l2=0; l2<=2*m+1; l2++)
04323 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04324 }
04325 }
04326 for(l0=0; l0<=o0; l0++,psij0++)
04327 {
04328 psij1=psij_const1;
04329 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04330 {
04331 psij2=psij_const2;
04332 gj=g+(l0*n1+(u1+l1))*n2+u2;
04333 for(l2=0; l2<=2*m+1; l2++)
04334 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04335 }
04336 for(l1=0; l1<=o1; l1++,psij1++)
04337 {
04338 psij2=psij_const2;
04339 gj=g+(l0*n1+l1)*n2+u2;
04340 for(l2=0; l2<=2*m+1; l2++)
04341 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04342 }
04343 }
04344 }
04345 else
04346 {
04347 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04348 {
04349 psij1=psij_const1;
04350 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04351 {
04352 psij2=psij_const2;
04353 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04354 for(l2=0; l2<2*m+1-o2; l2++)
04355 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04356 gj=g+((u0+l0)*n1+(u1+l1))*n2;
04357 for(l2=0; l2<=o2; l2++)
04358 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04359 }
04360 for(l1=0; l1<=o1; l1++,psij1++)
04361 {
04362 psij2=psij_const2;
04363 gj=g+((u0+l0)*n1+l1)*n2+u2;
04364 for(l2=0; l2<2*m+1-o2; l2++)
04365 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04366 gj=g+((u0+l0)*n1+l1)*n2;
04367 for(l2=0; l2<=o2; l2++)
04368 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04369 }
04370 }
04371
04372 for(l0=0; l0<=o0; l0++,psij0++)
04373 {
04374 psij1=psij_const1;
04375 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04376 {
04377 psij2=psij_const2;
04378 gj=g+(l0*n1+(u1+l1))*n2+u2;
04379 for(l2=0; l2<2*m+1-o2; l2++)
04380 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04381 gj=g+(l0*n1+(u1+l1))*n2;
04382 for(l2=0; l2<=o2; l2++)
04383 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04384 }
04385 for(l1=0; l1<=o1; l1++,psij1++)
04386 {
04387 psij2=psij_const2;
04388 gj=g+(l0*n1+l1)*n2+u2;
04389 for(l2=0; l2<2*m+1-o2; l2++)
04390 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04391 gj=g+(l0*n1+l1)*n2;
04392 for(l2=0; l2<=o2; l2++)
04393 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04394 }
04395 }
04396 }
04397 }
04398 #endif
04399
04400 static void nfft_trafo_3d_B(nfft_plan *ths)
04401 {
04402 const int N0 = ths->N[0];
04403 const int n0 = ths->n[0];
04404 const int N1 = ths->N[1];
04405 const int n1 = ths->n[1];
04406 const int N2 = ths->N[2];
04407 const int n2 = ths->n[2];
04408 const int M = ths->M_total;
04409 const int m = ths->m;
04410
04411 const C* g = (C*) ths->g;
04412
04413 int k;
04414
04415 if(ths->nfft_flags & PRE_FULL_PSI)
04416 {
04417 const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
04418 #pragma omp parallel for default(shared) private(k)
04419 for (k = 0; k < M; k++)
04420 {
04421 int l;
04422 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04423 ths->f[j] = K(0.0);
04424 for (l = 0; l < lprod; l++)
04425 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
04426 }
04427 return;
04428 }
04429
04430 if(ths->nfft_flags & PRE_PSI)
04431 {
04432 #pragma omp parallel for default(shared) private(k)
04433 for (k = 0; k < M; k++)
04434 {
04435 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04436 nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04437 }
04438 return;
04439 }
04440
04441 if(ths->nfft_flags & PRE_FG_PSI)
04442 {
04443 R fg_exp_l[3*(2*m+2)];
04444
04445 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04446 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04447 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04448
04449 #pragma omp parallel for default(shared) private(k)
04450 for (k = 0; k < M; k++)
04451 {
04452 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04453 int l;
04454 R psij_const[3*(2*m+2)];
04455 R fg_psij0 = ths->psi[2*j*3];
04456 R fg_psij1 = ths->psi[2*j*3+1];
04457 R fg_psij2 = K(1.0);
04458
04459 psij_const[0] = fg_psij0;
04460 for(l=1; l<=2*m+1; l++)
04461 {
04462 fg_psij2 *= fg_psij1;
04463 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04464 }
04465
04466 fg_psij0 = ths->psi[2*(j*3+1)];
04467 fg_psij1 = ths->psi[2*(j*3+1)+1];
04468 fg_psij2 = K(1.0);
04469 psij_const[2*m+2] = fg_psij0;
04470 for(l=1; l<=2*m+1; l++)
04471 {
04472 fg_psij2 *= fg_psij1;
04473 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04474 }
04475
04476 fg_psij0 = ths->psi[2*(j*3+2)];
04477 fg_psij1 = ths->psi[2*(j*3+2)+1];
04478 fg_psij2 = K(1.0);
04479 psij_const[2*(2*m+2)] = fg_psij0;
04480 for(l=1; l<=2*m+1; l++)
04481 {
04482 fg_psij2 *= fg_psij1;
04483 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04484 }
04485
04486 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04487 }
04488
04489 return;
04490 }
04491
04492 if(ths->nfft_flags & FG_PSI)
04493 {
04494 R fg_exp_l[3*(2*m+2)];
04495
04496 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04497 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04498 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04499
04500 nfft_sort_nodes(ths);
04501
04502 #pragma omp parallel for default(shared) private(k)
04503 for (k = 0; k < M; k++)
04504 {
04505 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04506 int u, o, l;
04507 R psij_const[3*(2*m+2)];
04508 R fg_psij0, fg_psij1, fg_psij2;
04509
04510 nfft_uo(ths,j,&u,&o,0);
04511 fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0));
04512 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]);
04513 fg_psij2 = K(1.0);
04514 psij_const[0] = fg_psij0;
04515 for(l=1; l<=2*m+1; l++)
04516 {
04517 fg_psij2 *= fg_psij1;
04518 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04519 }
04520
04521 nfft_uo(ths,j,&u,&o,1);
04522 fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1));
04523 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]);
04524 fg_psij2 = K(1.0);
04525 psij_const[2*m+2] = fg_psij0;
04526 for(l=1; l<=2*m+1; l++)
04527 {
04528 fg_psij2 *= fg_psij1;
04529 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04530 }
04531
04532 nfft_uo(ths,j,&u,&o,2);
04533 fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2));
04534 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]);
04535 fg_psij2 = K(1.0);
04536 psij_const[2*(2*m+2)] = fg_psij0;
04537 for(l=1; l<=2*m+1; l++)
04538 {
04539 fg_psij2 *= fg_psij1;
04540 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04541 }
04542
04543 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04544 }
04545
04546 return;
04547 }
04548
04549 if(ths->nfft_flags & PRE_LIN_PSI)
04550 {
04551 const int K = ths->K, ip_s = K / (m + 2);
04552
04553 nfft_sort_nodes(ths);
04554
04555 #pragma omp parallel for default(shared) private(k)
04556 for (k = 0; k < M; k++)
04557 {
04558 int u, o, l;
04559 R ip_y, ip_w;
04560 int ip_u;
04561 R psij_const[3*(2*m+2)];
04562 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04563
04564 nfft_uo(ths,j,&u,&o,0);
04565 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s);
04566 ip_u = LRINT(FLOOR(ip_y));
04567 ip_w = ip_y-ip_u;
04568 for(l=0; l < 2*m+2; l++)
04569 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04570 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
04571
04572 nfft_uo(ths,j,&u,&o,1);
04573 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s);
04574 ip_u = LRINT(FLOOR(ip_y));
04575 ip_w = ip_y-ip_u;
04576 for(l=0; l < 2*m+2; l++)
04577 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04578 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
04579
04580 nfft_uo(ths,j,&u,&o,2);
04581 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s);
04582 ip_u = LRINT(FLOOR(ip_y));
04583 ip_w = ip_y-ip_u;
04584 for(l=0; l < 2*m+2; l++)
04585 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04586 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
04587
04588 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04589 }
04590 return;
04591 }
04592
04593
04594
04595 nfft_sort_nodes(ths);
04596
04597 #pragma omp parallel for default(shared) private(k)
04598 for (k = 0; k < M; k++)
04599 {
04600 R psij_const[3*(2*m+2)];
04601 int u, o, l;
04602 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04603
04604 nfft_uo(ths,j,&u,&o,0);
04605 for(l=0;l<=2*m+1;l++)
04606 psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0));
04607
04608 nfft_uo(ths,j,&u,&o,1);
04609 for(l=0;l<=2*m+1;l++)
04610 psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1));
04611
04612 nfft_uo(ths,j,&u,&o,2);
04613 for(l=0;l<=2*m+1;l++)
04614 psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2));
04615
04616 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04617 }
04618 }
04619
04620 #ifdef OMP_ASSERT
04621 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
04622 { \
04623 assert(ar_x[2*k] >= min_u_a || k == M-1); \
04624 if (k > 0) \
04625 assert(ar_x[2*k-2] < min_u_a); \
04626 }
04627 #else
04628 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
04629 #endif
04630
04631 #ifdef OMP_ASSERT
04632 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
04633 { \
04634 assert(ar_x[2*k] >= min_u_b || k == M-1); \
04635 if (k > 0) \
04636 assert(ar_x[2*k-2] < min_u_b); \
04637 }
04638 #else
04639 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
04640 #endif
04641
04642 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
04643 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04644 ths->psi+j*3*(2*m+2), \
04645 ths->psi+(j*3+1)*(2*m+2), \
04646 ths->psi+(j*3+2)*(2*m+2), \
04647 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04648 n0, n1, n2, m, my_u0, my_o0);
04649
04650 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
04651 { \
04652 int u, o, l; \
04653 R psij_const[3*(2*m+2)]; \
04654 R fg_psij0 = ths->psi[2*j*3]; \
04655 R fg_psij1 = ths->psi[2*j*3+1]; \
04656 R fg_psij2 = K(1.0); \
04657 \
04658 psij_const[0] = fg_psij0; \
04659 for(l=1; l<=2*m+1; l++) \
04660 { \
04661 fg_psij2 *= fg_psij1; \
04662 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
04663 } \
04664 \
04665 fg_psij0 = ths->psi[2*(j*3+1)]; \
04666 fg_psij1 = ths->psi[2*(j*3+1)+1]; \
04667 fg_psij2 = K(1.0); \
04668 psij_const[2*m+2] = fg_psij0; \
04669 for(l=1; l<=2*m+1; l++) \
04670 { \
04671 fg_psij2 *= fg_psij1; \
04672 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
04673 } \
04674 \
04675 fg_psij0 = ths->psi[2*(j*3+2)]; \
04676 fg_psij1 = ths->psi[2*(j*3+2)+1]; \
04677 fg_psij2 = K(1.0); \
04678 psij_const[2*(2*m+2)] = fg_psij0; \
04679 for(l=1; l<=2*m+1; l++) \
04680 { \
04681 fg_psij2 *= fg_psij1; \
04682 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
04683 } \
04684 \
04685 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04686 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04687 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04688 n0, n1, n2, m, my_u0, my_o0); \
04689 }
04690
04691 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
04692 { \
04693 int u, o, l; \
04694 R psij_const[3*(2*m+2)]; \
04695 R fg_psij0, fg_psij1, fg_psij2; \
04696 \
04697 nfft_uo(ths,j,&u,&o,0); \
04698 fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0)); \
04699 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
04700 fg_psij2 = K(1.0); \
04701 psij_const[0] = fg_psij0; \
04702 for(l=1; l<=2*m+1; l++) \
04703 { \
04704 fg_psij2 *= fg_psij1; \
04705 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
04706 } \
04707 \
04708 nfft_uo(ths,j,&u,&o,1); \
04709 fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1)); \
04710 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
04711 fg_psij2 = K(1.0); \
04712 psij_const[2*m+2] = fg_psij0; \
04713 for(l=1; l<=2*m+1; l++) \
04714 { \
04715 fg_psij2 *= fg_psij1; \
04716 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
04717 } \
04718 \
04719 nfft_uo(ths,j,&u,&o,2); \
04720 fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2)); \
04721 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
04722 fg_psij2 = K(1.0); \
04723 psij_const[2*(2*m+2)] = fg_psij0; \
04724 for(l=1; l<=2*m+1; l++) \
04725 { \
04726 fg_psij2 *= fg_psij1; \
04727 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
04728 } \
04729 \
04730 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04731 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04732 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04733 n0, n1, n2, m, my_u0, my_o0); \
04734 }
04735
04736 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
04737 { \
04738 int u, o, l; \
04739 R psij_const[3*(2*m+2)]; \
04740 int ip_u; \
04741 R ip_y, ip_w; \
04742 \
04743 nfft_uo(ths,j,&u,&o,0); \
04744 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
04745 ip_u = LRINT(FLOOR(ip_y)); \
04746 ip_w = ip_y-ip_u; \
04747 for(l=0; l < 2*m+2; l++) \
04748 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04749 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
04750 \
04751 nfft_uo(ths,j,&u,&o,1); \
04752 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
04753 ip_u = LRINT(FLOOR(ip_y)); \
04754 ip_w = ip_y-ip_u; \
04755 for(l=0; l < 2*m+2; l++) \
04756 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04757 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
04758 \
04759 nfft_uo(ths,j,&u,&o,2); \
04760 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
04761 ip_u = LRINT(FLOOR(ip_y)); \
04762 ip_w = ip_y-ip_u; \
04763 for(l=0; l < 2*m+2; l++) \
04764 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04765 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
04766 \
04767 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04768 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04769 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04770 n0, n1, n2, m, my_u0, my_o0); \
04771 }
04772
04773 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
04774 { \
04775 int u, o, l; \
04776 R psij_const[3*(2*m+2)]; \
04777 \
04778 nfft_uo(ths,j,&u,&o,0); \
04779 for(l=0;l<=2*m+1;l++) \
04780 psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0)); \
04781 \
04782 nfft_uo(ths,j,&u,&o,1); \
04783 for(l=0;l<=2*m+1;l++) \
04784 psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
04785 \
04786 nfft_uo(ths,j,&u,&o,2); \
04787 for(l=0;l<=2*m+1;l++) \
04788 psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
04789 \
04790 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04791 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04792 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04793 n0, n1, n2, m, my_u0, my_o0); \
04794 }
04795
04796 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
04797 { \
04798 if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
04799 { \
04800 _Pragma("omp parallel private(k)") \
04801 { \
04802 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
04803 int *ar_x = ths->index_x; \
04804 \
04805 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
04806 &min_u_b, &max_u_b, 3, ths->n, m); \
04807 \
04808 if (min_u_a != -1) \
04809 { \
04810 k = index_x_binary_search(ar_x, M, min_u_a); \
04811 \
04812 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
04813 \
04814 while (k < M) \
04815 { \
04816 int u_prod = ar_x[2*k]; \
04817 int j = ar_x[2*k+1]; \
04818 \
04819 if (u_prod < min_u_a || u_prod > max_u_a) \
04820 break; \
04821 \
04822 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
04823 \
04824 k++; \
04825 } \
04826 } \
04827 \
04828 if (min_u_b != -1) \
04829 { \
04830 int k = index_x_binary_search(ar_x, M, min_u_b); \
04831 \
04832 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
04833 \
04834 while (k < M) \
04835 { \
04836 int u_prod = ar_x[2*k]; \
04837 int j = ar_x[2*k+1]; \
04838 \
04839 if (u_prod < min_u_b || u_prod > max_u_b) \
04840 break; \
04841 \
04842 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
04843 \
04844 k++; \
04845 } \
04846 } \
04847 } \
04848 return; \
04849 } \
04850 }
04851
04852 static void nfft_adjoint_3d_B(nfft_plan *ths)
04853 {
04854 int k;
04855 const int N0 = ths->N[0];
04856 const int n0 = ths->n[0];
04857 const int N1 = ths->N[1];
04858 const int n1 = ths->n[1];
04859 const int N2 = ths->N[2];
04860 const int n2 = ths->n[2];
04861 const int M = ths->M_total;
04862 const int m = ths->m;
04863
04864 C* g = (C*) ths->g;
04865
04866 memset(g,0,ths->n_total*sizeof(C));
04867
04868 if(ths->nfft_flags & PRE_FULL_PSI)
04869 {
04870 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
04871 3, ths->n, m, ths->nfft_flags, ths->index_x);
04872 return;
04873 }
04874
04875 if(ths->nfft_flags & PRE_PSI)
04876 {
04877 #ifdef _OPENMP
04878 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
04879 #endif
04880
04881 #pragma omp parallel for default(shared) private(k)
04882 for (k = 0; k < M; k++)
04883 {
04884 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04885 #ifdef _OPENMP
04886 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04887 #else
04888 nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04889 #endif
04890 }
04891 return;
04892 }
04893
04894 if(ths->nfft_flags & PRE_FG_PSI)
04895 {
04896 R fg_exp_l[3*(2*m+2)];
04897
04898 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04899 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04900 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04901
04902 #ifdef _OPENMP
04903 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
04904 #endif
04905
04906 #pragma omp parallel for default(shared) private(k)
04907 for (k = 0; k < M; k++)
04908 {
04909 R psij_const[3*(2*m+2)];
04910 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04911 int l;
04912 R fg_psij0 = ths->psi[2*j*3];
04913 R fg_psij1 = ths->psi[2*j*3+1];
04914 R fg_psij2 = K(1.0);
04915
04916 psij_const[0] = fg_psij0;
04917 for(l=1; l<=2*m+1; l++)
04918 {
04919 fg_psij2 *= fg_psij1;
04920 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04921 }
04922
04923 fg_psij0 = ths->psi[2*(j*3+1)];
04924 fg_psij1 = ths->psi[2*(j*3+1)+1];
04925 fg_psij2 = K(1.0);
04926 psij_const[2*m+2] = fg_psij0;
04927 for(l=1; l<=2*m+1; l++)
04928 {
04929 fg_psij2 *= fg_psij1;
04930 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04931 }
04932
04933 fg_psij0 = ths->psi[2*(j*3+2)];
04934 fg_psij1 = ths->psi[2*(j*3+2)+1];
04935 fg_psij2 = K(1.0);
04936 psij_const[2*(2*m+2)] = fg_psij0;
04937 for(l=1; l<=2*m+1; l++)
04938 {
04939 fg_psij2 *= fg_psij1;
04940 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04941 }
04942
04943 #ifdef _OPENMP
04944 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04945 #else
04946 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04947 #endif
04948 }
04949
04950 return;
04951 }
04952
04953 if(ths->nfft_flags & FG_PSI)
04954 {
04955 R fg_exp_l[3*(2*m+2)];
04956
04957 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04958 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04959 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04960
04961 nfft_sort_nodes(ths);
04962
04963 #ifdef _OPENMP
04964 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
04965 #endif
04966
04967 #pragma openmp parallel for default(shared) private(k)
04968 for (k = 0; k < M; k++)
04969 {
04970 int u,o,l;
04971 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04972 R psij_const[3*(2*m+2)];
04973 R fg_psij0, fg_psij1, fg_psij2;
04974
04975 nfft_uo(ths,j,&u,&o,0);
04976 fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0));
04977 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]);
04978 fg_psij2 = K(1.0);
04979 psij_const[0] = fg_psij0;
04980 for(l=1; l<=2*m+1; l++)
04981 {
04982 fg_psij2 *= fg_psij1;
04983 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04984 }
04985
04986 nfft_uo(ths,j,&u,&o,1);
04987 fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1));
04988 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]);
04989 fg_psij2 = K(1.0);
04990 psij_const[2*m+2] = fg_psij0;
04991 for(l=1; l<=2*m+1; l++)
04992 {
04993 fg_psij2 *= fg_psij1;
04994 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04995 }
04996
04997 nfft_uo(ths,j,&u,&o,2);
04998 fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2));
04999 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]);
05000 fg_psij2 = K(1.0);
05001 psij_const[2*(2*m+2)] = fg_psij0;
05002 for(l=1; l<=2*m+1; l++)
05003 {
05004 fg_psij2 *= fg_psij1;
05005 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
05006 }
05007
05008 #ifdef _OPENMP
05009 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05010 #else
05011 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05012 #endif
05013 }
05014
05015 return;
05016 }
05017
05018 if(ths->nfft_flags & PRE_LIN_PSI)
05019 {
05020 const int K = ths->K;
05021 const int ip_s = K / (m + 2);
05022
05023 nfft_sort_nodes(ths);
05024
05025 #ifdef _OPENMP
05026 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
05027 #endif
05028
05029 #pragma openmp parallel for default(shared) private(k)
05030 for (k = 0; k < M; k++)
05031 {
05032 int u,o,l;
05033 int ip_u;
05034 R ip_y, ip_w;
05035 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
05036 R psij_const[3*(2*m+2)];
05037
05038 nfft_uo(ths,j,&u,&o,0);
05039 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s);
05040 ip_u = LRINT(FLOOR(ip_y));
05041 ip_w = ip_y-ip_u;
05042 for(l=0; l < 2*m+2; l++)
05043 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05044 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
05045
05046 nfft_uo(ths,j,&u,&o,1);
05047 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s);
05048 ip_u = LRINT(FLOOR(ip_y));
05049 ip_w = ip_y-ip_u;
05050 for(l=0; l < 2*m+2; l++)
05051 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05052 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
05053
05054 nfft_uo(ths,j,&u,&o,2);
05055 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s);
05056 ip_u = LRINT(FLOOR(ip_y));
05057 ip_w = ip_y-ip_u;
05058 for(l=0; l < 2*m+2; l++)
05059 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05060 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
05061
05062 #ifdef _OPENMP
05063 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05064 #else
05065 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05066 #endif
05067 }
05068 return;
05069 }
05070
05071
05072 nfft_sort_nodes(ths);
05073
05074 #ifdef _OPENMP
05075 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
05076 #endif
05077
05078 #pragma omp parallel for default(shared) private(k)
05079 for (k = 0; k < M; k++)
05080 {
05081 int u,o,l;
05082 R psij_const[3*(2*m+2)];
05083 int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
05084
05085 nfft_uo(ths,j,&u,&o,0);
05086 for(l=0;l<=2*m+1;l++)
05087 psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0));
05088
05089 nfft_uo(ths,j,&u,&o,1);
05090 for(l=0;l<=2*m+1;l++)
05091 psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1));
05092
05093 nfft_uo(ths,j,&u,&o,2);
05094 for(l=0;l<=2*m+1;l++)
05095 psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2));
05096
05097 #ifdef _OPENMP
05098 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05099 #else
05100 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05101 #endif
05102 }
05103 }
05104
05105
05106 void nfft_trafo_3d(nfft_plan *ths)
05107 {
05108 int k0,k1,k2,n0,n1,n2,N0,N1,N2;
05109 C *g_hat,*f_hat;
05110 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
05111 R ck01, ck02, ck11, ck12, ck21, ck22;
05112 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
05113 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
05114
05115 ths->g_hat=ths->g1;
05116 ths->g=ths->g2;
05117
05118 N0=ths->N[0];
05119 N1=ths->N[1];
05120 N2=ths->N[2];
05121 n0=ths->n[0];
05122 n1=ths->n[1];
05123 n2=ths->n[2];
05124
05125 f_hat=(C*)ths->f_hat;
05126 g_hat=(C*)ths->g_hat;
05127
05128 TIC(0)
05129 #ifdef _OPENMP
05130 #pragma omp parallel for default(shared) private(k0)
05131 for (k0 = 0; k0 < ths->n_total; k0++)
05132 ths->g_hat[k0] = 0.0;
05133 #else
05134 memset(ths->g_hat,0,ths->n_total*sizeof(C));
05135 #endif
05136
05137 if(ths->nfft_flags & PRE_PHI_HUT)
05138 {
05139 c_phi_inv01=ths->c_phi_inv[0];
05140 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
05141
05142 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
05143 for(k0=0;k0<N0/2;k0++)
05144 {
05145 ck01=c_phi_inv01[k0];
05146 ck02=c_phi_inv02[k0];
05147 c_phi_inv11=ths->c_phi_inv[1];
05148 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
05149
05150 for(k1=0;k1<N1/2;k1++)
05151 {
05152 ck11=c_phi_inv11[k1];
05153 ck12=c_phi_inv12[k1];
05154 c_phi_inv21=ths->c_phi_inv[2];
05155 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
05156
05157 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05158 f_hat111=f_hat + (k0*N1+k1)*N2;
05159 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05160 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
05161 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
05162 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
05163 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
05164 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
05165
05166 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
05167 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
05168 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
05169 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
05170 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
05171 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
05172 g_hat222=g_hat + (k0*n1+k1)*n2;
05173 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
05174
05175 for(k2=0;k2<N2/2;k2++)
05176 {
05177 ck21=c_phi_inv21[k2];
05178 ck22=c_phi_inv22[k2];
05179
05180 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
05181 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
05182 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
05183 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
05184
05185 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
05186 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
05187 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
05188 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
05189 }
05190 }
05191 }
05192 }
05193 else
05194 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
05195 for(k0=0;k0<N0/2;k0++)
05196 {
05197 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
05198 ck02=K(1.0)/(PHI_HUT(k0,0));
05199 for(k1=0;k1<N1/2;k1++)
05200 {
05201 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
05202 ck12=K(1.0)/(PHI_HUT(k1,1));
05203
05204 for(k2=0;k2<N2/2;k2++)
05205 {
05206 ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
05207 ck22=K(1.0)/(PHI_HUT(k2,2));
05208
05209 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
05210 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
05211 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
05212 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
05213
05214 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
05215 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
05216 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
05217 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
05218 }
05219 }
05220 }
05221
05222 TOC(0)
05223
05224 TIC_FFTW(1)
05225 fftw_execute(ths->my_fftw_plan1);
05226 TOC_FFTW(1);
05227
05228 TIC(2);
05229 nfft_trafo_3d_B(ths);
05230 TOC(2);
05231 }
05232
05233 void nfft_adjoint_3d(nfft_plan *ths)
05234 {
05235 int k0,k1,k2,n0,n1,n2,N0,N1,N2;
05236 C *g_hat,*f_hat;
05237 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
05238 R ck01, ck02, ck11, ck12, ck21, ck22;
05239 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
05240 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
05241
05242 ths->g_hat=ths->g1;
05243 ths->g=ths->g2;
05244
05245 N0=ths->N[0];
05246 N1=ths->N[1];
05247 N2=ths->N[2];
05248 n0=ths->n[0];
05249 n1=ths->n[1];
05250 n2=ths->n[2];
05251
05252 f_hat=(C*)ths->f_hat;
05253 g_hat=(C*)ths->g_hat;
05254
05255 TIC(2);
05256 nfft_adjoint_3d_B(ths);
05257 TOC(2);
05258
05259 TIC_FFTW(1)
05260 fftw_execute(ths->my_fftw_plan2);
05261 TOC_FFTW(1);
05262
05263 TIC(0)
05264 if(ths->nfft_flags & PRE_PHI_HUT)
05265 {
05266 c_phi_inv01=ths->c_phi_inv[0];
05267 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
05268
05269 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
05270 for(k0=0;k0<N0/2;k0++)
05271 {
05272 ck01=c_phi_inv01[k0];
05273 ck02=c_phi_inv02[k0];
05274 c_phi_inv11=ths->c_phi_inv[1];
05275 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
05276
05277 for(k1=0;k1<N1/2;k1++)
05278 {
05279 ck11=c_phi_inv11[k1];
05280 ck12=c_phi_inv12[k1];
05281 c_phi_inv21=ths->c_phi_inv[2];
05282 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
05283
05284 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05285 f_hat111=f_hat + (k0*N1+k1)*N2;
05286 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05287 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
05288 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
05289 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
05290 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
05291 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
05292
05293 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
05294 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
05295 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
05296 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
05297 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
05298 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
05299 g_hat222=g_hat + (k0*n1+k1)*n2;
05300 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
05301
05302 for(k2=0;k2<N2/2;k2++)
05303 {
05304 ck21=c_phi_inv21[k2];
05305 ck22=c_phi_inv22[k2];
05306
05307 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
05308 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
05309 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
05310 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
05311
05312 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
05313 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
05314 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
05315 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
05316 }
05317 }
05318 }
05319 }
05320 else
05321 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
05322 for(k0=0;k0<N0/2;k0++)
05323 {
05324 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
05325 ck02=K(1.0)/(PHI_HUT(k0,0));
05326 for(k1=0;k1<N1/2;k1++)
05327 {
05328 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
05329 ck12=K(1.0)/(PHI_HUT(k1,1));
05330
05331 for(k2=0;k2<N2/2;k2++)
05332 {
05333 ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
05334 ck22=K(1.0)/(PHI_HUT(k2,2));
05335
05336 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
05337 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
05338 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
05339 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
05340
05341 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
05342 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
05343 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
05344 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
05345 }
05346 }
05347 }
05348
05349 TOC(0)
05350 }
05351
05354 void nfft_trafo(nfft_plan *ths)
05355 {
05356 switch(ths->d)
05357 {
05358 case 1: nfft_trafo_1d(ths); break;
05359 case 2: nfft_trafo_2d(ths); break;
05360 case 3: nfft_trafo_3d(ths); break;
05361 default:
05362
05363 ths->g_hat=ths->g1;
05364 ths->g=ths->g2;
05365
05369 TIC(0)
05370 nfft_D_A(ths);
05371 TOC(0)
05372
05373
05377 TIC_FFTW(1)
05378 fftw_execute(ths->my_fftw_plan1);
05379 TOC_FFTW(1)
05380
05381
05384 TIC(2)
05385 nfft_B_A(ths);
05386 TOC(2)
05387 }
05388 }
05389
05390 void nfft_adjoint(nfft_plan *ths)
05391 {
05392 switch(ths->d)
05393 {
05394 case 1: nfft_adjoint_1d(ths); break;
05395 case 2: nfft_adjoint_2d(ths); break;
05396 case 3: nfft_adjoint_3d(ths); break;
05397 default:
05398
05399 ths->g_hat=ths->g1;
05400 ths->g=ths->g2;
05401
05405 TIC(2)
05406 nfft_B_T(ths);
05407 TOC(2)
05408
05413 TIC_FFTW(1)
05414 fftw_execute(ths->my_fftw_plan2);
05415 TOC_FFTW(1)
05416
05420 TIC(0)
05421 nfft_D_T(ths);
05422 TOC(0)
05423 }
05424 }
05425
05426
05429 static void nfft_precompute_phi_hut(nfft_plan *ths)
05430 {
05431 int ks[ths->d];
05432 int t;
05434 ths->c_phi_inv = (R**) nfft_malloc(ths->d*sizeof(R*));
05435
05436 for(t=0; t<ths->d; t++)
05437 {
05438 ths->c_phi_inv[t]= (R*)nfft_malloc(ths->N[t]*sizeof(R));
05439 for(ks[t]=0; ks[t]<ths->N[t]; ks[t]++)
05440 ths->c_phi_inv[t][ks[t]]= K(1.0)/(PHI_HUT(ks[t]-ths->N[t]/2,t));
05441 }
05442 }
05443
05449 void nfft_precompute_lin_psi(nfft_plan *ths)
05450 {
05451 int t;
05452 int j;
05453 R step;
05455 for (t=0; t<ths->d; t++)
05456 {
05457 step=((R)(ths->m+2))/(((R)ths->K)*ths->n[t]);
05458 for(j=0;j<=ths->K;j++)
05459 {
05460 ths->psi[(ths->K+1)*t + j] = PHI(j*step,t);
05461 }
05462 }
05463 }
05464
05465 static void nfft_precompute_fg_psi(nfft_plan *ths)
05466 {
05467 int t;
05468 int u, o;
05470 nfft_sort_nodes(ths);
05471
05472 for (t=0; t<ths->d; t++)
05473 {
05474 int j;
05475 #pragma omp parallel for default(shared) private(j,u,o)
05476 for (j = 0; j < ths->M_total; j++)
05477 {
05478 nfft_uo(ths,j,&u,&o,t);
05479
05480 ths->psi[2*(j*ths->d+t)]=
05481 (PHI((ths->x[j*ths->d+t]-((R)u)/ths->n[t]),t));
05482
05483 ths->psi[2*(j*ths->d+t)+1]=
05484 EXP(K(2.0)*(ths->n[t]*ths->x[j*ths->d+t] - u) / ths->b[t]);
05485 }
05486 }
05487
05488 }
05489
05490 void nfft_precompute_psi(nfft_plan *ths)
05491 {
05492 int t;
05493 int l;
05494 int lj;
05495 int u, o;
05497 nfft_sort_nodes(ths);
05498
05499 for (t=0; t<ths->d; t++)
05500 {
05501 int j;
05502 #pragma omp parallel for default(shared) private(j,l,lj,u,o)
05503 for (j = 0; j < ths->M_total; j++)
05504 {
05505 nfft_uo(ths,j,&u,&o,t);
05506
05507 for(l=u, lj=0; l <= o; l++, lj++)
05508 ths->psi[(j*ths->d+t)*(2*ths->m+2)+lj]=
05509 (PHI((ths->x[j*ths->d+t]-((R)l)/ths->n[t]),t));
05510 }
05511 }
05512
05513 }
05514
05515 #ifdef _OPENMP
05516 static void nfft_precompute_full_psi_omp(nfft_plan *ths)
05517 {
05518 int j;
05519 int lprod;
05521 {
05522 int t;
05523 for(t=0,lprod = 1; t<ths->d; t++)
05524 lprod *= 2*ths->m+2;
05525 }
05526
05527 #pragma omp parallel for default(shared) private(j)
05528 for(j=0; j<ths->M_total; j++)
05529 {
05530 int t,t2;
05531 int l_L;
05532 int l[ths->d];
05533 int lj[ths->d];
05534 int ll_plain[ths->d+1];
05536 int u[ths->d], o[ths->d];
05538 R phi_prod[ths->d+1];
05539 int ix = j*lprod;
05540
05541 phi_prod[0]=1;
05542 ll_plain[0]=0;
05543
05544 MACRO_init_uo_l_lj_t;
05545
05546 for(l_L=0; l_L<lprod; l_L++, ix++)
05547 {
05548 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
05549
05550 ths->psi_index_g[ix]=ll_plain[ths->d];
05551 ths->psi[ix]=phi_prod[ths->d];
05552
05553 MACRO_count_uo_l_lj_t;
05554 }
05555
05556 ths->psi_index_f[j]=lprod;
05557 }
05558 }
05559 #endif
05560
05561 void nfft_precompute_full_psi(nfft_plan *ths)
05562 {
05563 #ifdef _OPENMP
05564 nfft_sort_nodes(ths);
05565
05566 nfft_precompute_full_psi_omp(ths);
05567 #else
05568 int t,t2;
05569 int j;
05570 int l_L;
05571 int l[ths->d];
05572 int lj[ths->d];
05573 int ll_plain[ths->d+1];
05574 int lprod;
05575 int u[ths->d], o[ths->d];
05577 R phi_prod[ths->d+1];
05578
05579 int ix,ix_old;
05580
05581 nfft_sort_nodes(ths);
05582
05583 phi_prod[0]=1;
05584 ll_plain[0]=0;
05585
05586 for(t=0,lprod = 1; t<ths->d; t++)
05587 lprod *= 2*ths->m+2;
05588
05589 for(j=0,ix=0,ix_old=0; j<ths->M_total; j++)
05590 {
05591 MACRO_init_uo_l_lj_t;
05592
05593 for(l_L=0; l_L<lprod; l_L++, ix++)
05594 {
05595 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
05596
05597 ths->psi_index_g[ix]=ll_plain[ths->d];
05598 ths->psi[ix]=phi_prod[ths->d];
05599
05600 MACRO_count_uo_l_lj_t;
05601 }
05602
05603
05604 ths->psi_index_f[j]=ix-ix_old;
05605 ix_old=ix;
05606 }
05607 #endif
05608 }
05609
05610 void nfft_precompute_one_psi(nfft_plan *ths)
05611 {
05612 if(ths->nfft_flags & PRE_LIN_PSI)
05613 nfft_precompute_lin_psi(ths);
05614 if(ths->nfft_flags & PRE_FG_PSI)
05615 nfft_precompute_fg_psi(ths);
05616 if(ths->nfft_flags & PRE_PSI)
05617 nfft_precompute_psi(ths);
05618 if(ths->nfft_flags & PRE_FULL_PSI)
05619 nfft_precompute_full_psi(ths);
05620 }
05621
05622
05623 static void nfft_init_help(nfft_plan *ths)
05624 {
05625 int t;
05626 int lprod;
05628 if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
05629 ths->nfft_flags |= NFFT_SORT_NODES;
05630
05631 ths->N_total=nfft_prod_int(ths->N, ths->d);
05632 ths->n_total=nfft_prod_int(ths->n, ths->d);
05633
05634 ths->sigma = (R*) nfft_malloc(ths->d*sizeof(R));
05635 for(t = 0;t < ths->d; t++)
05636 ths->sigma[t] = ((R)ths->n[t])/ths->N[t];
05637
05638 WINDOW_HELP_INIT;
05639
05640 if(ths->nfft_flags & MALLOC_X)
05641 ths->x = (R*)nfft_malloc(ths->d*ths->M_total*sizeof(R));
05642
05643 if(ths->nfft_flags & MALLOC_F_HAT)
05644 ths->f_hat = (fftw_complex*)nfft_malloc(ths->N_total*sizeof(C));
05645
05646 if(ths->nfft_flags & MALLOC_F)
05647 ths->f = (fftw_complex*)nfft_malloc(ths->M_total*sizeof(C));
05648
05649 if(ths->nfft_flags & PRE_PHI_HUT)
05650 nfft_precompute_phi_hut(ths);
05651
05652 if(ths->nfft_flags & PRE_LIN_PSI)
05653 {
05654 ths->K=(1U<< 10)*(ths->m+2);
05655 ths->psi = (R*) nfft_malloc((ths->K+1)*ths->d*sizeof(R));
05656 }
05657
05658 if(ths->nfft_flags & PRE_FG_PSI)
05659 ths->psi = (R*) nfft_malloc(ths->M_total*ths->d*2*sizeof(R));
05660
05661 if(ths->nfft_flags & PRE_PSI)
05662 ths->psi = (R*) nfft_malloc(ths->M_total*ths->d*
05663 (2*ths->m+2)*sizeof(R));
05664
05665 if(ths->nfft_flags & PRE_FULL_PSI)
05666 {
05667 for(t=0,lprod = 1; t<ths->d; t++)
05668 lprod *= 2*ths->m+2;
05669
05670 ths->psi = (R*) nfft_malloc(ths->M_total*lprod*sizeof(R));
05671
05672 ths->psi_index_f = (int*) nfft_malloc(ths->M_total*sizeof(int));
05673 ths->psi_index_g = (int*) nfft_malloc(ths->M_total*lprod*sizeof(int));
05674 }
05675
05676 if(ths->nfft_flags & FFTW_INIT)
05677 {
05678 #ifdef _OPENMP
05679 int nthreads = nfft_get_omp_num_threads();
05680 #endif
05681
05682 ths->g1=(fftw_complex*)nfft_malloc(ths->n_total*sizeof(C));
05683
05684 if(ths->nfft_flags & FFT_OUT_OF_PLACE)
05685 ths->g2 = (fftw_complex*) nfft_malloc(ths->n_total*sizeof(C));
05686 else
05687 ths->g2 = ths->g1;
05688
05689 #ifdef _OPENMP
05690 #pragma omp critical (nfft_omp_critical_fftw_plan)
05691 {
05692 fftw_plan_with_nthreads(nthreads);
05693 #endif
05694 ths->my_fftw_plan1 = fftw_plan_dft(ths->d, ths->n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
05695 ths->my_fftw_plan2 = fftw_plan_dft(ths->d, ths->n, ths->g2, ths->g1,
05696 FFTW_BACKWARD, ths->fftw_flags);
05697 #ifdef _OPENMP
05698 }
05699 #endif
05700 }
05701
05702 if(ths->nfft_flags & NFFT_SORT_NODES)
05703 ths->index_x = (int*) nfft_malloc(sizeof(int)*2*ths->M_total);
05704 else
05705 ths->index_x = NULL;
05706
05707 ths->mv_trafo = (void (*) (void* ))nfft_trafo;
05708 ths->mv_adjoint = (void (*) (void* ))nfft_adjoint;
05709 }
05710
05711 void nfft_init(nfft_plan *ths, int d, int *N, int M_total)
05712 {
05713 int t;
05715 ths->d = d;
05716
05717 ths->N=(int*) nfft_malloc(d*sizeof(int));
05718
05719 for (t = 0;t < d; t++)
05720 ths->N[t] = N[t];
05721
05722 ths->M_total = M_total;
05723
05724 ths->n = (int*) nfft_malloc(d*sizeof(int));
05725 for (t = 0;t < d; t++)
05726 ths->n[t] = 2*X(next_power_of_2)(ths->N[t]);
05727
05728 ths->m = WINDOW_HELP_ESTIMATE_m;
05729
05730 if (d > 1)
05731 {
05732 #ifdef _OPENMP
05733 ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05734 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
05735 NFFT_OMP_BLOCKWISE_ADJOINT;
05736 #else
05737 ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05738 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
05739 #endif
05740 }
05741 else
05742 ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05743 FFTW_INIT | FFT_OUT_OF_PLACE;
05744
05745 ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
05746
05747 nfft_init_help(ths);
05748 }
05749
05750 void nfft_init_guru(nfft_plan *ths, int d, int *N, int M_total, int *n,
05751 int m, unsigned nfft_flags, unsigned fftw_flags)
05752 {
05753 int t;
05755 ths->d =d;
05756 ths->N= (int*) nfft_malloc(ths->d*sizeof(int));
05757 for(t=0; t<d; t++)
05758 ths->N[t]= N[t];
05759 ths->M_total= M_total;
05760 ths->n= (int*) nfft_malloc(ths->d*sizeof(int));
05761 for(t=0; t<d; t++)
05762 ths->n[t]= n[t];
05763 ths->m= m;
05764 ths->nfft_flags= nfft_flags;
05765 ths->fftw_flags= fftw_flags;
05766
05767 nfft_init_help(ths);
05768 }
05769
05770 void nfft_init_1d(nfft_plan *ths, int N1, int M_total)
05771 {
05772 int N[1];
05773
05774 N[0]=N1;
05775
05776 nfft_init(ths, 1, N, M_total);
05777 }
05778
05779 void nfft_init_2d(nfft_plan *ths, int N1, int N2, int M_total)
05780 {
05781 int N[2];
05782
05783 N[0]=N1;
05784 N[1]=N2;
05785 nfft_init(ths,2,N,M_total);
05786 }
05787
05788 void nfft_init_3d(nfft_plan *ths, int N1, int N2, int N3, int M_total)
05789 {
05790 int N[3];
05791
05792 N[0]=N1;
05793 N[1]=N2;
05794 N[2]=N3;
05795 nfft_init(ths,3,N,M_total);
05796 }
05797
05798 const char* nfft_check(nfft_plan *ths)
05799 {
05800 int j;
05801
05802 for(j=0;j<ths->M_total*ths->d;j++)
05803 if((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
05804 return "ths->x out of range [-0.5,0.5)";
05805
05806 for(j=0;j<ths->d;j++)
05807 {
05808 if(ths->sigma[j]<=1)
05809 return "nfft_check: oversampling factor too small";
05810
05811 if(ths->N[j]<=ths->m)
05812 return "Polynomial degree N is smaller than cut-off m";
05813
05814 if(ths->N[j]%2==1)
05815 return "polynomial degree N has to be even";
05816 }
05817 return 0;
05818 }
05819
05820 void nfft_finalize(nfft_plan *ths)
05821 {
05822 int t;
05823
05824 if(ths->nfft_flags & NFFT_SORT_NODES)
05825 nfft_free(ths->index_x);
05826
05827 if(ths->nfft_flags & FFTW_INIT)
05828 {
05829 #pragma omp critical (nfft_omp_critical_fftw_plan)
05830 {
05831 fftw_destroy_plan(ths->my_fftw_plan2);
05832 fftw_destroy_plan(ths->my_fftw_plan1);
05833 }
05834
05835 if(ths->nfft_flags & FFT_OUT_OF_PLACE)
05836 nfft_free(ths->g2);
05837
05838 nfft_free(ths->g1);
05839 }
05840
05841 if(ths->nfft_flags & PRE_FULL_PSI)
05842 {
05843 nfft_free(ths->psi_index_g);
05844 nfft_free(ths->psi_index_f);
05845 nfft_free(ths->psi);
05846 }
05847
05848 if(ths->nfft_flags & PRE_PSI)
05849 nfft_free(ths->psi);
05850
05851 if(ths->nfft_flags & PRE_FG_PSI)
05852 nfft_free(ths->psi);
05853
05854 if(ths->nfft_flags & PRE_LIN_PSI)
05855 nfft_free(ths->psi);
05856
05857 if(ths->nfft_flags & PRE_PHI_HUT)
05858 {
05859 for(t=0; t<ths->d; t++)
05860 nfft_free(ths->c_phi_inv[t]);
05861 nfft_free(ths->c_phi_inv);
05862 }
05863
05864 if(ths->nfft_flags & MALLOC_F)
05865 nfft_free(ths->f);
05866
05867 if(ths->nfft_flags & MALLOC_F_HAT)
05868 nfft_free(ths->f_hat);
05869
05870 if(ths->nfft_flags & MALLOC_X)
05871 nfft_free(ths->x);
05872
05873 WINDOW_HELP_FINALIZE;
05874
05875 nfft_free(ths->sigma);
05876 nfft_free(ths->n);
05877 nfft_free(ths->N);
05878 }