NFFT Logo 3.2.2
nfft.c
00001 /*
00002  * Copyright (c) 2002, 2012 Jens Keiner, Stefan Kunis, Daniel Potts
00003  *
00004  * This program is free software; you can redistribute it and/or modify it under
00005  * the terms of the GNU General Public License as published by the Free Software
00006  * Foundation; either version 2 of the License, or (at your option) any later
00007  * version.
00008  *
00009  * This program is distributed in the hope that it will be useful, but WITHOUT
00010  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00011  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
00012  * details.
00013  *
00014  * You should have received a copy of the GNU General Public License along with
00015  * this program; if not, write to the Free Software Foundation, Inc., 51
00016  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
00017  */
00018 
00019 /* $Id: nfft.c 3896 2012-10-10 12:19:26Z tovo $ */
00020 
00021 /* Nonequispaced FFT */
00022 
00023 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
00024 
00025 /* configure header */
00026 #include "config.h"
00027 
00028 /* complex datatype (maybe) */
00029 #ifdef HAVE_COMPLEX_H
00030 #include<complex.h>
00031 #endif
00032 
00033 /* NFFT headers */
00034 #include "nfft3util.h"
00035 #include "nfft3.h"
00036 #include "infft.h"
00037 
00038 #ifdef _OPENMP
00039 #include <omp.h>
00040 #endif
00041 
00042 #ifdef OMP_ASSERT
00043 #include <assert.h>
00044 #endif
00045 
00046 
00061 static void nfft_sort_nodes_for_better_cache_handle(int d,
00062     const int *n, int m, int local_x_num, const R *local_x, int *ar_x)
00063 {
00064   int u_j[d], i, j, help, rhigh;
00065   int *ar_x_temp;
00066   int nprod;
00067 
00068   for(i = 0; i < local_x_num; i++) {
00069     ar_x[2*i] = 0;
00070     ar_x[2*i+1] = i;
00071     for(j = 0; j < d; j++) {
00072       help = (int) floor( n[j]*local_x[d*i+j] - m);
00073       u_j[j] = (help%n[j]+n[j])%n[j];
00074 
00075       ar_x[2*i] += u_j[j];
00076       if (j+1 < d)
00077         ar_x[2*i] *= n[j+1];
00078     }
00079   }
00080 
00081   for (j = 0, nprod = 1; j < d; j++)
00082     nprod *= n[j];
00083 
00084   rhigh = (int) ceil(log2(nprod)) - 1;
00085 
00086   ar_x_temp = (int *) nfft_malloc(2*local_x_num*sizeof(int));
00087   nfft_sort_node_indices_radix_lsdf(local_x_num, ar_x, ar_x_temp, rhigh);
00088 #ifdef OMP_ASSERT
00089   for (i = 1; i < local_x_num; i++)
00090     assert(ar_x[2*(i-1)] <= ar_x[2*i]);
00091 #endif
00092   nfft_free(ar_x_temp);
00093 }
00094 
00103 static void nfft_sort_nodes(const nfft_plan *ths)
00104 {
00105   if (ths->nfft_flags & NFFT_SORT_NODES)
00106     nfft_sort_nodes_for_better_cache_handle(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
00107 }
00108 
00130 /* some macros to initialize arrays before executing a transformation */
00131 #define MACRO_ndft_init_result_trafo memset(f,0,ths->M_total*sizeof(C));
00132 #define MACRO_ndft_init_result_conjugated MACRO_ndft_init_result_trafo
00133 #define MACRO_ndft_init_result_adjoint memset(f_hat,0,ths->N_total*sizeof(C));
00134 #define MACRO_ndft_init_result_transposed MACRO_ndft_init_result_adjoint
00135 
00136 /* exponent of complex exponentials */
00137 #define MACRO_ndft_sign_trafo K2PI*ths->x[j*ths->d+t]
00138 #define MACRO_ndft_sign_conjugated -K2PI*ths->x[j*ths->d+t]
00139 #define MACRO_ndft_sign_adjoint K2PI*ths->x[j*ths->d+t]
00140 #define MACRO_ndft_sign_transposed -K2PI*ths->x[j*ths->d+t]
00141 
00142 #define MACRO_init_k_N_Omega_x(which_one)                                     \
00143 {                                                                             \
00144   for (t = 0; t < ths->d; t++)                                                \
00145   {                                                                           \
00146     k[t] = -ths->N[t]/2;                                                      \
00147     x[t] = MACRO_ndft_sign_ ## which_one;                                     \
00148     Omega[t+1] = k[t]*x[t] + Omega[t];                                        \
00149   }                                                                           \
00150   omega = Omega[ths->d];                                                      \
00151 }                                                                             \
00152 
00153 #define MACRO_count_k_N_Omega                                                 \
00154 {                                                                             \
00155   for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)                \
00156     k[t]-= ths->N[t]-1;                                                       \
00157                                                                               \
00158   k[t]++;                                                                     \
00159 \
00160   for (t2 = t; t2 < ths->d; t2++)                                             \
00161     Omega[t2+1] = k[t2]*x[t2] + Omega[t2];                                    \
00162                                                                               \
00163   omega = Omega[ths->d];                                                      \
00164 }
00165 
00166 #define MACRO_ndft_compute_trafo f[j] += f_hat[k_L]*CEXP(-II*omega);
00167 #define MACRO_ndft_compute_conjugated MACRO_ndft_compute_trafo
00168 #define MACRO_ndft_compute_adjoint f_hat[k_L] += f[j]*CEXP(+II*omega);
00169 #define MACRO_ndft_compute_transposed MACRO_ndft_compute_adjoint
00170 
00171 #define MACRO_ndft(which_one)                                                 \
00172 void nfft_ ## which_one ## _direct (nfft_plan *ths)                           \
00173 {                                                                             \
00174   C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;                                 \
00175                                                                               \
00176   MACRO_ndft_init_result_ ## which_one                                        \
00177                                                                               \
00178   if (ths->d == 1)                                                            \
00179   {                                                                           \
00180     /* specialize for univariate case, rationale: faster */                   \
00181     const int t = 0;                                                          \
00182     int j;                                                                    \
00183     for (j = 0; j < ths->M_total; j++)                                        \
00184     {                                                                         \
00185       int k_L;                                                                \
00186       for(k_L = 0; k_L < ths->N_total; k_L++)                                 \
00187       {                                                                       \
00188         R omega = (k_L - (ths->N_total/2)) * MACRO_ndft_sign_ ## which_one;   \
00189         MACRO_ndft_compute_ ## which_one;                                     \
00190       }                                                                       \
00191     }                                                                         \
00192   }                                                                           \
00193   else                                                                        \
00194   {                                                                           \
00195     /* multivariate case */                                                   \
00196     int j;                                                                    \
00197     for (j = 0; j < ths->M_total; j++)                                        \
00198     {                                                                         \
00199       R x[ths->d], omega, Omega[ths->d+1];                                    \
00200       int t, t2, k_L, k[ths->d];                                              \
00201       Omega[0] = K(0.0);                                                      \
00202       MACRO_init_k_N_Omega_x(which_one);                                      \
00203       for(k_L = 0; k_L < ths->N_total; k_L++)                                 \
00204       {                                                                       \
00205         MACRO_ndft_compute_ ## which_one;                                     \
00206         MACRO_count_k_N_Omega;                                                \
00207       }                                                                       \
00208     }                                                                         \
00209   }                                                                           \
00210 }
00211 
00212 // macro expanded for OpenMP parallelization
00213 //MACRO_ndft(trafo)
00214 void nfft_trafo_direct (nfft_plan *ths)
00215 {
00216   C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
00217 
00218   memset(f,0,ths->M_total*sizeof(C));
00219 
00220   if (ths->d == 1)
00221   {
00222     /* specialize for univariate case, rationale: faster */
00223     int j;
00224     #pragma omp parallel for default(shared) private(j)
00225     for (j = 0; j < ths->M_total; j++)
00226     {
00227       int k_L;
00228       for(k_L = 0; k_L < ths->N_total; k_L++)
00229       {
00230         R omega = (k_L - ths->N_total/2) * K2PI * ths->x[j];
00231         f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
00232       }
00233     }
00234   }
00235   else
00236   {
00237     /* multivariate case */
00238     int j;
00239     #pragma omp parallel for default(shared) private(j)
00240     for (j = 0; j < ths->M_total; j++)
00241     {
00242       R x[ths->d], omega, Omega[ths->d+1];
00243       int t, t2, k_L, k[ths->d];
00244       Omega[0] = ((R) 0.0);
00245       for (t = 0; t < ths->d; t++)
00246       {
00247         k[t] = -ths->N[t]/2;
00248         x[t] = K2PI*ths->x[j*ths->d+t];
00249         Omega[t+1] = k[t]*x[t] + Omega[t];
00250       }
00251       omega = Omega[ths->d];
00252 
00253       for(k_L = 0; k_L < ths->N_total; k_L++)
00254       {
00255         f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
00256         {
00257           for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
00258             k[t]-= ths->N[t]-1;
00259 
00260           k[t]++;
00261 
00262           for (t2 = t; t2 < ths->d; t2++)
00263             Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
00264 
00265           omega = Omega[ths->d];
00266         }
00267       }
00268     }
00269   }
00270 }
00271 
00272 // macro expanded for OpenMP parallelization since parallel calculation
00273 // requires outer loop over frequencies and inner loop over nodes.
00274 //MACRO_ndft(adjoint)
00275 void nfft_adjoint_direct (nfft_plan *ths)
00276 {
00277   C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
00278 
00279   memset(f_hat,0,ths->N_total*sizeof(C));
00280 
00281   if (ths->d == 1)
00282   {
00283     /* specialize for univariate case, rationale: faster */
00284 #ifdef _OPENMP
00285       int k_L;
00286       #pragma omp parallel for default(shared) private(k_L)
00287       for(k_L = 0; k_L < ths->N_total; k_L++)
00288       {
00289         int j;
00290         for (j = 0; j < ths->M_total; j++)
00291         {
00292           R omega = (k_L - (ths->N_total/2)) * K2PI * ths->x[j];
00293           f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00294         }
00295       }
00296 #else
00297       int j;
00298       for (j = 0; j < ths->M_total; j++)
00299       {
00300         int k_L;
00301         for(k_L = 0; k_L < ths->N_total; k_L++)
00302         {
00303           R omega = (k_L - (ths->N_total/2)) * K2PI * ths->x[j];
00304           f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00305         }
00306       }
00307 #endif
00308   }
00309   else
00310   {
00311     /* multivariate case */
00312     int j, k_L;
00313 #ifdef _OPENMP
00314     #pragma omp parallel for default(shared) private(j, k_L)
00315     for(k_L = 0; k_L < ths->N_total; k_L++)
00316     {
00317       int k[ths->d], k_temp, t;
00318 
00319       k_temp = k_L;
00320 
00321       for (t = ths->d-1; t >= 0; t--)
00322       {
00323         k[t] = k_temp % ths->N[t] - ths->N[t]/2;
00324         k_temp /= ths->N[t];
00325       }
00326 
00327       for (j = 0; j < ths->M_total; j++)
00328       {
00329         R omega = 0.0;
00330         for (t = 0; t < ths->d; t++)
00331           omega += k[t] * K2PI * ths->x[j*ths->d+t];
00332         f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00333       }
00334     }
00335 #else
00336     for (j = 0; j < ths->M_total; j++)
00337     {
00338       R x[ths->d], omega, Omega[ths->d+1];
00339       int t, t2, k[ths->d];
00340       Omega[0] = ((R) 0.0);
00341       for (t = 0; t < ths->d; t++)
00342       {
00343         k[t] = -ths->N[t]/2;
00344         x[t] = K2PI * ths->x[j*ths->d+t];
00345         Omega[t+1] = k[t]*x[t] + Omega[t];
00346       }
00347       omega = Omega[ths->d];
00348       for(k_L = 0; k_L < ths->N_total; k_L++)
00349       {
00350         f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
00351 
00352         for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
00353           k[t]-= ths->N[t]-1;
00354 
00355         k[t]++;
00356 
00357         for (t2 = t; t2 < ths->d; t2++)
00358           Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
00359 
00360         omega = Omega[ths->d];
00361       }
00362     }
00363 #endif
00364   }
00365 }
00366 
00392 static inline void nfft_uo(const nfft_plan *ths, const int j, int *up, int *op,
00393   const int act_dim)
00394 {
00395   const R xj = ths->x[j * ths->d + act_dim];
00396   int c = LRINT(FLOOR(xj * ths->n[act_dim]));
00397 
00398   (*up) = c - (ths->m);
00399   (*op) = c + 1 + (ths->m);
00400 }
00401 
00402 static inline void nfft_uo2(int *u, int *o, const R x, const int n, const int m)
00403 {
00404   int c = LRINT(FLOOR(x * n));
00405 
00406   *u = (c - m + n) % n;
00407   *o = (c + 1 + m + n) % n;
00408 }
00409 
00410 #define MACRO_nfft_D_compute_A                                                \
00411 {                                                                             \
00412   g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d];     \
00413 }
00414 
00415 #define MACRO_nfft_D_compute_T                                                \
00416 {                                                                             \
00417   f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d];     \
00418 }
00419 
00420 #define MACRO_nfft_D_init_result_A memset(g_hat,0,ths->n_total*sizeof(C));
00421 
00422 #define MACRO_nfft_D_init_result_T memset(f_hat,0,ths->N_total*sizeof(C));
00423 
00424 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
00425 
00426 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ks[t2]-(ths->N[t2]/2),t2));
00427 
00428 #define MACRO_init_k_ks                                                       \
00429 {                                                                             \
00430   for (t = ths->d-1; 0 <= t; t--)                                             \
00431   {                                                                           \
00432     kp[t] = k[t] = 0;                                                         \
00433     ks[t] = ths->N[t]/2;                                                      \
00434   }                                                                           \
00435   t++;                                                                        \
00436 }
00437 
00438 #define MACRO_update_c_phi_inv_k(which_one) \
00439 {                                                                             \
00440   for (t2 = t; t2 < ths->d; t2++)                                             \
00441   {                                                                           \
00442     c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one;                   \
00443     ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2];                        \
00444     k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2];                           \
00445   }                                                                           \
00446 }
00447 
00448 #define MACRO_count_k_ks                                                      \
00449 {                                                                             \
00450   for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--)                  \
00451   {                                                                           \
00452     kp[t] = k[t] = 0;                                                         \
00453     ks[t]= ths->N[t]/2;                                                       \
00454   }                                                                           \
00455                                                                               \
00456   kp[t]++; k[t]++; ks[t]++;                                                   \
00457   if(kp[t] == ths->N[t]/2)                                                    \
00458   {                                                                           \
00459     k[t] = ths->n[t] - ths->N[t]/2;                                           \
00460     ks[t] = 0;                                                                \
00461   }                                                                           \
00462 }                                                                             \
00463 
00464 
00468 #define MACRO_nfft_D(which_one)                                               \
00469 static inline void nfft_D_serial_ ## which_one (nfft_plan *ths)               \
00470 {                                                                             \
00471   C *f_hat, *g_hat;                     \
00472   R c_phi_inv_k[ths->d+1];              \
00473   int t, t2;                            \
00474   int k_L;                              \
00475   int kp[ths->d];                       \
00476   int k[ths->d];                        \
00477   int ks[ths->d];                       \
00478   int k_plain[ths->d+1];                \
00479   int ks_plain[ths->d+1];               \
00480                                                                               \
00481   f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;                             \
00482   MACRO_nfft_D_init_result_ ## which_one;                                     \
00483                                                                               \
00484   c_phi_inv_k[0] = K(1.0);                                                    \
00485   k_plain[0] = 0;                                                             \
00486   ks_plain[0] = 0;                                                            \
00487                                                                               \
00488   if (ths->nfft_flags & PRE_PHI_HUT)                                          \
00489   {                                                                           \
00490     MACRO_init_k_ks;                                                          \
00491                                                                               \
00492     for (k_L = 0; k_L < ths->N_total; k_L++)                                  \
00493     {                                                                         \
00494       MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT);                             \
00495       MACRO_nfft_D_compute_ ## which_one;                                     \
00496       MACRO_count_k_ks;                                                       \
00497     } /* for(k_L) */                                                          \
00498   } /* if(PRE_PHI_HUT) */                                                     \
00499   else                                                                        \
00500   {                                                                           \
00501     MACRO_init_k_ks;                                                          \
00502     for (k_L = 0; k_L < ths->N_total; k_L++)                                  \
00503     {                                                                         \
00504       MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT);                          \
00505       MACRO_nfft_D_compute_ ## which_one;                                     \
00506       MACRO_count_k_ks;                                                       \
00507     } /* for(k_L) */                                                          \
00508   } /* else(PRE_PHI_HUT) */                                                   \
00509 } /* nfft_D */
00510 
00511 #ifdef _OPENMP
00512 static void nfft_D_openmp_A(nfft_plan *ths)
00513 {
00514   C *f_hat, *g_hat;                     
00515   int k_L;                              
00517   f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
00518   memset(g_hat,0,ths->n_total*sizeof(C));
00519 
00520   if (ths->nfft_flags & PRE_PHI_HUT)
00521   {
00522     #pragma omp parallel for default(shared) private(k_L)
00523     for (k_L = 0; k_L < ths->N_total; k_L++)
00524     {
00525       int kp[ths->d];                        //0..N-1
00526       int k[ths->d];                        
00527       int ks[ths->d];                       
00528       R c_phi_inv_k_val = K(1.0);
00529       int k_plain_val = 0;
00530       int ks_plain_val = 0;
00531       int t;
00532       int k_temp = k_L;
00533 
00534       for (t = ths->d-1; t >= 0; t--)
00535       {
00536         kp[t] = k_temp % ths->N[t];
00537         if (kp[t] >= ths->N[t]/2)
00538           k[t] = ths->n[t] - ths->N[t] + kp[t];
00539         else
00540           k[t] = kp[t];
00541         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00542         k_temp /= ths->N[t];
00543       }
00544 
00545       for (t = 0; t < ths->d; t++)
00546       {
00547         c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
00548         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00549         k_plain_val = k_plain_val*ths->n[t] + k[t];
00550       }
00551 
00552       g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
00553     } /* for(k_L) */
00554   } /* if(PRE_PHI_HUT) */
00555   else
00556   {
00557     #pragma omp parallel for default(shared) private(k_L)
00558     for (k_L = 0; k_L < ths->N_total; k_L++)
00559     {
00560       int kp[ths->d];                        //0..N-1
00561       int k[ths->d];                        
00562       int ks[ths->d];                       
00563       R c_phi_inv_k_val = K(1.0);
00564       int k_plain_val = 0;
00565       int ks_plain_val = 0;
00566       int t;
00567       int k_temp = k_L;
00568 
00569       for (t = ths->d-1; t >= 0; t--)
00570       {
00571         kp[t] = k_temp % ths->N[t];
00572         if (kp[t] >= ths->N[t]/2)
00573           k[t] = ths->n[t] - ths->N[t] + kp[t];
00574         else
00575           k[t] = kp[t];
00576         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00577         k_temp /= ths->N[t];
00578       }
00579 
00580       for (t = 0; t < ths->d; t++)
00581       {
00582         c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->N[t]/2),t));
00583         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00584         k_plain_val = k_plain_val*ths->n[t] + k[t];
00585       }
00586 
00587       g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
00588     } /* for(k_L) */
00589   } /* else(PRE_PHI_HUT) */
00590 }
00591 #endif
00592 
00593 #ifndef _OPENMP
00594 MACRO_nfft_D(A)
00595 #endif
00596 
00597 static void nfft_D_A(nfft_plan *ths)
00598 {
00599 #ifdef _OPENMP
00600   nfft_D_openmp_A(ths);
00601 #else
00602   nfft_D_serial_A(ths);
00603 #endif
00604 }
00605 
00606 #ifdef _OPENMP
00607 static void nfft_D_openmp_T(nfft_plan *ths)
00608 {
00609   C *f_hat, *g_hat;                     
00610   int k_L;                              
00612   f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
00613   memset(f_hat,0,ths->N_total*sizeof(C));
00614 
00615   if (ths->nfft_flags & PRE_PHI_HUT)
00616   {
00617     #pragma omp parallel for default(shared) private(k_L)
00618     for (k_L = 0; k_L < ths->N_total; k_L++)
00619     {
00620       int kp[ths->d];                        //0..N-1
00621       int k[ths->d];                        
00622       int ks[ths->d];                       
00623       R c_phi_inv_k_val = K(1.0);
00624       int k_plain_val = 0;
00625       int ks_plain_val = 0;
00626       int t;
00627       int k_temp = k_L;
00628 
00629       for (t = ths->d-1; t >= 0; t--)
00630       {
00631         kp[t] = k_temp % ths->N[t];
00632         if (kp[t] >= ths->N[t]/2)
00633           k[t] = ths->n[t] - ths->N[t] + kp[t];
00634         else
00635           k[t] = kp[t];
00636         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00637         k_temp /= ths->N[t];
00638       }
00639 
00640       for (t = 0; t < ths->d; t++)
00641       {
00642         c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
00643         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00644         k_plain_val = k_plain_val*ths->n[t] + k[t];
00645       }
00646 
00647       f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
00648     } /* for(k_L) */
00649   } /* if(PRE_PHI_HUT) */
00650   else
00651   {
00652     #pragma omp parallel for default(shared) private(k_L)
00653     for (k_L = 0; k_L < ths->N_total; k_L++)
00654     {
00655       int kp[ths->d];                        //0..N-1
00656       int k[ths->d];                        
00657       int ks[ths->d];                       
00658       R c_phi_inv_k_val = K(1.0);
00659       int k_plain_val = 0;
00660       int ks_plain_val = 0;
00661       int t;
00662       int k_temp = k_L;
00663 
00664       for (t = ths->d-1; t >= 0; t--)
00665       {
00666         kp[t] = k_temp % ths->N[t];
00667         if (kp[t] >= ths->N[t]/2)
00668           k[t] = ths->n[t] - ths->N[t] + kp[t];
00669         else
00670           k[t] = kp[t];
00671         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00672         k_temp /= ths->N[t];
00673       }
00674 
00675       for (t = 0; t < ths->d; t++)
00676       {
00677         c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->N[t]/2),t));
00678         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00679         k_plain_val = k_plain_val*ths->n[t] + k[t];
00680       }
00681 
00682       f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
00683     } /* for(k_L) */
00684   } /* else(PRE_PHI_HUT) */
00685 }
00686 #endif
00687 
00688 #ifndef _OPENMP
00689 MACRO_nfft_D(T)
00690 #endif
00691 
00692 static void nfft_D_T(nfft_plan *ths)
00693 {
00694 #ifdef _OPENMP
00695   nfft_D_openmp_T(ths);
00696 #else
00697   nfft_D_serial_T(ths);
00698 #endif
00699 }
00700 
00704 #define MACRO_nfft_B_init_result_A  memset(f,0,ths->M_total*sizeof(C));
00705 #define MACRO_nfft_B_init_result_T memset(g,0,ths->n_total*sizeof(C));
00706 
00707 #define MACRO_nfft_B_PRE_FULL_PSI_compute_A                                   \
00708 {                                                                             \
00709   (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]];                            \
00710 }
00711 
00712 #define MACRO_nfft_B_PRE_FULL_PSI_compute_T                                   \
00713 {                                                                             \
00714   g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj);                            \
00715 }
00716 
00717 #define MACRO_nfft_B_compute_A                                                \
00718 {                                                                             \
00719   (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]];                            \
00720 }
00721 
00722 #define MACRO_nfft_B_compute_T                                                \
00723 {                                                                             \
00724   g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj);                            \
00725 }
00726 
00727 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
00728 
00729 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
00730 
00731 #define MACRO_without_PRE_PSI  PHI(ths->x[j*ths->d+t2]                        \
00732   - ((R)l[t2])/((R)ths->n[t2]), t2)
00733 
00734 #define MACRO_init_uo_l_lj_t                                                  \
00735 {                                                                             \
00736   for (t = ths->d-1; t >= 0; t--)                                             \
00737   {                                                                           \
00738     nfft_uo(ths,j,&u[t],&o[t],t);                                             \
00739     l[t] = u[t];                                                              \
00740     lj[t] = 0;                                                                \
00741   } /* for(t) */                                                              \
00742   t++;                                                                        \
00743 }
00744 
00745 #define MACRO_update_phi_prod_ll_plain(which_one) {                           \
00746   for(t2=t; t2<ths->d; t2++)                                                  \
00747     {                                                                         \
00748       phi_prod[t2+1] = phi_prod[t2]* MACRO_ ## which_one;                     \
00749       ll_plain[t2+1] = ll_plain[t2]*ths->n[t2] +(l[t2]+ths->n[t2])%ths->n[t2];\
00750     } /* for(t2) */                                                           \
00751 }
00752 
00753 #define MACRO_count_uo_l_lj_t {                                               \
00754   for(t = ths->d-1; (t > 0) && (l[t] == o[t]); t--)                           \
00755     {                                                                         \
00756       l[t] = u[t];                                                            \
00757       lj[t] = 0;                                                              \
00758     } /* for(t) */                                                            \
00759                                                                               \
00760   l[t]++;                                                                     \
00761   lj[t]++;                                                                    \
00762 }
00763 
00764 #define MACRO_nfft_B(which_one)                                               \
00765 static inline void nfft_B_serial_ ## which_one (nfft_plan *ths)               \
00766 {                                                                             \
00767   int lprod; /* 'regular bandwidth' of matrix B  */                           \
00768   int u[ths->d], o[ths->d]; /* multi band with respect to x_j */              \
00769   int t, t2; /* index dimensions */                                           \
00770   int j; /* index nodes */                                                    \
00771   int l_L, ix; /* index one row of B */                                       \
00772   int l[ths->d]; /* multi index u<=l<=o */                                    \
00773   int lj[ths->d]; /* multi index 0<=lj<u+o+1 */                               \
00774   int ll_plain[ths->d+1]; /* postfix plain index in g */                      \
00775   R phi_prod[ths->d+1]; /* postfix product of PHI */                          \
00776   C *f, *g; /* local copy */                                                  \
00777   C *fj; /* local copy */                                                     \
00778   R y[ths->d];                                                                \
00779   R fg_psi[ths->d][2*ths->m+2];                                               \
00780   R fg_exp_l[ths->d][2*ths->m+2];                                             \
00781   int l_fg,lj_fg;                                                             \
00782   R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3;                            \
00783   R ip_w;                                                                     \
00784   int ip_u;                                                                   \
00785   int ip_s = ths->K/(ths->m+2);                                               \
00786                                                                               \
00787   f = (C*)ths->f; g = (C*)ths->g;                                             \
00788                                                                               \
00789   MACRO_nfft_B_init_result_ ## which_one;                                     \
00790                                                                               \
00791   if (ths->nfft_flags & PRE_FULL_PSI)                                         \
00792   {                                                                           \
00793     for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++)                  \
00794       for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++)                   \
00795         MACRO_nfft_B_PRE_FULL_PSI_compute_ ## which_one;                      \
00796     return;                                                                   \
00797   }                                                                           \
00798                                                                               \
00799   phi_prod[0] = K(1.0);                                                       \
00800   ll_plain[0] = 0;                                                            \
00801                                                                               \
00802   for (t = 0, lprod = 1; t < ths->d; t++)                                     \
00803     lprod *= (2*ths->m+2);                                                    \
00804                                                                               \
00805   if (ths->nfft_flags & PRE_PSI)                                              \
00806   {                                                                           \
00807     for (j = 0, fj = f; j < ths->M_total; j++, fj++)                          \
00808     {                                                                         \
00809       MACRO_init_uo_l_lj_t;                                                   \
00810                                                                               \
00811       for (l_L = 0; l_L < lprod; l_L++)                                       \
00812       {                                                                       \
00813         MACRO_update_phi_prod_ll_plain(with_PRE_PSI);                         \
00814                                                                               \
00815         MACRO_nfft_B_compute_ ## which_one;                                   \
00816                                                                               \
00817         MACRO_count_uo_l_lj_t;                                                \
00818       } /* for(l_L) */                                                        \
00819     } /* for(j) */                                                            \
00820     return;                                                                   \
00821   } /* if(PRE_PSI) */                                                         \
00822                                                                               \
00823   if (ths->nfft_flags & PRE_FG_PSI)                                           \
00824   {                                                                           \
00825     for(t2 = 0; t2 < ths->d; t2++)                                            \
00826     {                                                                         \
00827       tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);                                      \
00828       tmpEXP2sq = tmpEXP2*tmpEXP2;                                            \
00829       tmp2 = K(1.0);                                                          \
00830       tmp3 = K(1.0);                                                          \
00831       fg_exp_l[t2][0] = K(1.0);                                               \
00832       for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)                          \
00833       {                                                                       \
00834         tmp3 = tmp2*tmpEXP2;                                                  \
00835         tmp2 *= tmpEXP2sq;                                                    \
00836         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;                     \
00837       }                                                                       \
00838     }                                                                         \
00839     for (j = 0, fj = f; j < ths->M_total; j++, fj++)                          \
00840     {                                                                         \
00841       MACRO_init_uo_l_lj_t;                                                   \
00842                                                                               \
00843       for (t2 = 0; t2 < ths->d; t2++)                                         \
00844       {                                                                       \
00845         fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];                            \
00846         tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];                                \
00847         tmp1 = K(1.0);                                                        \
00848         for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)       \
00849         {                                                                     \
00850           tmp1 *= tmpEXP1;                                                    \
00851           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];         \
00852         }                                                                     \
00853       }                                                                       \
00854                                                                               \
00855       for (l_L= 0; l_L < lprod; l_L++)                                        \
00856       {                                                                       \
00857         MACRO_update_phi_prod_ll_plain(with_FG_PSI);                          \
00858                                                                               \
00859         MACRO_nfft_B_compute_ ## which_one;                                   \
00860                                                                               \
00861         MACRO_count_uo_l_lj_t;                                                \
00862       } /* for(l_L) */                                                        \
00863     } /* for(j) */                                                            \
00864     return;                                                                   \
00865   } /* if(PRE_FG_PSI) */                                                      \
00866                                                                               \
00867   if (ths->nfft_flags & FG_PSI)                                               \
00868   {                                                                           \
00869     for (t2 = 0; t2 < ths->d; t2++)                                           \
00870     {                                                                         \
00871       tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);                                      \
00872       tmpEXP2sq = tmpEXP2*tmpEXP2;                                            \
00873       tmp2 = K(1.0);                                                          \
00874       tmp3 = K(1.0);                                                          \
00875       fg_exp_l[t2][0] = K(1.0);                                               \
00876       for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)                         \
00877       {                                                                       \
00878         tmp3 = tmp2*tmpEXP2;                                                  \
00879         tmp2 *= tmpEXP2sq;                                                    \
00880         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;                     \
00881       }                                                                       \
00882     }                                                                         \
00883     for (j = 0, fj = f; j < ths->M_total; j++, fj++)                          \
00884     {                                                                         \
00885       MACRO_init_uo_l_lj_t;                                                   \
00886                                                                               \
00887       for (t2 = 0; t2 < ths->d; t2++)                                         \
00888       {                                                                       \
00889         fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));\
00890                                                                               \
00891         tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])         \
00892           /ths->b[t2]);                                                       \
00893         tmp1 = K(1.0);                                                        \
00894         for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)     \
00895         {                                                                     \
00896           tmp1 *= tmpEXP1;                                                    \
00897           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];         \
00898         }                                                                     \
00899       }                                                                       \
00900                                                                               \
00901       for (l_L = 0; l_L < lprod; l_L++)                                       \
00902       {                                                                       \
00903         MACRO_update_phi_prod_ll_plain(with_FG_PSI);                          \
00904                                                                               \
00905         MACRO_nfft_B_compute_ ## which_one;                                   \
00906                                                                               \
00907         MACRO_count_uo_l_lj_t;                                                \
00908       } /* for(l_L) */                                                        \
00909     } /* for(j) */                                                            \
00910     return;                                                                   \
00911   } /* if(FG_PSI) */                                                          \
00912                                                                               \
00913   if (ths->nfft_flags & PRE_LIN_PSI)                                          \
00914   {                                                                           \
00915     for (j = 0, fj=f; j<ths->M_total; j++, fj++)                              \
00916     {                                                                         \
00917       MACRO_init_uo_l_lj_t;                                                   \
00918                                                                               \
00919       for (t2 = 0; t2 < ths->d; t2++)                                         \
00920       {                                                                       \
00921         y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])                    \
00922           * ((R)ths->K))/(ths->m+2);                                          \
00923         ip_u  = LRINT(FLOOR(y[t2]));                                          \
00924         ip_w  = y[t2]-ip_u;                                                   \
00925         for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)         \
00926         {                                                                     \
00927           fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]  \
00928             * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]     \
00929             * (ip_w);                                                         \
00930         }                                                                     \
00931       }                                                                       \
00932                                                                               \
00933       for (l_L = 0; l_L < lprod; l_L++)                                       \
00934       {                                                                       \
00935         MACRO_update_phi_prod_ll_plain(with_FG_PSI);                          \
00936                                                                               \
00937         MACRO_nfft_B_compute_ ## which_one;                                   \
00938                                                                               \
00939         MACRO_count_uo_l_lj_t;                                                \
00940       } /* for(l_L) */                                                        \
00941     } /* for(j) */                                                            \
00942     return;                                                                   \
00943   } /* if(PRE_LIN_PSI) */                                                     \
00944                                                                               \
00945   /* no precomputed psi at all */                                             \
00946   for (j = 0, fj = f; j < ths->M_total; j++, fj++)                            \
00947   {                                                                           \
00948     MACRO_init_uo_l_lj_t;                                                     \
00949                                                                               \
00950     for (l_L = 0; l_L < lprod; l_L++)                                         \
00951     {                                                                         \
00952       MACRO_update_phi_prod_ll_plain(without_PRE_PSI);                        \
00953                                                                               \
00954       MACRO_nfft_B_compute_ ## which_one;                                     \
00955                                                                               \
00956       MACRO_count_uo_l_lj_t;                                                  \
00957     } /* for(l_L) */                                                          \
00958   } /* for(j) */                                                              \
00959 } /* nfft_B */                                                                \
00960 
00961 #ifndef _OPENMP
00962 MACRO_nfft_B(A)
00963 #endif
00964 
00965 #ifdef _OPENMP
00966 static inline void nfft_B_openmp_A (nfft_plan *ths)
00967 {
00968   int lprod; /* 'regular bandwidth' of matrix B  */
00969   int k;
00970 
00971   memset(ths->f,0,ths->M_total*sizeof(C));
00972 
00973   for (k = 0, lprod = 1; k < ths->d; k++)
00974     lprod *= (2*ths->m+2);
00975 
00976   if (ths->nfft_flags & PRE_FULL_PSI)
00977   {
00978     #pragma omp parallel for default(shared) private(k)
00979     for (k = 0; k < ths->M_total; k++)
00980     {
00981       int l;
00982       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
00983       ths->f[j] = K(0.0);
00984       for (l = 0; l < lprod; l++)
00985         ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
00986     }
00987     return;
00988   }
00989 
00990   if (ths->nfft_flags & PRE_PSI)
00991   {
00992     #pragma omp parallel for default(shared) private(k)
00993     for (k = 0; k < ths->M_total; k++)
00994     {
00995       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
00996       int t, t2; /* index dimensions */
00997       int l_L; /* index one row of B */
00998       int l[ths->d]; /* multi index u<=l<=o */
00999       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01000       int ll_plain[ths->d+1]; /* postfix plain index in g */
01001       R phi_prod[ths->d+1]; /* postfix product of PHI */
01002       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01003 
01004       phi_prod[0] = K(1.0);
01005       ll_plain[0] = 0;
01006 
01007       MACRO_init_uo_l_lj_t;
01008 
01009       for (l_L = 0; l_L < lprod; l_L++)
01010       {
01011         MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
01012 
01013         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01014 
01015         MACRO_count_uo_l_lj_t;
01016       } /* for(l_L) */
01017     } /* for(j) */
01018     return;
01019   } /* if(PRE_PSI) */
01020 
01021   if (ths->nfft_flags & PRE_FG_PSI)
01022   {
01023     int t, t2; /* index dimensions */
01024     R fg_exp_l[ths->d][2*ths->m+2];
01025 
01026     for(t2 = 0; t2 < ths->d; t2++)
01027     {
01028       int lj_fg;
01029       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01030       R tmpEXP2sq = tmpEXP2*tmpEXP2;
01031       R tmp2 = K(1.0);
01032       R tmp3 = K(1.0);
01033       fg_exp_l[t2][0] = K(1.0);
01034       for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01035       {
01036         tmp3 = tmp2*tmpEXP2;
01037         tmp2 *= tmpEXP2sq;
01038         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01039       }
01040     }
01041 
01042     #pragma omp parallel for default(shared) private(k,t,t2)
01043     for (k = 0; k < ths->M_total; k++)
01044     {
01045       int ll_plain[ths->d+1]; /* postfix plain index in g */
01046       R phi_prod[ths->d+1]; /* postfix product of PHI */
01047       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01048       int l[ths->d]; /* multi index u<=l<=o */
01049       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01050       R fg_psi[ths->d][2*ths->m+2];
01051       R tmpEXP1, tmp1;
01052       int l_fg,lj_fg;
01053       int l_L;
01054       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01055 
01056       phi_prod[0] = K(1.0);
01057       ll_plain[0] = 0;
01058 
01059       MACRO_init_uo_l_lj_t;
01060 
01061       for (t2 = 0; t2 < ths->d; t2++)
01062       {
01063         fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
01064         tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
01065         tmp1 = K(1.0);
01066         for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01067         {
01068           tmp1 *= tmpEXP1;
01069           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01070         }
01071       }
01072 
01073       for (l_L= 0; l_L < lprod; l_L++)
01074       {
01075         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01076 
01077         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01078 
01079         MACRO_count_uo_l_lj_t;
01080       } /* for(l_L) */
01081     } /* for(j) */
01082     return;
01083   } /* if(PRE_FG_PSI) */
01084 
01085   if (ths->nfft_flags & FG_PSI)
01086   {
01087     int t, t2; /* index dimensions */
01088     R fg_exp_l[ths->d][2*ths->m+2];
01089 
01090     nfft_sort_nodes(ths);
01091 
01092     for (t2 = 0; t2 < ths->d; t2++)
01093     {
01094       int lj_fg;
01095       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01096       R tmpEXP2sq = tmpEXP2*tmpEXP2;
01097       R tmp2 = K(1.0);
01098       R tmp3 = K(1.0);
01099       fg_exp_l[t2][0] = K(1.0);
01100       for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01101       {
01102         tmp3 = tmp2*tmpEXP2;
01103         tmp2 *= tmpEXP2sq;
01104         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01105       }
01106     }
01107 
01108     #pragma omp parallel for default(shared) private(k,t,t2)
01109     for (k = 0; k < ths->M_total; k++)
01110     {
01111       int ll_plain[ths->d+1]; /* postfix plain index in g */
01112       R phi_prod[ths->d+1]; /* postfix product of PHI */
01113       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01114       int l[ths->d]; /* multi index u<=l<=o */
01115       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01116       R fg_psi[ths->d][2*ths->m+2];
01117       R tmpEXP1, tmp1;
01118       int l_fg,lj_fg;
01119       int l_L;
01120       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01121 
01122       phi_prod[0] = K(1.0);
01123       ll_plain[0] = 0;
01124 
01125       MACRO_init_uo_l_lj_t;
01126 
01127       for (t2 = 0; t2 < ths->d; t2++)
01128       {
01129         fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
01130 
01131         tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
01132           /ths->b[t2]);
01133         tmp1 = K(1.0);
01134         for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01135         {
01136           tmp1 *= tmpEXP1;
01137           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01138         }
01139       }
01140 
01141       for (l_L = 0; l_L < lprod; l_L++)
01142       {
01143         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01144 
01145         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01146 
01147         MACRO_count_uo_l_lj_t;
01148       } /* for(l_L) */
01149     } /* for(j) */
01150     return;
01151   } /* if(FG_PSI) */
01152 
01153   if (ths->nfft_flags & PRE_LIN_PSI)
01154   {
01155     nfft_sort_nodes(ths);
01156 
01157     #pragma omp parallel for default(shared) private(k)
01158     for (k = 0; k<ths->M_total; k++)
01159     {
01160       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01161       int t, t2; /* index dimensions */
01162       int l_L; /* index one row of B */
01163       int l[ths->d]; /* multi index u<=l<=o */
01164       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01165       int ll_plain[ths->d+1]; /* postfix plain index in g */
01166       R phi_prod[ths->d+1]; /* postfix product of PHI */
01167       R y[ths->d];
01168       R fg_psi[ths->d][2*ths->m+2];
01169       int l_fg,lj_fg;
01170       R ip_w;
01171       int ip_u;
01172       int ip_s = ths->K/(ths->m+2);
01173       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01174 
01175       phi_prod[0] = K(1.0);
01176       ll_plain[0] = 0;
01177 
01178       MACRO_init_uo_l_lj_t;
01179 
01180       for (t2 = 0; t2 < ths->d; t2++)
01181       {
01182         y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
01183           * ((R)ths->K))/(ths->m+2);
01184         ip_u  = LRINT(FLOOR(y[t2]));
01185         ip_w  = y[t2]-ip_u;
01186         for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
01187         {
01188           fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
01189             * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
01190             * (ip_w);
01191         }
01192       }
01193 
01194       for (l_L = 0; l_L < lprod; l_L++)
01195       {
01196         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01197 
01198         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01199 
01200         MACRO_count_uo_l_lj_t;
01201       } /* for(l_L) */
01202     } /* for(j) */
01203     return;
01204   } /* if(PRE_LIN_PSI) */
01205 
01206   /* no precomputed psi at all */
01207   nfft_sort_nodes(ths);
01208 
01209   #pragma omp parallel for default(shared) private(k)
01210   for (k = 0; k < ths->M_total; k++)
01211   {
01212     int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01213     int t, t2; /* index dimensions */
01214     int l_L; /* index one row of B */
01215     int l[ths->d]; /* multi index u<=l<=o */
01216     int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01217     int ll_plain[ths->d+1]; /* postfix plain index in g */
01218     R phi_prod[ths->d+1]; /* postfix product of PHI */
01219     int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01220 
01221     phi_prod[0] = K(1.0);
01222     ll_plain[0] = 0;
01223 
01224     MACRO_init_uo_l_lj_t;
01225 
01226     for (l_L = 0; l_L < lprod; l_L++)
01227     {
01228       MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
01229 
01230       ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01231 
01232       MACRO_count_uo_l_lj_t;
01233     } /* for(l_L) */
01234   } /* for(j) */
01235 }
01236 #endif
01237 
01238 static void nfft_B_A(nfft_plan *ths)
01239 {
01240 #ifdef _OPENMP
01241   nfft_B_openmp_A(ths);
01242 #else
01243   nfft_B_serial_A(ths);
01244 #endif
01245 }
01246 
01247 #ifdef _OPENMP
01248 
01263 static inline int index_x_binary_search(const int *ar_x, const int len, const int key)
01264 {
01265   int left = 0, right = len - 1;
01266 
01267   if (len == 1)
01268     return 0;
01269 
01270   while (left < right - 1)
01271   {
01272     int i = (left + right) / 2;
01273     if (ar_x[2*i] >= key)
01274       right = i;
01275     else if (ar_x[2*i] < key)
01276       left = i;
01277   }
01278 
01279   if (ar_x[2*left] < key && left != len-1)
01280     return left+1;
01281 
01282   return left;
01283 }
01284 #endif
01285 
01286 #ifdef _OPENMP
01287 
01302 static void nfft_adjoint_B_omp_blockwise_init(int *my_u0, int *my_o0, int *min_u_a, int *max_u_a, int *min_u_b, int *max_u_b, const int d, const int *n, const int m)
01303 {
01304   const int n0 = n[0];
01305   int k;
01306   int nthreads = omp_get_num_threads();
01307   int nthreads_used = MIN(nthreads, n0);
01308   int size_per_thread = n0 / nthreads_used;
01309   int size_left = n0 - size_per_thread * nthreads_used;
01310   int size_g[nthreads_used];
01311   int offset_g[nthreads_used];
01312   int my_id = omp_get_thread_num();
01313   int n_prod_rest = 1;
01314 
01315   for (k = 1; k < d; k++)
01316     n_prod_rest *= n[k];
01317 
01318   *min_u_a = -1;
01319   *max_u_a = -1;
01320   *min_u_b = -1;
01321   *max_u_b = -1;
01322   *my_u0 = -1;
01323   *my_o0 = -1;
01324 
01325   if (my_id < nthreads_used)
01326   {
01327     const int m22 = 2 * m + 2;
01328 
01329     offset_g[0] = 0;
01330     for (k = 0; k < nthreads_used; k++)
01331     {
01332       if (k > 0)
01333         offset_g[k] = offset_g[k-1] + size_g[k-1];
01334       size_g[k] = size_per_thread;
01335       if (size_left > 0)
01336       {
01337         size_g[k]++;
01338         size_left--;
01339       }
01340     }
01341 
01342     *my_u0 = offset_g[my_id];
01343     *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
01344 
01345     if (nthreads_used > 1)
01346     {
01347       *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
01348       *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
01349     }
01350     else
01351     {
01352       *min_u_a = 0;
01353       *max_u_a = n_prod_rest * n0 - 1;
01354     }
01355 
01356     if (*min_u_a < 0)
01357     {
01358       *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
01359       *max_u_b = n_prod_rest * n0 - 1;
01360       *min_u_a = 0;
01361     }
01362 
01363     if (*min_u_b != -1 && *min_u_b <= *max_u_a)
01364     {
01365       *max_u_a = *max_u_b;
01366       *min_u_b = -1;
01367       *max_u_b = -1;
01368     }
01369 #ifdef OMP_ASSERT
01370     assert(*min_u_a <= *max_u_a);
01371     assert(*min_u_b <= *max_u_b);
01372     assert(*min_u_b == -1 || *max_u_a < *min_u_b);
01373 #endif
01374   }
01375 }
01376 #endif
01377 
01386 static void nfft_adjoint_B_compute_full_psi(
01387     C *g, const int *psi_index_g, const R *psi, const C *f,
01388     const int M, const int d, const int *n, const int m, const int nfft_flags, const int *index_x)
01389 {
01390   int k;
01391   int lprod, lprod_m1;
01392   {
01393     int t;
01394     for(t = 0, lprod = 1; t < d; t++)
01395         lprod *= 2 * m + 2;
01396   }
01397   lprod_m1 = lprod / (2 * m + 2);
01398 
01399 #ifdef _OPENMP
01400   if (nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
01401   {
01402     #pragma omp parallel private(k)
01403     {
01404       int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
01405       const int *ar_x = index_x;
01406       int n_prod_rest = 1;
01407 
01408       for (k = 1; k < d; k++)
01409         n_prod_rest *= n[k];
01410 
01411       nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
01412 
01413       if (min_u_a != -1)
01414       {
01415         k = index_x_binary_search(ar_x, M, min_u_a);
01416 #ifdef OMP_ASSERT
01417         assert(ar_x[2*k] >= min_u_a || k == M-1);
01418         if (k > 0)
01419           assert(ar_x[2*k-2] < min_u_a);
01420 #endif
01421         while (k < M)
01422         {
01423           int l0, lrest;
01424           int u_prod = ar_x[2*k];
01425           int j = ar_x[2*k+1];
01426 
01427           if (u_prod < min_u_a || u_prod > max_u_a)
01428             break;
01429 
01430           for (l0 = 0; l0 < 2 * m + 2; l0++)
01431           {
01432             const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
01433 
01434             if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
01435               continue;
01436 
01437             for (lrest = 0; lrest < lprod_m1; lrest++)
01438             {
01439               const int l = l0 * lprod_m1 + lrest;
01440               g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01441             }
01442           }
01443 
01444           k++;
01445         }
01446       }
01447 
01448       if (min_u_b != -1)
01449       {
01450         k = index_x_binary_search(ar_x, M, min_u_b);
01451 #ifdef OMP_ASSERT
01452         assert(ar_x[2*k] >= min_u_b || k == M-1);
01453         if (k > 0)
01454           assert(ar_x[2*k-2] < min_u_b);
01455 #endif
01456         while (k < M)
01457         {
01458           int l0, lrest;
01459           int u_prod = ar_x[2*k];
01460           int j = ar_x[2*k+1];
01461 
01462           if (u_prod < min_u_b || u_prod > max_u_b)
01463             break;
01464 
01465           for (l0 = 0; l0 < 2 * m + 2; l0++)
01466           {
01467             const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
01468 
01469             if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
01470               continue;
01471             for (lrest = 0; lrest < lprod_m1; lrest++)
01472             {
01473               const int l = l0 * lprod_m1 + lrest;
01474               g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01475             }
01476           }
01477 
01478           k++;
01479         }
01480       }
01481     } /* omp parallel */
01482     return;
01483   } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
01484 #endif
01485 
01486   #pragma omp parallel for default(shared) private(k)
01487   for (k = 0; k < M; k++)
01488   {
01489     int l;
01490     int j = (nfft_flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
01491 
01492     for (l = 0; l < lprod; l++)
01493     {
01494 #ifdef _OPENMP
01495       C val = psi[j * lprod + l] * f[j];
01496       C *gref = g + psi_index_g[j * lprod + l];
01497       R *gref_real = (R*) gref;
01498 
01499       #pragma omp atomic
01500       gref_real[0] += creal(val);
01501 
01502       #pragma omp atomic
01503       gref_real[1] += cimag(val);
01504 #else
01505       g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01506 #endif
01507     }
01508   }
01509 }
01510 
01511 #ifndef _OPENMP
01512 MACRO_nfft_B(T)
01513 #endif
01514 
01515 #ifdef _OPENMP
01516 static inline void nfft_B_openmp_T(nfft_plan *ths)
01517 {
01518   int lprod; /* 'regular bandwidth' of matrix B  */
01519   int k;
01520 
01521   memset(ths->g,0,ths->n_total*sizeof(C));
01522 
01523   for (k = 0, lprod = 1; k < ths->d; k++)
01524     lprod *= (2*ths->m+2);
01525 
01526   if (ths->nfft_flags & PRE_FULL_PSI)
01527   {
01528     nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f, ths->M_total,
01529             ths->d, ths->n, ths->m, ths->nfft_flags, ths->index_x);
01530     return;
01531   }
01532 
01533   if (ths->nfft_flags & PRE_PSI)
01534   {
01535     #pragma omp parallel for default(shared) private(k)
01536     for (k = 0; k < ths->M_total; k++)
01537     {
01538       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01539       int t, t2; /* index dimensions */
01540       int l_L; /* index one row of B */
01541       int l[ths->d]; /* multi index u<=l<=o */
01542       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01543       int ll_plain[ths->d+1]; /* postfix plain index in g */
01544       R phi_prod[ths->d+1]; /* postfix product of PHI */
01545       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01546 
01547       phi_prod[0] = K(1.0);
01548       ll_plain[0] = 0;
01549 
01550       MACRO_init_uo_l_lj_t;
01551 
01552       for (l_L = 0; l_L < lprod; l_L++)
01553       {
01554         C *lhs;
01555         R *lhs_real;
01556         C val;
01557 
01558         MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
01559 
01560         lhs = ths->g + ll_plain[ths->d];
01561         lhs_real = (R*)lhs;
01562         val = phi_prod[ths->d] * ths->f[j];
01563 
01564         #pragma omp atomic
01565         lhs_real[0] += creal(val);
01566 
01567         #pragma omp atomic
01568         lhs_real[1] += cimag(val);
01569 
01570         MACRO_count_uo_l_lj_t;
01571       } /* for(l_L) */
01572     } /* for(j) */
01573     return;
01574   } /* if(PRE_PSI) */
01575 
01576   if (ths->nfft_flags & PRE_FG_PSI)
01577   {
01578     int t, t2; /* index dimensions */
01579     R fg_exp_l[ths->d][2*ths->m+2];
01580     for(t2 = 0; t2 < ths->d; t2++)
01581     {
01582       int lj_fg;
01583       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01584       R tmpEXP2sq = tmpEXP2*tmpEXP2;
01585       R tmp2 = K(1.0);
01586       R tmp3 = K(1.0);
01587       fg_exp_l[t2][0] = K(1.0);
01588       for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01589       {
01590         tmp3 = tmp2*tmpEXP2;
01591         tmp2 *= tmpEXP2sq;
01592         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01593       }
01594     }
01595 
01596     #pragma omp parallel for default(shared) private(k,t,t2)
01597     for (k = 0; k < ths->M_total; k++)
01598     {
01599       int ll_plain[ths->d+1]; /* postfix plain index in g */
01600       R phi_prod[ths->d+1]; /* postfix product of PHI */
01601       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01602       int l[ths->d]; /* multi index u<=l<=o */
01603       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01604       R fg_psi[ths->d][2*ths->m+2];
01605       R tmpEXP1, tmp1;
01606       int l_fg,lj_fg;
01607       int l_L;
01608       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01609 
01610       phi_prod[0] = K(1.0);
01611       ll_plain[0] = 0;
01612 
01613       MACRO_init_uo_l_lj_t;
01614 
01615       for (t2 = 0; t2 < ths->d; t2++)
01616       {
01617         fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
01618         tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
01619         tmp1 = K(1.0);
01620         for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01621         {
01622           tmp1 *= tmpEXP1;
01623           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01624         }
01625       }
01626 
01627       for (l_L= 0; l_L < lprod; l_L++)
01628       {
01629         C *lhs;
01630         R *lhs_real;
01631         C val;
01632 
01633         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01634 
01635         lhs = ths->g + ll_plain[ths->d];
01636         lhs_real = (R*)lhs;
01637         val = phi_prod[ths->d] * ths->f[j];
01638 
01639         #pragma omp atomic
01640         lhs_real[0] += creal(val);
01641 
01642         #pragma omp atomic
01643         lhs_real[1] += cimag(val);
01644 
01645         MACRO_count_uo_l_lj_t;
01646       } /* for(l_L) */
01647     } /* for(j) */
01648     return;
01649   } /* if(PRE_FG_PSI) */
01650 
01651   if (ths->nfft_flags & FG_PSI)
01652   {
01653     int t, t2; /* index dimensions */
01654     R fg_exp_l[ths->d][2*ths->m+2];
01655 
01656     nfft_sort_nodes(ths);
01657 
01658     for (t2 = 0; t2 < ths->d; t2++)
01659     {
01660       int lj_fg;
01661       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01662       R tmpEXP2sq = tmpEXP2*tmpEXP2;
01663       R tmp2 = K(1.0);
01664       R tmp3 = K(1.0);
01665       fg_exp_l[t2][0] = K(1.0);
01666       for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01667       {
01668         tmp3 = tmp2*tmpEXP2;
01669         tmp2 *= tmpEXP2sq;
01670         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01671       }
01672     }
01673 
01674     #pragma omp parallel for default(shared) private(k,t,t2)
01675     for (k = 0; k < ths->M_total; k++)
01676     {
01677       int ll_plain[ths->d+1]; /* postfix plain index in g */
01678       R phi_prod[ths->d+1]; /* postfix product of PHI */
01679       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01680       int l[ths->d]; /* multi index u<=l<=o */
01681       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01682       R fg_psi[ths->d][2*ths->m+2];
01683       R tmpEXP1, tmp1;
01684       int l_fg,lj_fg;
01685       int l_L;
01686       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01687 
01688       phi_prod[0] = K(1.0);
01689       ll_plain[0] = 0;
01690 
01691       MACRO_init_uo_l_lj_t;
01692 
01693       for (t2 = 0; t2 < ths->d; t2++)
01694       {
01695         fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
01696 
01697         tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
01698           /ths->b[t2]);
01699         tmp1 = K(1.0);
01700         for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01701         {
01702           tmp1 *= tmpEXP1;
01703           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01704         }
01705       }
01706 
01707       for (l_L = 0; l_L < lprod; l_L++)
01708       {
01709         C *lhs;
01710         R *lhs_real;
01711         C val;
01712 
01713         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01714 
01715         lhs = ths->g + ll_plain[ths->d];
01716         lhs_real = (R*)lhs;
01717         val = phi_prod[ths->d] * ths->f[j];
01718 
01719         #pragma omp atomic
01720         lhs_real[0] += creal(val);
01721 
01722         #pragma omp atomic
01723         lhs_real[1] += cimag(val);
01724 
01725         MACRO_count_uo_l_lj_t;
01726       } /* for(l_L) */
01727     } /* for(j) */
01728     return;
01729   } /* if(FG_PSI) */
01730 
01731   if (ths->nfft_flags & PRE_LIN_PSI)
01732   {
01733     nfft_sort_nodes(ths);
01734 
01735     #pragma omp parallel for default(shared) private(k)
01736     for (k = 0; k<ths->M_total; k++)
01737     {
01738       int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01739       int t, t2; /* index dimensions */
01740       int l_L; /* index one row of B */
01741       int l[ths->d]; /* multi index u<=l<=o */
01742       int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01743       int ll_plain[ths->d+1]; /* postfix plain index in g */
01744       R phi_prod[ths->d+1]; /* postfix product of PHI */
01745       R y[ths->d];
01746       R fg_psi[ths->d][2*ths->m+2];
01747       int l_fg,lj_fg;
01748       R ip_w;
01749       int ip_u;
01750       int ip_s = ths->K/(ths->m+2);
01751       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01752 
01753       phi_prod[0] = K(1.0);
01754       ll_plain[0] = 0;
01755 
01756       MACRO_init_uo_l_lj_t;
01757 
01758       for (t2 = 0; t2 < ths->d; t2++)
01759       {
01760         y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
01761           * ((R)ths->K))/(ths->m+2);
01762         ip_u  = LRINT(FLOOR(y[t2]));
01763         ip_w  = y[t2]-ip_u;
01764         for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
01765         {
01766           fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
01767             * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
01768             * (ip_w);
01769         }
01770       }
01771 
01772       for (l_L = 0; l_L < lprod; l_L++)
01773       {
01774         C *lhs;
01775         R *lhs_real;
01776         C val;
01777 
01778         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01779 
01780         lhs = ths->g + ll_plain[ths->d];
01781         lhs_real = (R*)lhs;
01782         val = phi_prod[ths->d] * ths->f[j];
01783 
01784         #pragma omp atomic
01785         lhs_real[0] += creal(val);
01786 
01787         #pragma omp atomic
01788         lhs_real[1] += cimag(val);
01789 
01790         MACRO_count_uo_l_lj_t;
01791       } /* for(l_L) */
01792     } /* for(j) */
01793     return;
01794   } /* if(PRE_LIN_PSI) */
01795 
01796   /* no precomputed psi at all */
01797   nfft_sort_nodes(ths);
01798 
01799   #pragma omp parallel for default(shared) private(k)
01800   for (k = 0; k < ths->M_total; k++)
01801   {
01802     int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01803     int t, t2; /* index dimensions */
01804     int l_L; /* index one row of B */
01805     int l[ths->d]; /* multi index u<=l<=o */
01806     int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01807     int ll_plain[ths->d+1]; /* postfix plain index in g */
01808     R phi_prod[ths->d+1]; /* postfix product of PHI */
01809     int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01810 
01811     phi_prod[0] = K(1.0);
01812     ll_plain[0] = 0;
01813 
01814     MACRO_init_uo_l_lj_t;
01815 
01816     for (l_L = 0; l_L < lprod; l_L++)
01817     {
01818       C *lhs;
01819       R *lhs_real;
01820       C val;
01821 
01822       MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
01823 
01824       lhs = ths->g + ll_plain[ths->d];
01825       lhs_real = (R*)lhs;
01826       val = phi_prod[ths->d] * ths->f[j];
01827 
01828       #pragma omp atomic
01829       lhs_real[0] += creal(val);
01830 
01831       #pragma omp atomic
01832       lhs_real[1] += cimag(val);
01833 
01834       MACRO_count_uo_l_lj_t;
01835     } /* for(l_L) */
01836   } /* for(j) */
01837 }
01838 #endif
01839 
01840 static void nfft_B_T(nfft_plan *ths)
01841 {
01842 #ifdef _OPENMP
01843   nfft_B_openmp_T(ths);
01844 #else
01845   nfft_B_serial_T(ths);
01846 #endif
01847 }
01848 
01849 /* ## specialized version for d=1  ########################################### */
01850 
01851 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
01852 {
01853   const int tmp2 = 2*m+2;
01854   int l;
01855   R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
01856 
01857   fg_exp_b0 = EXP(K(-1.0)/b);
01858   fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
01859   fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
01860 
01861   for (l = 1; l < tmp2; l++)
01862   {
01863     fg_exp_b2 = fg_exp_b1*fg_exp_b0;
01864     fg_exp_b1 *= fg_exp_b0_sq;
01865     fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
01866   }
01867 }
01868 
01869 
01870 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
01871   const R *xj, const int n, const int m)
01872 {
01873   int u, o, l;
01874   const C *gj;
01875   const R *psij;
01876   psij = psij_const;
01877 
01878   nfft_uo2(&u, &o, *xj, n, m);
01879 
01880   if (u < o)
01881   {
01882     for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
01883       (*fj) += (*psij++) * (*gj++);
01884   }
01885   else
01886   {
01887     for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
01888       (*fj) += (*psij++) * (*gj++);
01889     for (l = 0, gj = g; l <= o; l++)
01890       (*fj) += (*psij++) * (*gj++);
01891   }
01892 }
01893 
01894 #ifndef _OPENMP
01895 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,const R *psij_const,
01896   const R *xj, const int n, const int m)
01897 {
01898   int u,o,l;
01899   C *gj;
01900   const R *psij;
01901   psij=psij_const;
01902 
01903   nfft_uo2(&u,&o,*xj, n, m);
01904 
01905   if(u<o)
01906   {
01907     for (l = 0, gj = g+u; l <= 2*m+1; l++)
01908       (*gj++) += (*psij++) * (*fj);
01909   }
01910   else
01911   {
01912     for (l = 0, gj = g+u; l < 2*m+1-o; l++)
01913       (*gj++) += (*psij++) * (*fj);
01914     for (l = 0, gj = g; l <= o; l++)
01915       (*gj++) += (*psij++) * (*fj);
01916   }
01917 }
01918 #endif
01919 
01920 #ifdef _OPENMP
01921 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */
01922 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,const R *psij_const,
01923   const R *xj, const int n, const int m)
01924 {
01925   int u,o,l;
01926   C *gj;
01927   int index_temp[2*m+2];
01928 
01929   nfft_uo2(&u,&o,*xj, n, m);
01930 
01931   for (l=0; l<=2*m+1; l++)
01932     index_temp[l] = (l+u)%n;
01933 
01934   for (l = 0, gj = g+u; l <= 2*m+1; l++)
01935   {
01936     int i = index_temp[l];
01937     C *lhs = g+i;
01938     R *lhs_real = (R*)lhs;
01939     C val = psij_const[l] * f;
01940     #pragma omp atomic
01941     lhs_real[0] += creal(val);
01942 
01943     #pragma omp atomic
01944     lhs_real[1] += cimag(val);
01945   }
01946 }
01947 #endif
01948 
01949 #ifdef _OPENMP
01950 
01965 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,const R *psij_const,
01966   const R *xj, const int n, const int m, const int my_u0, const int my_o0)
01967 {
01968   int ar_u,ar_o,l;
01969 
01970   nfft_uo2(&ar_u,&ar_o,*xj, n, m);
01971 
01972   if(ar_u<ar_o)
01973   {
01974     int u = MAX(my_u0,ar_u);
01975     int o = MIN(my_o0,ar_o);
01976     int offset_psij = u-ar_u;
01977 #ifdef OMP_ASSERT
01978     assert(offset_psij >= 0);
01979     assert(o-u <= 2*m+1);
01980     assert(offset_psij+o-u <= 2*m+1);
01981 #endif
01982 
01983     for (l = 0; l <= o-u; l++)
01984       g[u+l] += psij_const[offset_psij+l] * f;
01985   }
01986   else
01987   {
01988     int u = MAX(my_u0,ar_u);
01989     int o = my_o0;
01990     int offset_psij = u-ar_u;
01991 #ifdef OMP_ASSERT
01992     assert(offset_psij >= 0);
01993     assert(o-u <= 2*m+1);
01994     assert(offset_psij+o-u <= 2*m+1);
01995 #endif
01996 
01997     for (l = 0; l <= o-u; l++)
01998       g[u+l] += psij_const[offset_psij+l] * f;
01999 
02000     u = my_u0;
02001     o = MIN(my_o0,ar_o);
02002     offset_psij += my_u0-ar_u+n;
02003 
02004 #ifdef OMP_ASSERT
02005     if (u<=o)
02006     {
02007       assert(o-u <= 2*m+1);
02008       if (offset_psij+o-u > 2*m+1)
02009       {
02010         fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
02011       }
02012       assert(offset_psij+o-u <= 2*m+1);
02013     }
02014 #endif
02015     for (l = 0; l <= o-u; l++)
02016       g[u+l] += psij_const[offset_psij+l] * f;
02017   }
02018 }
02019 #endif
02020 
02021 static void nfft_trafo_1d_B(nfft_plan *ths)
02022 {
02023   const int n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
02024   const C *g = (C*)ths->g;
02025 
02026   if (ths->nfft_flags & PRE_FULL_PSI)
02027   {
02028     int k;
02029     #pragma omp parallel for default(shared) private(k)
02030     for (k = 0; k < M; k++)
02031     {
02032       int l;
02033       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02034       ths->f[j] = K(0.0);
02035       for (l = 0; l < m2p2; l++)
02036         ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
02037     }
02038     return;
02039   } /* if(PRE_FULL_PSI) */
02040 
02041   if (ths->nfft_flags & PRE_PSI)
02042   {
02043     int k;
02044     #pragma omp parallel for default(shared) private(k)
02045     for (k = 0; k < M; k++)
02046     {
02047       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02048       nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
02049         &ths->x[j], n, m);
02050     }
02051     return;
02052   } /* if(PRE_PSI) */
02053 
02054   if (ths->nfft_flags & PRE_FG_PSI)
02055   {
02056     int k;
02057     R fg_exp_l[m2p2];
02058 
02059     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02060 
02061     #pragma omp parallel for default(shared) private(k)
02062     for (k = 0; k < M; k++)
02063     {
02064       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02065       const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
02066       R fg_psij2 = K(1.0);
02067       R psij_const[m2p2];
02068       int l;
02069 
02070       psij_const[0] = fg_psij0;
02071 
02072       for (l = 1; l < m2p2; l++)
02073       {
02074         fg_psij2 *= fg_psij1;
02075         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02076       }
02077 
02078       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02079     }
02080 
02081     return;
02082   } /* if(PRE_FG_PSI) */
02083 
02084   if (ths->nfft_flags & FG_PSI)
02085   {
02086     int k;
02087     R fg_exp_l[m2p2];
02088 
02089     nfft_sort_nodes(ths);
02090 
02091     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02092 
02093     #pragma omp parallel for default(shared) private(k)
02094     for (k = 0; k < M; k++)
02095     {
02096       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02097       int u, o, l;
02098       R fg_psij0, fg_psij1, fg_psij2;
02099       R psij_const[m2p2];
02100 
02101       nfft_uo(ths, (int)j, &u, &o, 0);
02102       fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0));
02103       fg_psij1 = EXP(K(2.0) * (n * ths->x[j] - u) / ths->b[0]);
02104       fg_psij2  = K(1.0);
02105 
02106       psij_const[0] = fg_psij0;
02107 
02108       for (l = 1; l < m2p2; l++)
02109       {
02110         fg_psij2 *= fg_psij1;
02111         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02112       }
02113 
02114       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02115     }
02116     return;
02117   } /* if(FG_PSI) */
02118 
02119   if (ths->nfft_flags & PRE_LIN_PSI)
02120   {
02121     const int K = ths->K, ip_s = K / (m + 2);
02122     int k;
02123 
02124     nfft_sort_nodes(ths);
02125 
02126     #pragma omp parallel for default(shared) private(k)
02127     for (k = 0; k < M; k++)
02128     {
02129       int u, o, l;
02130       R ip_y, ip_w;
02131       int ip_u;
02132       R psij_const[m2p2];
02133       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02134 
02135       nfft_uo(ths, (int)j, &u, &o, 0);
02136 
02137       ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s);
02138       ip_u = LRINT(FLOOR(ip_y));
02139       ip_w = ip_y - ip_u;
02140 
02141       for (l = 0; l < m2p2; l++)
02142         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
02143           + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
02144 
02145       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02146     }
02147     return;
02148   } /* if(PRE_LIN_PSI) */
02149   else
02150   {
02151     /* no precomputed psi at all */
02152     int k;
02153 
02154     nfft_sort_nodes(ths);
02155 
02156     #pragma omp parallel for default(shared) private(k)
02157     for (k = 0; k < M; k++)
02158     {
02159       R psij_const[m2p2];
02160       int u, o, l;
02161       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02162 
02163       nfft_uo(ths, (int)j, &u, &o, 0);
02164 
02165       for (l = 0; l < m2p2; l++)
02166         psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0));
02167 
02168       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02169     }
02170   }
02171 }
02172 
02173 
02174 #ifdef OMP_ASSERT
02175 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A                        \
02176 {                                                                             \
02177           assert(ar_x[2*k] >= min_u_a || k == M-1);                           \
02178           if (k > 0)                                                          \
02179             assert(ar_x[2*k-2] < min_u_a);                                    \
02180 }
02181 #else
02182 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
02183 #endif
02184 
02185 #ifdef OMP_ASSERT
02186 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B                        \
02187 {                                                                             \
02188           assert(ar_x[2*k] >= min_u_b || k == M-1);                           \
02189           if (k > 0)                                                          \
02190             assert(ar_x[2*k-2] < min_u_b);                                    \
02191 }
02192 #else
02193 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
02194 #endif
02195 
02196 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI                 \
02197 {                                                                             \
02198             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g,               \
02199                 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0);  \
02200 }
02201 
02202 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI              \
02203 {                                                                             \
02204             R psij_const[2 * m + 2];                                          \
02205             int u, o, l;                                                      \
02206             R fg_psij0 = ths->psi[2 * j];                                     \
02207             R fg_psij1 = ths->psi[2 * j + 1];                                 \
02208             R fg_psij2 = K(1.0);                                              \
02209                                                                               \
02210             psij_const[0] = fg_psij0;                                         \
02211             for (l = 1; l <= 2 * m + 1; l++)                                  \
02212             {                                                                 \
02213               fg_psij2 *= fg_psij1;                                           \
02214               psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];              \
02215             }                                                                 \
02216                                                                               \
02217             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const,   \
02218                 ths->x + j, n, m, my_u0, my_o0);                              \
02219 }
02220 
02221 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI                  \
02222 {                                                                             \
02223             R psij_const[2 * m + 2];                                          \
02224             R fg_psij0, fg_psij1, fg_psij2;                                   \
02225             int u, o, l;                                                      \
02226                                                                               \
02227             nfft_uo(ths, j, &u, &o, 0);                                       \
02228             fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0));                           \
02229             fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]);       \
02230             fg_psij2 = K(1.0);                                                \
02231             psij_const[0] = fg_psij0;                                         \
02232             for (l = 1; l <= 2 * m + 1; l++)                                  \
02233             {                                                                 \
02234               fg_psij2 *= fg_psij1;                                           \
02235               psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];              \
02236             }                                                                 \
02237                                                                               \
02238             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const,   \
02239                 ths->x + j, n, m, my_u0, my_o0);                              \
02240 }
02241 
02242 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI             \
02243 {                                                                             \
02244             R psij_const[2 * m + 2];                                          \
02245             int ip_u;                                                         \
02246             R ip_y, ip_w;                                                     \
02247             int u, o, l;                                                      \
02248                                                                               \
02249             nfft_uo(ths, j, &u, &o, 0);                                       \
02250                                                                               \
02251             ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s);                       \
02252             ip_u = LRINT(FLOOR(ip_y));                                        \
02253             ip_w = ip_y - ip_u;                                               \
02254             for (l = 0; l < 2 * m + 2; l++)                                   \
02255               psij_const[l]                                                   \
02256                   = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)              \
02257                       + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);                \
02258                                                                               \
02259             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const,   \
02260                 ths->x + j, n, m, my_u0, my_o0);                              \
02261 }
02262 
02263 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI                  \
02264 {                                                                             \
02265             R psij_const[2 * m + 2];                                          \
02266             int u, o, l;                                                      \
02267                                                                               \
02268             nfft_uo(ths, j, &u, &o, 0);                                       \
02269                                                                               \
02270             for (l = 0; l <= 2 * m + 1; l++)                                  \
02271               psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0));              \
02272                                                                               \
02273             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const,   \
02274                 ths->x + j, n, m, my_u0, my_o0);                              \
02275 }
02276 
02277 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(whichone)                       \
02278 {                                                                             \
02279     if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)                         \
02280     {                                                                         \
02281       _Pragma("omp parallel private(k)")                                      \
02282       {                                                                       \
02283         int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;                 \
02284         int *ar_x = ths->index_x;                                             \
02285                                                                               \
02286         nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
02287                                       &min_u_b, &max_u_b, 1, &n, m);      \
02288                                                                               \
02289         if (min_u_a != -1)                                                    \
02290         {                                                                     \
02291           k = index_x_binary_search(ar_x, M, min_u_a);                        \
02292                                                                               \
02293           MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A                      \
02294                                                                               \
02295           while (k < M)                                                       \
02296           {                                                                   \
02297             int u_prod = ar_x[2*k];                                           \
02298             int j = ar_x[2*k+1];                                              \
02299                                                                               \
02300             if (u_prod < min_u_a || u_prod > max_u_a)                         \
02301               break;                                                          \
02302                                                                               \
02303             MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone         \
02304                                                                               \
02305             k++;                                                              \
02306           }                                                                   \
02307         }                                                                     \
02308                                                                               \
02309         if (min_u_b != -1)                                                    \
02310         {                                                                     \
02311           k = index_x_binary_search(ar_x, M, min_u_b);                        \
02312                                                                               \
02313           MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B                      \
02314                                                                               \
02315           while (k < M)                                                       \
02316           {                                                                   \
02317             int u_prod = ar_x[2*k];                                           \
02318             int j = ar_x[2*k+1];                                              \
02319                                                                               \
02320             if (u_prod < min_u_b || u_prod > max_u_b)                         \
02321               break;                                                          \
02322                                                                               \
02323             MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone         \
02324                                                                               \
02325             k++;                                                              \
02326           }                                                                   \
02327         }                                                                     \
02328       } /* omp parallel */                                                    \
02329       return;                                                                 \
02330     } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */                                    \
02331 }
02332 
02333 static void nfft_adjoint_1d_B(nfft_plan *ths)
02334 {
02335   const int n = ths->n[0], M = ths->M_total, m = ths->m;
02336   int k;
02337   C *g = (C*)ths->g;
02338 
02339   memset(g,0,ths->n_total*sizeof(C));
02340 
02341   if (ths->nfft_flags & PRE_FULL_PSI)
02342   {
02343     nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
02344         1, ths->n, m, ths->nfft_flags, ths->index_x);
02345     return;
02346   } /* if(PRE_FULL_PSI) */
02347 
02348   if (ths->nfft_flags & PRE_PSI)
02349   {
02350 #ifdef _OPENMP
02351     MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
02352 #endif
02353 
02354     #pragma omp parallel for default(shared) private(k)
02355     for (k = 0; k < M; k++)
02356     {
02357       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02358 #ifdef _OPENMP
02359       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
02360 #else
02361       nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
02362 #endif
02363     }
02364 
02365     return;
02366   } /* if(PRE_PSI) */
02367 
02368   if (ths->nfft_flags & PRE_FG_PSI)
02369   {
02370     R fg_exp_l[2 * m + 2];
02371 
02372     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02373 
02374 #ifdef _OPENMP
02375     MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
02376 #endif
02377 
02378 
02379     #pragma omp parallel for default(shared) private(k)
02380     for (k = 0; k < M; k++)
02381     {
02382       R psij_const[2 * m + 2];
02383       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02384       int l;
02385       R fg_psij0 = ths->psi[2 * j];
02386       R fg_psij1 = ths->psi[2 * j + 1];
02387       R fg_psij2 = K(1.0);
02388 
02389       psij_const[0] = fg_psij0;
02390       for (l = 1; l <= 2 * m + 1; l++)
02391       {
02392         fg_psij2 *= fg_psij1;
02393         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02394       }
02395 
02396 #ifdef _OPENMP
02397       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02398 #else
02399       nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02400 #endif
02401     }
02402 
02403     return;
02404   } /* if(PRE_FG_PSI) */
02405 
02406   if (ths->nfft_flags & FG_PSI)
02407   {
02408     R fg_exp_l[2 * m + 2];
02409 
02410     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02411 
02412     nfft_sort_nodes(ths);
02413 
02414 #ifdef _OPENMP
02415     MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
02416 #endif
02417 
02418     #pragma omp parallel for default(shared) private(k)
02419     for (k = 0; k < M; k++)
02420     {
02421       int u,o,l;
02422       R psij_const[2 * m + 2];
02423       R fg_psij0, fg_psij1, fg_psij2;
02424       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02425 
02426       nfft_uo(ths, j, &u, &o, 0);
02427       fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0));
02428       fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]);
02429       fg_psij2 = K(1.0);
02430       psij_const[0] = fg_psij0;
02431       for (l = 1; l <= 2 * m + 1; l++)
02432       {
02433         fg_psij2 *= fg_psij1;
02434         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02435       }
02436 
02437 #ifdef _OPENMP
02438       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02439 #else
02440       nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02441 #endif
02442     }
02443 
02444     return;
02445   } /* if(FG_PSI) */
02446 
02447   if (ths->nfft_flags & PRE_LIN_PSI)
02448   {
02449     const int K = ths->K;
02450     const int ip_s = K / (m + 2);
02451 
02452     nfft_sort_nodes(ths);
02453 
02454 #ifdef _OPENMP
02455     MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
02456 #endif
02457 
02458     #pragma openmp parallel for default(shared) private(k)
02459     for (k = 0; k < M; k++)
02460     {
02461       int u,o,l;
02462       int ip_u;
02463       R ip_y, ip_w;
02464       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02465       R psij_const[2 * m + 2];
02466 
02467       nfft_uo(ths, j, &u, &o, 0);
02468 
02469       ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s);
02470       ip_u = LRINT(FLOOR(ip_y));
02471       ip_w = ip_y - ip_u;
02472       for (l = 0; l < 2 * m + 2; l++)
02473         psij_const[l]
02474             = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
02475                 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
02476 
02477 #ifdef _OPENMP
02478       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02479 #else
02480       nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02481 #endif
02482     }
02483     return;
02484   } /* if(PRE_LIN_PSI) */
02485 
02486   /* no precomputed psi at all */
02487   nfft_sort_nodes(ths);
02488 
02489 #ifdef _OPENMP
02490   MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
02491 #endif
02492 
02493   #pragma omp parallel for default(shared) private(k)
02494   for (k = 0; k < M; k++)
02495   {
02496     int u,o,l;
02497     R psij_const[2 * m + 2];
02498     int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02499 
02500     nfft_uo(ths, j, &u, &o, 0);
02501 
02502     for (l = 0; l <= 2 * m + 1; l++)
02503       psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0));
02504 
02505 #ifdef _OPENMP
02506     nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02507 #else
02508     nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02509 #endif
02510   }
02511 }
02512 
02513 void nfft_trafo_1d(nfft_plan *ths)
02514 {
02515   const int N = ths->N[0], N2 = N/2, n = ths->n[0];
02516   C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
02517 
02518   ths->g_hat = ths->g1;
02519   ths->g = ths->g2;
02520 
02521   {
02522     C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
02523     R *c_phi_inv1, *c_phi_inv2;
02524 
02525     TIC(0)
02526 #ifdef _OPENMP
02527     {
02528       int k;
02529       #pragma omp parallel for default(shared) private(k)
02530       for (k = 0; k < ths->n_total; k++)
02531         ths->g_hat[k] = 0.0;
02532     }
02533 #else
02534     memset(ths->g_hat, 0, ths->n_total*sizeof(C));
02535 #endif
02536     if(ths->nfft_flags & PRE_PHI_HUT)
02537     {
02538       int k;
02539       c_phi_inv1 = ths->c_phi_inv[0];
02540       c_phi_inv2 = &ths->c_phi_inv[0][N2];
02541 
02542       #pragma omp parallel for default(shared) private(k)
02543       for (k = 0; k < N2; k++)
02544       {
02545         g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
02546         g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
02547       }
02548     }
02549     else
02550     {
02551       int k;
02552       #pragma omp parallel for default(shared) private(k)
02553       for (k = 0; k < N2; k++)
02554       {
02555         g_hat1[k] = f_hat1[k] / (PHI_HUT(k-N2,0));
02556         g_hat2[k] = f_hat2[k] / (PHI_HUT(k,0));
02557       }
02558     }
02559     TOC(0)
02560 
02561     TIC_FFTW(1)
02562     fftw_execute(ths->my_fftw_plan1);
02563     TOC_FFTW(1);
02564 
02565     TIC(2);
02566     nfft_trafo_1d_B(ths);
02567     TOC(2);
02568   }
02569 }
02570 
02571 void nfft_adjoint_1d(nfft_plan *ths)
02572 {
02573   int n,N;
02574   C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
02575   R *c_phi_inv1, *c_phi_inv2;
02576 
02577   N=ths->N[0];
02578   n=ths->n[0];
02579 
02580   ths->g_hat=ths->g1;
02581   ths->g=ths->g2;
02582 
02583   f_hat1=(C*)ths->f_hat;
02584   f_hat2=(C*)&ths->f_hat[N/2];
02585   g_hat1=(C*)&ths->g_hat[n-N/2];
02586   g_hat2=(C*)ths->g_hat;
02587 
02588   TIC(2)
02589   nfft_adjoint_1d_B(ths);
02590   TOC(2)
02591 
02592   TIC_FFTW(1)
02593   fftw_execute(ths->my_fftw_plan2);
02594   TOC_FFTW(1);
02595 
02596   TIC(0)
02597   if(ths->nfft_flags & PRE_PHI_HUT)
02598   {
02599     int k;
02600     c_phi_inv1=ths->c_phi_inv[0];
02601     c_phi_inv2=&ths->c_phi_inv[0][N/2];
02602 
02603     #pragma omp parallel for default(shared) private(k)
02604     for (k = 0; k < N/2; k++)
02605     {
02606       f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
02607       f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
02608     }
02609   }
02610   else
02611   {
02612     int k;
02613 
02614     #pragma omp parallel for default(shared) private(k)
02615     for (k = 0; k < N/2; k++)
02616     {
02617       f_hat1[k] = g_hat1[k] / (PHI_HUT(k-N/2,0));
02618       f_hat2[k] = g_hat2[k] / (PHI_HUT(k,0));
02619     }
02620   }
02621   TOC(0)
02622 }
02623 
02624 
02625 /* ############################################################ SPECIFIC VERSIONS FOR d=2 */
02626 
02627 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
02628 {
02629   int l;
02630   R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
02631 
02632   fg_exp_b0 = EXP(K(-1.0)/b);
02633   fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
02634   fg_exp_b1 = K(1.0);
02635   fg_exp_b2 = K(1.0);
02636   fg_exp_l[0] = K(1.0);
02637   for(l=1; l <= 2*m+1; l++)
02638     {
02639       fg_exp_b2 = fg_exp_b1*fg_exp_b0;
02640       fg_exp_b1 *= fg_exp_b0_sq;
02641       fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
02642     }
02643 }
02644 
02645 static void nfft_trafo_2d_compute(C *fj, const C *g,
02646           const R *psij_const0, const R *psij_const1,
02647           const R *xj0, const R *xj1,
02648           const int n0, const int n1, const int m)
02649 {
02650   int u0,o0,l0,u1,o1,l1;
02651   const C *gj;
02652   const R *psij0,*psij1;
02653 
02654   psij0=psij_const0;
02655   psij1=psij_const1;
02656 
02657   nfft_uo2(&u0,&o0,*xj0, n0, m);
02658   nfft_uo2(&u1,&o1,*xj1, n1, m);
02659 
02660   *fj=0;
02661 
02662   if(u0<o0)
02663       if(u1<o1)
02664     for(l0=0; l0<=2*m+1; l0++,psij0++)
02665     {
02666         psij1=psij_const1;
02667         gj=g+(u0+l0)*n1+u1;
02668         for(l1=0; l1<=2*m+1; l1++)
02669       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02670     }
02671       else
02672     for(l0=0; l0<=2*m+1; l0++,psij0++)
02673     {
02674         psij1=psij_const1;
02675         gj=g+(u0+l0)*n1+u1;
02676         for(l1=0; l1<2*m+1-o1; l1++)
02677       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02678         gj=g+(u0+l0)*n1;
02679         for(l1=0; l1<=o1; l1++)
02680       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02681     }
02682   else
02683       if(u1<o1)
02684       {
02685     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02686     {
02687         psij1=psij_const1;
02688         gj=g+(u0+l0)*n1+u1;
02689         for(l1=0; l1<=2*m+1; l1++)
02690       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02691     }
02692     for(l0=0; l0<=o0; l0++,psij0++)
02693     {
02694         psij1=psij_const1;
02695         gj=g+l0*n1+u1;
02696         for(l1=0; l1<=2*m+1; l1++)
02697       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02698     }
02699       }
02700       else
02701       {
02702     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02703     {
02704         psij1=psij_const1;
02705         gj=g+(u0+l0)*n1+u1;
02706         for(l1=0; l1<2*m+1-o1; l1++)
02707       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02708         gj=g+(u0+l0)*n1;
02709         for(l1=0; l1<=o1; l1++)
02710       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02711     }
02712     for(l0=0; l0<=o0; l0++,psij0++)
02713     {
02714         psij1=psij_const1;
02715         gj=g+l0*n1+u1;
02716         for(l1=0; l1<2*m+1-o1; l1++)
02717       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02718         gj=g+l0*n1;
02719         for(l1=0; l1<=o1; l1++)
02720       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02721     }
02722       }
02723 }
02724 
02725 #ifdef _OPENMP
02726 /* adjoint NFFT two-dimensional case with OpenMP atomic operations */
02727 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
02728             const R *psij_const0, const R *psij_const1,
02729             const R *xj0, const R *xj1,
02730             const int n0, const int n1, const int m)
02731 {
02732   int u0,o0,l0,u1,o1,l1;
02733   const int lprod = (2*m+2) * (2*m+2); 
02734 
02735   unsigned long int index_temp0[2*m+2];
02736   unsigned long int index_temp1[2*m+2];
02737 
02738   nfft_uo2(&u0,&o0,*xj0, n0, m);
02739   nfft_uo2(&u1,&o1,*xj1, n1, m);
02740 
02741   for (l0=0; l0<=2*m+1; l0++)
02742     index_temp0[l0] = (u0+l0)%n0;
02743 
02744   for (l1=0; l1<=2*m+1; l1++)
02745     index_temp1[l1] = (u1+l1)%n1;
02746 
02747   for(l0=0; l0<=2*m+1; l0++)
02748   {
02749     for(l1=0; l1<=2*m+1; l1++)
02750     {
02751       unsigned long int i = index_temp0[l0] * n1 + index_temp1[l1];
02752       C *lhs = g+i;
02753       R *lhs_real = (R*)lhs;
02754       C val = psij_const0[l0] * psij_const1[l1] * f;
02755 
02756       #pragma omp atomic
02757       lhs_real[0] += creal(val);
02758 
02759       #pragma omp atomic
02760       lhs_real[1] += cimag(val);
02761     }
02762   }
02763 }
02764 #endif
02765 
02766 #ifdef _OPENMP
02767 
02785 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
02786             const R *psij_const0, const R *psij_const1,
02787             const R *xj0, const R *xj1,
02788             const int n0, const int n1, const int m,
02789       const int my_u0, const int my_o0)
02790 {
02791   int ar_u0,ar_o0,l0,u1,o1,l1;
02792   const int lprod = (2*m+2) * (2*m+2); 
02793   unsigned long int index_temp1[2*m+2];
02794 
02795   nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
02796   nfft_uo2(&u1,&o1,*xj1, n1, m);
02797 
02798   for (l1=0; l1<=2*m+1; l1++)
02799     index_temp1[l1] = (u1+l1)%n1;
02800 
02801   if(ar_u0<ar_o0)
02802   {
02803     int u0 = MAX(my_u0,ar_u0);
02804     int o0 = MIN(my_o0,ar_o0);
02805     int offset_psij = u0-ar_u0;
02806 #ifdef OMP_ASSERT
02807     assert(offset_psij >= 0);
02808     assert(o0-u0 <= 2*m+1);
02809     assert(offset_psij+o0-u0 <= 2*m+1);
02810 #endif
02811 
02812     for (l0 = 0; l0 <= o0-u0; l0++)
02813     {
02814       unsigned long int i0 = (u0+l0) * n1;
02815       const C val0 = psij_const0[offset_psij+l0];
02816 
02817       for(l1=0; l1<=2*m+1; l1++)
02818         g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02819     }
02820   }
02821   else
02822   {
02823     int u0 = MAX(my_u0,ar_u0);
02824     int o0 = my_o0;
02825     int offset_psij = u0-ar_u0;
02826 #ifdef OMP_ASSERT
02827     assert(offset_psij >= 0);
02828     assert(o0-u0 <= 2*m+1);
02829     assert(offset_psij+o0-u0 <= 2*m+1);
02830 #endif
02831 
02832     for (l0 = 0; l0 <= o0-u0; l0++)
02833     {
02834       unsigned long int i0 = (u0+l0) * n1;
02835       const C val0 = psij_const0[offset_psij+l0];
02836 
02837       for(l1=0; l1<=2*m+1; l1++)
02838         g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02839     }
02840 
02841     u0 = my_u0;
02842     o0 = MIN(my_o0,ar_o0);
02843     offset_psij += my_u0-ar_u0+n0;
02844 
02845 #ifdef OMP_ASSERT
02846     if (u0<=o0)
02847     {
02848       assert(o0-u0 <= 2*m+1);
02849       assert(offset_psij+o0-u0 <= 2*m+1);
02850     }
02851 #endif
02852 
02853     for (l0 = 0; l0 <= o0-u0; l0++)
02854     {
02855       unsigned long int i0 = (u0+l0) * n1;
02856       const C val0 = psij_const0[offset_psij+l0];
02857 
02858       for(l1=0; l1<=2*m+1; l1++)
02859         g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02860     }
02861   }
02862 }
02863 #endif
02864 
02865 #ifndef _OPENMP
02866 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
02867             const R *psij_const0, const R *psij_const1,
02868             const R *xj0, const R *xj1,
02869             const int n0, const int n1, const int m)
02870 {
02871   int u0,o0,l0,u1,o1,l1;
02872   C *gj;
02873   const R *psij0,*psij1;
02874 
02875   psij0=psij_const0;
02876   psij1=psij_const1;
02877 
02878   nfft_uo2(&u0,&o0,*xj0, n0, m);
02879   nfft_uo2(&u1,&o1,*xj1, n1, m);
02880 
02881   if(u0<o0)
02882       if(u1<o1)
02883     for(l0=0; l0<=2*m+1; l0++,psij0++)
02884     {
02885         psij1=psij_const1;
02886         gj=g+(u0+l0)*n1+u1;
02887         for(l1=0; l1<=2*m+1; l1++)
02888     (*gj++) += (*psij0) * (*psij1++) * (*fj);
02889     }
02890       else
02891     for(l0=0; l0<=2*m+1; l0++,psij0++)
02892     {
02893         psij1=psij_const1;
02894         gj=g+(u0+l0)*n1+u1;
02895         for(l1=0; l1<2*m+1-o1; l1++)
02896       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02897         gj=g+(u0+l0)*n1;
02898         for(l1=0; l1<=o1; l1++)
02899       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02900     }
02901   else
02902       if(u1<o1)
02903       {
02904     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02905     {
02906         psij1=psij_const1;
02907         gj=g+(u0+l0)*n1+u1;
02908         for(l1=0; l1<=2*m+1; l1++)
02909       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02910     }
02911     for(l0=0; l0<=o0; l0++,psij0++)
02912     {
02913         psij1=psij_const1;
02914         gj=g+l0*n1+u1;
02915         for(l1=0; l1<=2*m+1; l1++)
02916       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02917     }
02918       }
02919       else
02920       {
02921     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02922     {
02923         psij1=psij_const1;
02924         gj=g+(u0+l0)*n1+u1;
02925         for(l1=0; l1<2*m+1-o1; l1++)
02926       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02927         gj=g+(u0+l0)*n1;
02928         for(l1=0; l1<=o1; l1++)
02929       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02930     }
02931     for(l0=0; l0<=o0; l0++,psij0++)
02932     {
02933         psij1=psij_const1;
02934         gj=g+l0*n1+u1;
02935         for(l1=0; l1<2*m+1-o1; l1++)
02936       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02937         gj=g+l0*n1;
02938         for(l1=0; l1<=o1; l1++)
02939       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02940     }
02941       }
02942 }
02943 #endif
02944 
02945 static void nfft_trafo_2d_B(nfft_plan *ths)
02946 {
02947   const C *g = (C*)ths->g;
02948   const int N0 = ths->N[0];
02949   const int n0 = ths->n[0];
02950   const int N1 = ths->N[1];
02951   const int n1 = ths->n[1];
02952   const int M = ths->M_total;
02953   const int m = ths->m;
02954 
02955   int k;
02956 
02957   if(ths->nfft_flags & PRE_FULL_PSI)
02958   {
02959     const int lprod = (2*m+2) * (2*m+2);
02960     #pragma omp parallel for default(shared) private(k)
02961     for (k = 0; k < M; k++)
02962     {
02963       int l;
02964       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02965       ths->f[j] = K(0.0);
02966       for (l = 0; l < lprod; l++)
02967         ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
02968     }
02969     return;
02970   } /* if(PRE_FULL_PSI) */
02971 
02972   if(ths->nfft_flags & PRE_PSI)
02973   {
02974     #pragma omp parallel for default(shared) private(k)
02975     for (k = 0; k < M; k++)
02976     {
02977       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02978       nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
02979     }
02980 
02981       return;
02982   } /* if(PRE_PSI) */
02983 
02984   if(ths->nfft_flags & PRE_FG_PSI)
02985   {
02986     R fg_exp_l[2*(2*m+2)];
02987 
02988     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02989     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
02990 
02991     #pragma omp parallel for default(shared) private(k)
02992     for (k = 0; k < M; k++)
02993     {
02994       R psij_const[2*(2*m+2)];
02995       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02996       int l;
02997       R fg_psij0 = ths->psi[2*j*2];
02998       R fg_psij1 = ths->psi[2*j*2+1];
02999       R fg_psij2 = K(1.0);
03000 
03001       psij_const[0] = fg_psij0;
03002       for(l=1; l<=2*m+1; l++)
03003       {
03004         fg_psij2 *= fg_psij1;
03005         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03006       }
03007 
03008       fg_psij0 = ths->psi[2*(j*2+1)];
03009       fg_psij1 = ths->psi[2*(j*2+1)+1];
03010       fg_psij2 = K(1.0);
03011       psij_const[2*m+2] = fg_psij0;
03012       for(l=1; l<=2*m+1; l++)
03013       {
03014         fg_psij2 *= fg_psij1;
03015         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03016       }
03017 
03018       nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03019     }
03020 
03021     return;
03022   } /* if(PRE_FG_PSI) */
03023 
03024   if(ths->nfft_flags & FG_PSI)
03025   {
03026     R fg_exp_l[2*(2*m+2)];
03027 
03028     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03029     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03030 
03031     nfft_sort_nodes(ths);
03032 
03033     #pragma omp parallel for default(shared) private(k)
03034     for (k = 0; k < M; k++)
03035     {
03036       int u, o, l;
03037       R fg_psij0, fg_psij1, fg_psij2;
03038       R psij_const[2*(2*m+2)];
03039       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03040 
03041       nfft_uo(ths,j,&u,&o,0);
03042       fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0));
03043       fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]);
03044       fg_psij2 = K(1.0);
03045       psij_const[0] = fg_psij0;
03046       for(l=1; l<=2*m+1; l++)
03047       {
03048         fg_psij2 *= fg_psij1;
03049         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03050       }
03051 
03052       nfft_uo(ths,j,&u,&o,1);
03053       fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1));
03054       fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]);
03055       fg_psij2 = K(1.0);
03056       psij_const[2*m+2] = fg_psij0;
03057       for(l=1; l<=2*m+1; l++)
03058       {
03059         fg_psij2 *= fg_psij1;
03060         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03061       }
03062 
03063       nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03064     }
03065 
03066     return;
03067   } /* if(FG_PSI) */
03068 
03069   if(ths->nfft_flags & PRE_LIN_PSI)
03070   {
03071     const int K = ths->K, ip_s = K / (m + 2);
03072 
03073     nfft_sort_nodes(ths);
03074 
03075     #pragma omp parallel for default(shared) private(k)
03076     for (k = 0; k < M; k++)
03077     {
03078       int u, o, l;
03079       R ip_y, ip_w;
03080       int ip_u;
03081       R psij_const[2*(2*m+2)];
03082       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03083 
03084       nfft_uo(ths,j,&u,&o,0);
03085       ip_y = FABS(n0*ths->x[2*j] - u)*((R)ip_s);
03086       ip_u = LRINT(FLOOR(ip_y));
03087       ip_w = ip_y-ip_u;
03088       for(l=0; l < 2*m+2; l++)
03089         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
03090 
03091       nfft_uo(ths,j,&u,&o,1);
03092       ip_y = FABS(n1*ths->x[2*j+1] - u)*((R)ip_s);
03093       ip_u = LRINT(FLOOR(ip_y));
03094       ip_w = ip_y-ip_u;
03095       for(l=0; l < 2*m+2; l++)
03096         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
03097 
03098       nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03099     }
03100       return;
03101   } /* if(PRE_LIN_PSI) */
03102 
03103   /* no precomputed psi at all */
03104 
03105   nfft_sort_nodes(ths);
03106 
03107   #pragma omp parallel for default(shared) private(k)
03108   for (k = 0; k < M; k++)
03109   {
03110     R psij_const[2*(2*m+2)];
03111     int u, o, l;
03112     int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03113 
03114     nfft_uo(ths,j,&u,&o,0);
03115     for(l=0;l<=2*m+1;l++)
03116       psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0));
03117 
03118     nfft_uo(ths,j,&u,&o,1);
03119     for(l=0;l<=2*m+1;l++)
03120       psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1));
03121 
03122     nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03123   }
03124 }
03125 
03126 #ifdef OMP_ASSERT
03127 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A                        \
03128 {                                                                             \
03129           assert(ar_x[2*k] >= min_u_a || k == M-1);                           \
03130           if (k > 0)                                                          \
03131             assert(ar_x[2*k-2] < min_u_a);                                    \
03132 }
03133 #else
03134 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
03135 #endif
03136 
03137 #ifdef OMP_ASSERT
03138 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B                        \
03139 {                                                                             \
03140           assert(ar_x[2*k] >= min_u_b || k == M-1);                           \
03141           if (k > 0)                                                          \
03142             assert(ar_x[2*k-2] < min_u_b);                                    \
03143 }
03144 #else
03145 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
03146 #endif
03147 
03148 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI                 \
03149             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g,               \
03150                 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2),               \
03151                 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
03152 
03153 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI              \
03154 {                                                                             \
03155             R psij_const[2*(2*m+2)];                                          \
03156             int u, o, l;                                                      \
03157             R fg_psij0 = ths->psi[2*j*2];                                     \
03158             R fg_psij1 = ths->psi[2*j*2+1];                                   \
03159             R fg_psij2 = K(1.0);                                              \
03160                                                                               \
03161             psij_const[0] = fg_psij0;                                         \
03162             for(l=1; l<=2*m+1; l++)                                           \
03163             {                                                                 \
03164               fg_psij2 *= fg_psij1;                                           \
03165               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];                  \
03166             }                                                                 \
03167                                                                               \
03168             fg_psij0 = ths->psi[2*(j*2+1)];                                   \
03169             fg_psij1 = ths->psi[2*(j*2+1)+1];                                 \
03170             fg_psij2 = K(1.0);                                                \
03171             psij_const[2*m+2] = fg_psij0;                                     \
03172             for(l=1; l<=2*m+1; l++)                                           \
03173             {                                                                 \
03174               fg_psij2 *= fg_psij1;                                           \
03175               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];      \
03176             }                                                                 \
03177                                                                               \
03178             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g,               \
03179                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1,       \
03180                 n0, n1, m, my_u0, my_o0);                                     \
03181 }
03182 
03183 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI                  \
03184 {                                                                             \
03185             R psij_const[2*(2*m+2)];                                          \
03186             R fg_psij0, fg_psij1, fg_psij2;                                   \
03187             int u, o, l;                                                      \
03188                                                                               \
03189             nfft_uo(ths,j,&u,&o,0);                                           \
03190             fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0));                        \
03191             fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]);          \
03192             fg_psij2 = K(1.0);                                                \
03193             psij_const[0] = fg_psij0;                                         \
03194             for(l=1; l<=2*m+1; l++)                                           \
03195             {                                                                 \
03196               fg_psij2 *= fg_psij1;                                           \
03197               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];                  \
03198             }                                                                 \
03199                                                                               \
03200             nfft_uo(ths,j,&u,&o,1);                                           \
03201             fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1));                      \
03202             fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]);        \
03203             fg_psij2 = K(1.0);                                                \
03204             psij_const[2*m+2] = fg_psij0;                                     \
03205             for(l=1; l<=2*m+1; l++)                                           \
03206             {                                                                 \
03207               fg_psij2 *= fg_psij1;                                           \
03208               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];      \
03209             }                                                                 \
03210                                                                               \
03211             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g,               \
03212                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1,       \
03213                 n0, n1, m, my_u0, my_o0);                                     \
03214 }
03215 
03216 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI             \
03217 {                                                                             \
03218             R psij_const[2*(2*m+2)];                                          \
03219             int u, o, l;                                                      \
03220             int ip_u;                                                         \
03221             R ip_y, ip_w;                                                     \
03222                                                                               \
03223             nfft_uo(ths,j,&u,&o,0);                                           \
03224             ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s);                      \
03225             ip_u = LRINT(FLOOR(ip_y));                                        \
03226             ip_w = ip_y-ip_u;                                                 \
03227             for(l=0; l < 2*m+2; l++)                                          \
03228               psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +      \
03229                 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);                          \
03230                                                                               \
03231             nfft_uo(ths,j,&u,&o,1);                                           \
03232             ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s);                    \
03233             ip_u = LRINT(FLOOR(ip_y));                                        \
03234             ip_w = ip_y-ip_u;                                                 \
03235             for(l=0; l < 2*m+2; l++)                                          \
03236               psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +  \
03237                 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);                    \
03238                                                                               \
03239             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g,               \
03240                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1,       \
03241                 n0, n1, m, my_u0, my_o0);                                     \
03242 }
03243 
03244 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI                  \
03245 {                                                                             \
03246             R psij_const[2*(2*m+2)];                                          \
03247             int u, o, l;                                                      \
03248                                                                               \
03249             nfft_uo(ths,j,&u,&o,0);                                           \
03250             for(l=0;l<=2*m+1;l++)                                             \
03251               psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0));             \
03252                                                                               \
03253             nfft_uo(ths,j,&u,&o,1);                                           \
03254             for(l=0;l<=2*m+1;l++)                                             \
03255               psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1));     \
03256                                                                               \
03257             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g,               \
03258                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1,       \
03259                 n0, n1, m, my_u0, my_o0);                                     \
03260 }
03261 
03262 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(whichone)                       \
03263 {                                                                             \
03264     if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)                         \
03265     {                                                                         \
03266       _Pragma("omp parallel private(k)")                                      \
03267       {                                                                       \
03268         int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;                 \
03269         int *ar_x = ths->index_x;                                             \
03270                                                                               \
03271         nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
03272             &min_u_b, &max_u_b, 2, ths->n, m);                                \
03273                                                                               \
03274         if (min_u_a != -1)                                                    \
03275         {                                                                     \
03276           k = index_x_binary_search(ar_x, M, min_u_a);                        \
03277                                                                               \
03278           MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A                      \
03279                                                                               \
03280           while (k < M)                                                       \
03281           {                                                                   \
03282             int u_prod = ar_x[2*k];                                           \
03283             int j = ar_x[2*k+1];                                              \
03284                                                                               \
03285             if (u_prod < min_u_a || u_prod > max_u_a)                         \
03286               break;                                                          \
03287                                                                               \
03288             MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone         \
03289                                                                               \
03290             k++;                                                              \
03291           }                                                                   \
03292         }                                                                     \
03293                                                                               \
03294         if (min_u_b != -1)                                                    \
03295         {                                                                     \
03296           int k = index_x_binary_search(ar_x, M, min_u_b);                    \
03297                                                                               \
03298           MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B                      \
03299                                                                               \
03300           while (k < M)                                                       \
03301           {                                                                   \
03302             int u_prod = ar_x[2*k];                                           \
03303             int j = ar_x[2*k+1];                                              \
03304                                                                               \
03305             if (u_prod < min_u_b || u_prod > max_u_b)                         \
03306               break;                                                          \
03307                                                                               \
03308             MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone         \
03309                                                                               \
03310             k++;                                                              \
03311           }                                                                   \
03312         }                                                                     \
03313       } /* omp parallel */                                                    \
03314       return;                                                                 \
03315     } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */                                    \
03316 }
03317 
03318 
03319 static void nfft_adjoint_2d_B(nfft_plan *ths)
03320 {
03321   const int N0 = ths->N[0];
03322   const int n0 = ths->n[0];
03323   const int N1 = ths->N[1];
03324   const int n1 = ths->n[1];
03325   const int M = ths->M_total;
03326   const int m = ths->m;
03327   C* g = (C*) ths->g;
03328   int k;
03329 
03330   memset(g,0,ths->n_total*sizeof(C));
03331 
03332   if(ths->nfft_flags & PRE_FULL_PSI)
03333   {
03334     nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
03335         2, ths->n, m, ths->nfft_flags, ths->index_x);
03336     return;
03337   } /* if(PRE_FULL_PSI) */
03338 
03339   if(ths->nfft_flags & PRE_PSI)
03340   {
03341 #ifdef _OPENMP
03342     MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
03343 #endif
03344 
03345     #pragma omp parallel for default(shared) private(k)
03346     for (k = 0; k < M; k++)
03347     {
03348       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03349 #ifdef _OPENMP
03350       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03351 #else
03352       nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03353 #endif
03354     }
03355     return;
03356   } /* if(PRE_PSI) */
03357 
03358   if(ths->nfft_flags & PRE_FG_PSI)
03359   {
03360     R fg_exp_l[2*(2*m+2)];
03361 
03362     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03363     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03364 
03365 #ifdef _OPENMP
03366     MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
03367 #endif
03368 
03369 
03370     #pragma omp parallel for default(shared) private(k)
03371     for (k = 0; k < M; k++)
03372     {
03373       R psij_const[2*(2*m+2)];
03374       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03375       int l;
03376       R fg_psij0 = ths->psi[2*j*2];
03377       R fg_psij1 = ths->psi[2*j*2+1];
03378       R fg_psij2 = K(1.0);
03379 
03380       psij_const[0] = fg_psij0;
03381       for(l=1; l<=2*m+1; l++)
03382       {
03383         fg_psij2 *= fg_psij1;
03384         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03385       }
03386 
03387       fg_psij0 = ths->psi[2*(j*2+1)];
03388       fg_psij1 = ths->psi[2*(j*2+1)+1];
03389       fg_psij2 = K(1.0);
03390       psij_const[2*m+2] = fg_psij0;
03391       for(l=1; l<=2*m+1; l++)
03392       {
03393         fg_psij2 *= fg_psij1;
03394         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03395       }
03396 
03397 #ifdef _OPENMP
03398       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03399 #else
03400       nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03401 #endif
03402     }
03403 
03404     return;
03405   } /* if(PRE_FG_PSI) */
03406 
03407   if(ths->nfft_flags & FG_PSI)
03408   {
03409     R fg_exp_l[2*(2*m+2)];
03410 
03411     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03412     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03413 
03414     nfft_sort_nodes(ths);
03415 
03416 #ifdef _OPENMP
03417     MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
03418 #endif
03419 
03420     #pragma omp parallel for default(shared) private(k)
03421     for (k = 0; k < M; k++)
03422     {
03423       int u, o, l;
03424       R fg_psij0, fg_psij1, fg_psij2;
03425       R psij_const[2*(2*m+2)];
03426       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03427 
03428       nfft_uo(ths,j,&u,&o,0);
03429       fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0));
03430       fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]);
03431       fg_psij2 = K(1.0);
03432       psij_const[0] = fg_psij0;
03433       for(l=1; l<=2*m+1; l++)
03434       {
03435         fg_psij2 *= fg_psij1;
03436         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03437       }
03438 
03439       nfft_uo(ths,j,&u,&o,1);
03440       fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1));
03441       fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]);
03442       fg_psij2 = K(1.0);
03443       psij_const[2*m+2] = fg_psij0;
03444       for(l=1; l<=2*m+1; l++)
03445       {
03446         fg_psij2 *= fg_psij1;
03447         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03448       }
03449 
03450 #ifdef _OPENMP
03451       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03452 #else
03453       nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03454 #endif
03455     }
03456 
03457     return;
03458   } /* if(FG_PSI) */
03459 
03460   if(ths->nfft_flags & PRE_LIN_PSI)
03461   {
03462     const int K = ths->K;
03463     const int ip_s = K / (m + 2);
03464 
03465     nfft_sort_nodes(ths);
03466 
03467 #ifdef _OPENMP
03468     MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
03469 #endif
03470 
03471     #pragma openmp parallel for default(shared) private(k)
03472     for (k = 0; k < M; k++)
03473     {
03474       int u,o,l;
03475       int ip_u;
03476       R ip_y, ip_w;
03477       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03478       R psij_const[2*(2*m+2)];
03479 
03480       nfft_uo(ths,j,&u,&o,0);
03481       ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s);
03482       ip_u = LRINT(FLOOR(ip_y));
03483       ip_w = ip_y-ip_u;
03484       for(l=0; l < 2*m+2; l++)
03485         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
03486           ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
03487 
03488       nfft_uo(ths,j,&u,&o,1);
03489       ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s);
03490       ip_u = LRINT(FLOOR(ip_y));
03491       ip_w = ip_y-ip_u;
03492       for(l=0; l < 2*m+2; l++)
03493         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
03494           ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
03495 
03496 #ifdef _OPENMP
03497       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03498 #else
03499       nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03500 #endif
03501   }
03502       return;
03503     } /* if(PRE_LIN_PSI) */
03504 
03505   /* no precomputed psi at all */
03506   nfft_sort_nodes(ths);
03507 
03508 #ifdef _OPENMP
03509   MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
03510 #endif
03511 
03512   #pragma omp parallel for default(shared) private(k)
03513   for (k = 0; k < M; k++)
03514   {
03515     int u,o,l;
03516     R psij_const[2*(2*m+2)];
03517     int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03518 
03519     nfft_uo(ths,j,&u,&o,0);
03520     for(l=0;l<=2*m+1;l++)
03521       psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0));
03522 
03523     nfft_uo(ths,j,&u,&o,1);
03524     for(l=0;l<=2*m+1;l++)
03525       psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1));
03526 
03527 #ifdef _OPENMP
03528     nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03529 #else
03530     nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03531 #endif
03532   }
03533 }
03534 
03535 
03536 void nfft_trafo_2d(nfft_plan *ths)
03537 {
03538   int k0,k1,n0,n1,N0,N1;
03539   C *g_hat,*f_hat;
03540   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
03541   R ck01, ck02, ck11, ck12;
03542   C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
03543 
03544   ths->g_hat=ths->g1;
03545   ths->g=ths->g2;
03546 
03547   N0=ths->N[0];
03548   N1=ths->N[1];
03549   n0=ths->n[0];
03550   n1=ths->n[1];
03551 
03552   f_hat=(C*)ths->f_hat;
03553   g_hat=(C*)ths->g_hat;
03554 
03555   TIC(0)
03556 #ifdef _OPENMP
03557   #pragma omp parallel for default(shared) private(k0)
03558   for (k0 = 0; k0 < ths->n_total; k0++)
03559     ths->g_hat[k0] = 0.0;
03560 #else
03561   memset(ths->g_hat,0,ths->n_total*sizeof(C));
03562 #endif
03563   if(ths->nfft_flags & PRE_PHI_HUT)
03564     {
03565       c_phi_inv01=ths->c_phi_inv[0];
03566       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
03567 
03568       #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
03569       for(k0=0;k0<N0/2;k0++)
03570       {
03571         ck01=c_phi_inv01[k0];
03572         ck02=c_phi_inv02[k0];
03573 
03574         c_phi_inv11=ths->c_phi_inv[1];
03575         c_phi_inv12=&ths->c_phi_inv[1][N1/2];
03576 
03577         g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
03578         f_hat11=f_hat + k0*N1;
03579         g_hat21=g_hat + k0*n1+n1-(N1/2);
03580         f_hat21=f_hat + ((N0/2)+k0)*N1;
03581         g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
03582         f_hat12=f_hat + k0*N1+(N1/2);
03583         g_hat22=g_hat + k0*n1;
03584         f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
03585 
03586         for(k1=0;k1<N1/2;k1++)
03587         {
03588           ck11=c_phi_inv11[k1];
03589           ck12=c_phi_inv12[k1];
03590 
03591           g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
03592           g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
03593           g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
03594           g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
03595         }
03596       }
03597     }
03598   else
03599     #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
03600     for(k0=0;k0<N0/2;k0++)
03601       {
03602   ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
03603   ck02=K(1.0)/(PHI_HUT(k0,0));
03604   for(k1=0;k1<N1/2;k1++)
03605     {
03606       ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
03607       ck12=K(1.0)/(PHI_HUT(k1,1));
03608       g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1]             * ck01 * ck11;
03609       g_hat[k0*n1+n1-N1/2+k1]           = f_hat[(N0/2+k0)*N1+k1]      * ck02 * ck11;
03610       g_hat[(n0-N0/2+k0)*n1+k1]         = f_hat[k0*N1+N1/2+k1]        * ck01 * ck12;
03611       g_hat[k0*n1+k1]                   = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
03612     }
03613       }
03614 
03615   TOC(0)
03616 
03617   TIC_FFTW(1)
03618   fftw_execute(ths->my_fftw_plan1);
03619   TOC_FFTW(1);
03620 
03621   TIC(2);
03622   nfft_trafo_2d_B(ths);
03623   TOC(2);
03624 }
03625 
03626 void nfft_adjoint_2d(nfft_plan *ths)
03627 {
03628   int k0,k1,n0,n1,N0,N1;
03629   C *g_hat,*f_hat;
03630   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
03631   R ck01, ck02, ck11, ck12;
03632   C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
03633 
03634   ths->g_hat=ths->g1;
03635   ths->g=ths->g2;
03636 
03637   N0=ths->N[0];
03638   N1=ths->N[1];
03639   n0=ths->n[0];
03640   n1=ths->n[1];
03641 
03642   f_hat=(C*)ths->f_hat;
03643   g_hat=(C*)ths->g_hat;
03644 
03645   TIC(2);
03646   nfft_adjoint_2d_B(ths);
03647   TOC(2);
03648 
03649   TIC_FFTW(1)
03650   fftw_execute(ths->my_fftw_plan2);
03651   TOC_FFTW(1);
03652 
03653   TIC(0)
03654   if(ths->nfft_flags & PRE_PHI_HUT)
03655     {
03656       c_phi_inv01=ths->c_phi_inv[0];
03657       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
03658 
03659       #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
03660       for(k0=0;k0<N0/2;k0++)
03661       {
03662         ck01=c_phi_inv01[k0];
03663         ck02=c_phi_inv02[k0];
03664 
03665         c_phi_inv11=ths->c_phi_inv[1];
03666         c_phi_inv12=&ths->c_phi_inv[1][N1/2];
03667 
03668         g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
03669         f_hat11=f_hat + k0*N1;
03670         g_hat21=g_hat + k0*n1+n1-(N1/2);
03671         f_hat21=f_hat + ((N0/2)+k0)*N1;
03672         g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
03673         f_hat12=f_hat + k0*N1+(N1/2);
03674         g_hat22=g_hat + k0*n1;
03675         f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
03676 
03677         for(k1=0;k1<N1/2;k1++)
03678         {
03679           ck11=c_phi_inv11[k1];
03680           ck12=c_phi_inv12[k1];
03681 
03682           f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
03683           f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
03684           f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
03685           f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
03686         }
03687       }
03688     }
03689   else
03690     #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
03691     for(k0=0;k0<N0/2;k0++)
03692       {
03693   ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
03694   ck02=K(1.0)/(PHI_HUT(k0,0));
03695   for(k1=0;k1<N1/2;k1++)
03696     {
03697       ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
03698       ck12=K(1.0)/(PHI_HUT(k1,1));
03699       f_hat[k0*N1+k1]             = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
03700       f_hat[(N0/2+k0)*N1+k1]      = g_hat[k0*n1+n1-N1/2+k1]           * ck02 * ck11;
03701       f_hat[k0*N1+N1/2+k1]        = g_hat[(n0-N0/2+k0)*n1+k1]         * ck01 * ck12;
03702       f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1]                   * ck02 * ck12;
03703     }
03704       }
03705   TOC(0)
03706 }
03707 
03708 /* ############################################################ SPECIFIC VERSIONS FOR d=3 */
03709 
03710 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
03711 {
03712   int l;
03713   R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
03714 
03715   fg_exp_b0 = EXP(-1.0/b);
03716   fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
03717   fg_exp_b1 = K(1.0);
03718   fg_exp_b2 = K(1.0);
03719   fg_exp_l[0] = K(1.0);
03720   for(l=1; l <= 2*m+1; l++)
03721     {
03722       fg_exp_b2 = fg_exp_b1*fg_exp_b0;
03723       fg_exp_b1 *= fg_exp_b0_sq;
03724       fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
03725     }
03726 }
03727 
03728 static void nfft_trafo_3d_compute(C *fj, const C *g,
03729           const R *psij_const0, const R *psij_const1, const R *psij_const2,
03730           const R *xj0, const R *xj1, const R *xj2,
03731           const int n0, const int n1, const int n2, const int m)
03732 {
03733   int u0,o0,l0,u1,o1,l1,u2,o2,l2;
03734   const C *gj;
03735   const R *psij0,*psij1,*psij2;
03736 
03737   psij0=psij_const0;
03738   psij1=psij_const1;
03739   psij2=psij_const2;
03740 
03741   nfft_uo2(&u0,&o0,*xj0, n0, m);
03742   nfft_uo2(&u1,&o1,*xj1, n1, m);
03743   nfft_uo2(&u2,&o2,*xj2, n2, m);
03744 
03745   *fj=0;
03746 
03747   if(u0<o0)
03748     if(u1<o1)
03749       if(u2<o2)
03750   for(l0=0; l0<=2*m+1; l0++,psij0++)
03751     {
03752       psij1=psij_const1;
03753       for(l1=0; l1<=2*m+1; l1++,psij1++)
03754         {
03755     psij2=psij_const2;
03756     gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03757     for(l2=0; l2<=2*m+1; l2++)
03758       (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03759         }
03760     }
03761       else/* asserts (u2>o2)*/
03762   for(l0=0; l0<=2*m+1; l0++,psij0++)
03763     {
03764       psij1=psij_const1;
03765       for(l1=0; l1<=2*m+1; l1++,psij1++)
03766         {
03767     psij2=psij_const2;
03768     gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03769     for(l2=0; l2<2*m+1-o2; l2++)
03770       (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03771     gj=g+((u0+l0)*n1+(u1+l1))*n2;
03772     for(l2=0; l2<=o2; l2++)
03773       (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03774         }
03775     }
03776     else/* asserts (u1>o1)*/
03777       if(u2<o2)
03778   for(l0=0; l0<=2*m+1; l0++,psij0++)
03779     {
03780       psij1=psij_const1;
03781       for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03782         {
03783     psij2=psij_const2;
03784     gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03785     for(l2=0; l2<=2*m+1; l2++)
03786       (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03787         }
03788       for(l1=0; l1<=o1; l1++,psij1++)
03789         {
03790     psij2=psij_const2;
03791     gj=g+((u0+l0)*n1+l1)*n2+u2;
03792     for(l2=0; l2<=2*m+1; l2++)
03793       (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03794         }
03795     }
03796       else/* asserts (u2>o2) */
03797   {
03798     for(l0=0; l0<=2*m+1; l0++,psij0++)
03799       {
03800         psij1=psij_const1;
03801         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03802     {
03803       psij2=psij_const2;
03804       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03805       for(l2=0; l2<2*m+1-o2; l2++)
03806         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03807       gj=g+((u0+l0)*n1+(u1+l1))*n2;
03808       for(l2=0; l2<=o2; l2++)
03809         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03810     }
03811         for(l1=0; l1<=o1; l1++,psij1++)
03812     {
03813       psij2=psij_const2;
03814       gj=g+((u0+l0)*n1+l1)*n2+u2;
03815       for(l2=0; l2<2*m+1-o2; l2++)
03816         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03817       gj=g+((u0+l0)*n1+l1)*n2;
03818       for(l2=0; l2<=o2; l2++)
03819         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03820     }
03821       }
03822   }
03823   else/* asserts (u0>o0) */
03824     if(u1<o1)
03825       if(u2<o2)
03826   {
03827     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03828       {
03829         psij1=psij_const1;
03830         for(l1=0; l1<=2*m+1; l1++,psij1++)
03831     {
03832       psij2=psij_const2;
03833       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03834       for(l2=0; l2<=2*m+1; l2++)
03835         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03836     }
03837       }
03838 
03839     for(l0=0; l0<=o0; l0++,psij0++)
03840       {
03841         psij1=psij_const1;
03842         for(l1=0; l1<=2*m+1; l1++,psij1++)
03843     {
03844       psij2=psij_const2;
03845       gj=g+(l0*n1+(u1+l1))*n2+u2;
03846       for(l2=0; l2<=2*m+1; l2++)
03847         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03848     }
03849       }
03850   }
03851       else/* asserts (u2>o2) */
03852   {
03853     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03854       {
03855         psij1=psij_const1;
03856         for(l1=0; l1<=2*m+1; l1++,psij1++)
03857     {
03858       psij2=psij_const2;
03859       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03860       for(l2=0; l2<2*m+1-o2; l2++)
03861         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03862       gj=g+((u0+l0)*n1+(u1+l1))*n2;
03863       for(l2=0; l2<=o2; l2++)
03864         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03865     }
03866       }
03867 
03868     for(l0=0; l0<=o0; l0++,psij0++)
03869       {
03870         psij1=psij_const1;
03871         for(l1=0; l1<=2*m+1; l1++,psij1++)
03872     {
03873       psij2=psij_const2;
03874       gj=g+(l0*n1+(u1+l1))*n2+u2;
03875       for(l2=0; l2<2*m+1-o2; l2++)
03876         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03877       gj=g+(l0*n1+(u1+l1))*n2;
03878       for(l2=0; l2<=o2; l2++)
03879         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03880     }
03881       }
03882   }
03883     else/* asserts (u1>o1) */
03884       if(u2<o2)
03885   {
03886     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03887       {
03888         psij1=psij_const1;
03889         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03890     {
03891       psij2=psij_const2;
03892       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03893       for(l2=0; l2<=2*m+1; l2++)
03894         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03895     }
03896         for(l1=0; l1<=o1; l1++,psij1++)
03897     {
03898       psij2=psij_const2;
03899       gj=g+((u0+l0)*n1+l1)*n2+u2;
03900       for(l2=0; l2<=2*m+1; l2++)
03901         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03902     }
03903       }
03904     for(l0=0; l0<=o0; l0++,psij0++)
03905       {
03906         psij1=psij_const1;
03907         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03908     {
03909       psij2=psij_const2;
03910       gj=g+(l0*n1+(u1+l1))*n2+u2;
03911       for(l2=0; l2<=2*m+1; l2++)
03912         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03913     }
03914         for(l1=0; l1<=o1; l1++,psij1++)
03915     {
03916       psij2=psij_const2;
03917       gj=g+(l0*n1+l1)*n2+u2;
03918       for(l2=0; l2<=2*m+1; l2++)
03919         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03920     }
03921       }
03922   }
03923       else/* asserts (u2>o2) */
03924   {
03925     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
03926       {
03927         psij1=psij_const1;
03928         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03929     {
03930       psij2=psij_const2;
03931       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
03932       for(l2=0; l2<2*m+1-o2; l2++)
03933         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03934       gj=g+((u0+l0)*n1+(u1+l1))*n2;
03935       for(l2=0; l2<=o2; l2++)
03936         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03937     }
03938         for(l1=0; l1<=o1; l1++,psij1++)
03939     {
03940       psij2=psij_const2;
03941       gj=g+((u0+l0)*n1+l1)*n2+u2;
03942       for(l2=0; l2<2*m+1-o2; l2++)
03943         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03944       gj=g+((u0+l0)*n1+l1)*n2;
03945       for(l2=0; l2<=o2; l2++)
03946         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03947     }
03948       }
03949 
03950     for(l0=0; l0<=o0; l0++,psij0++)
03951       {
03952         psij1=psij_const1;
03953         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
03954     {
03955       psij2=psij_const2;
03956       gj=g+(l0*n1+(u1+l1))*n2+u2;
03957       for(l2=0; l2<2*m+1-o2; l2++)
03958         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03959       gj=g+(l0*n1+(u1+l1))*n2;
03960       for(l2=0; l2<=o2; l2++)
03961         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03962     }
03963         for(l1=0; l1<=o1; l1++,psij1++)
03964     {
03965       psij2=psij_const2;
03966       gj=g+(l0*n1+l1)*n2+u2;
03967       for(l2=0; l2<2*m+1-o2; l2++)
03968         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03969       gj=g+(l0*n1+l1)*n2;
03970       for(l2=0; l2<=o2; l2++)
03971         (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03972     }
03973       }
03974   }
03975 }
03976 
03977 #ifdef _OPENMP
03978 
03999 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
04000             const R *psij_const0, const R *psij_const1, const R *psij_const2,
04001             const R *xj0, const R *xj1, const R *xj2,
04002             const int n0, const int n1, const int n2, const int m,
04003       const int my_u0, const int my_o0)
04004 {
04005   int ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
04006   const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
04007 
04008   unsigned long int index_temp1[2*m+2];
04009   unsigned long int index_temp2[2*m+2];
04010 
04011   nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
04012   nfft_uo2(&u1,&o1,*xj1, n1, m);
04013   nfft_uo2(&u2,&o2,*xj2, n2, m);
04014 
04015   for (l1=0; l1<=2*m+1; l1++)
04016     index_temp1[l1] = (u1+l1)%n1;
04017 
04018   for (l2=0; l2<=2*m+1; l2++)
04019     index_temp2[l2] = (u2+l2)%n2;
04020 
04021   if(ar_u0<ar_o0)
04022   {
04023     int u0 = MAX(my_u0,ar_u0);
04024     int o0 = MIN(my_o0,ar_o0);
04025     int offset_psij = u0-ar_u0;
04026 #ifdef OMP_ASSERT
04027     assert(offset_psij >= 0);
04028     assert(o0-u0 <= 2*m+1);
04029     assert(offset_psij+o0-u0 <= 2*m+1);
04030 #endif
04031 
04032     for (l0 = 0; l0 <= o0-u0; l0++)
04033     {
04034       const unsigned long int i0 = (u0+l0) * n1;
04035       const C val0 = psij_const0[offset_psij+l0];
04036 
04037       for(l1=0; l1<=2*m+1; l1++)
04038       {
04039         const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
04040         const C val1 = psij_const1[l1];
04041 
04042         for(l2=0; l2<=2*m+1; l2++)
04043           g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04044       }
04045     }  
04046   }
04047   else
04048   {
04049     int u0 = MAX(my_u0,ar_u0);
04050     int o0 = my_o0;
04051     int offset_psij = u0-ar_u0;
04052 #ifdef OMP_ASSERT
04053     assert(offset_psij >= 0);
04054     assert(o0-u0 <= 2*m+1);
04055     assert(offset_psij+o0-u0 <= 2*m+1);
04056 #endif
04057 
04058     for (l0 = 0; l0 <= o0-u0; l0++)
04059     {
04060       unsigned long int i0 = (u0+l0) * n1;
04061       const C val0 = psij_const0[offset_psij+l0];
04062 
04063       for(l1=0; l1<=2*m+1; l1++)
04064       {
04065         const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
04066         const C val1 = psij_const1[l1];
04067 
04068         for(l2=0; l2<=2*m+1; l2++)
04069           g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04070       }
04071     }
04072 
04073     u0 = my_u0;
04074     o0 = MIN(my_o0,ar_o0);
04075     offset_psij += my_u0-ar_u0+n0;
04076 
04077 #ifdef OMP_ASSERT
04078     if (u0<=o0)
04079     {
04080       assert(o0-u0 <= 2*m+1);
04081       assert(offset_psij+o0-u0 <= 2*m+1);
04082     }
04083 #endif
04084     for (l0 = 0; l0 <= o0-u0; l0++)
04085     {
04086       unsigned long int i0 = (u0+l0) * n1;
04087       const C val0 = psij_const0[offset_psij+l0];
04088 
04089       for(l1=0; l1<=2*m+1; l1++)
04090       {
04091         const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
04092         const C val1 = psij_const1[l1];
04093 
04094         for(l2=0; l2<=2*m+1; l2++)
04095           g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04096       }
04097     }
04098   }
04099 }
04100 #endif
04101 
04102 #ifdef _OPENMP
04103 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */
04104 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
04105             const R *psij_const0, const R *psij_const1, const R *psij_const2,
04106             const R *xj0, const R *xj1, const R *xj2,
04107             const int n0, const int n1, const int n2, const int m)
04108 {
04109   int u0,o0,l0,u1,o1,l1,u2,o2,l2;
04110   const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
04111 
04112   unsigned long int index_temp0[2*m+2];
04113   unsigned long int index_temp1[2*m+2];
04114   unsigned long int index_temp2[2*m+2];
04115 
04116   nfft_uo2(&u0,&o0,*xj0, n0, m);
04117   nfft_uo2(&u1,&o1,*xj1, n1, m);
04118   nfft_uo2(&u2,&o2,*xj2, n2, m);
04119 
04120   for (l0=0; l0<=2*m+1; l0++)
04121     index_temp0[l0] = (u0+l0)%n0;
04122 
04123   for (l1=0; l1<=2*m+1; l1++)
04124     index_temp1[l1] = (u1+l1)%n1;
04125 
04126   for (l2=0; l2<=2*m+1; l2++)
04127     index_temp2[l2] = (u2+l2)%n2;
04128 
04129   for(l0=0; l0<=2*m+1; l0++)
04130   {
04131     for(l1=0; l1<=2*m+1; l1++)
04132     {
04133       for(l2=0; l2<=2*m+1; l2++)
04134       {
04135         unsigned long int i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
04136         C *lhs = g+i;
04137         R *lhs_real = (R*)lhs;
04138         C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
04139 
04140         #pragma omp atomic
04141         lhs_real[0] += creal(val);
04142 
04143         #pragma omp atomic
04144         lhs_real[1] += cimag(val);
04145       }
04146     }
04147   }
04148 }
04149 #endif
04150 
04151 #ifndef _OPENMP
04152 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
04153             const R *psij_const0, const R *psij_const1, const R *psij_const2,
04154             const R *xj0, const R *xj1, const R *xj2,
04155             const int n0, const int n1, const int n2, const int m)
04156 {
04157   int u0,o0,l0,u1,o1,l1,u2,o2,l2;
04158   C *gj;
04159   const R *psij0,*psij1,*psij2;
04160 
04161   psij0=psij_const0;
04162   psij1=psij_const1;
04163   psij2=psij_const2;
04164 
04165   nfft_uo2(&u0,&o0,*xj0, n0, m);
04166   nfft_uo2(&u1,&o1,*xj1, n1, m);
04167   nfft_uo2(&u2,&o2,*xj2, n2, m);
04168 
04169   if(u0<o0)
04170     if(u1<o1)
04171       if(u2<o2)
04172   for(l0=0; l0<=2*m+1; l0++,psij0++)
04173     {
04174       psij1=psij_const1;
04175       for(l1=0; l1<=2*m+1; l1++,psij1++)
04176         {
04177     psij2=psij_const2;
04178     gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04179     for(l2=0; l2<=2*m+1; l2++)
04180       (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04181         }
04182     }
04183       else/* asserts (u2>o2)*/
04184   for(l0=0; l0<=2*m+1; l0++,psij0++)
04185     {
04186       psij1=psij_const1;
04187       for(l1=0; l1<=2*m+1; l1++,psij1++)
04188         {
04189     psij2=psij_const2;
04190     gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04191     for(l2=0; l2<2*m+1-o2; l2++)
04192       (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04193     gj=g+((u0+l0)*n1+(u1+l1))*n2;
04194     for(l2=0; l2<=o2; l2++)
04195       (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04196         }
04197     }
04198     else/* asserts (u1>o1)*/
04199       if(u2<o2)
04200   for(l0=0; l0<=2*m+1; l0++,psij0++)
04201     {
04202       psij1=psij_const1;
04203       for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04204         {
04205     psij2=psij_const2;
04206     gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04207     for(l2=0; l2<=2*m+1; l2++)
04208       (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04209         }
04210       for(l1=0; l1<=o1; l1++,psij1++)
04211         {
04212     psij2=psij_const2;
04213     gj=g+((u0+l0)*n1+l1)*n2+u2;
04214     for(l2=0; l2<=2*m+1; l2++)
04215       (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04216         }
04217     }
04218       else/* asserts (u2>o2) */
04219   {
04220     for(l0=0; l0<=2*m+1; l0++,psij0++)
04221       {
04222         psij1=psij_const1;
04223         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04224     {
04225       psij2=psij_const2;
04226       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04227       for(l2=0; l2<2*m+1-o2; l2++)
04228         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04229       gj=g+((u0+l0)*n1+(u1+l1))*n2;
04230       for(l2=0; l2<=o2; l2++)
04231         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04232     }
04233         for(l1=0; l1<=o1; l1++,psij1++)
04234     {
04235       psij2=psij_const2;
04236       gj=g+((u0+l0)*n1+l1)*n2+u2;
04237       for(l2=0; l2<2*m+1-o2; l2++)
04238         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04239       gj=g+((u0+l0)*n1+l1)*n2;
04240       for(l2=0; l2<=o2; l2++)
04241         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04242     }
04243       }
04244   }
04245   else/* asserts (u0>o0) */
04246     if(u1<o1)
04247       if(u2<o2)
04248   {
04249     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04250       {
04251         psij1=psij_const1;
04252         for(l1=0; l1<=2*m+1; l1++,psij1++)
04253     {
04254       psij2=psij_const2;
04255       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04256       for(l2=0; l2<=2*m+1; l2++)
04257         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04258     }
04259       }
04260 
04261     for(l0=0; l0<=o0; l0++,psij0++)
04262       {
04263         psij1=psij_const1;
04264         for(l1=0; l1<=2*m+1; l1++,psij1++)
04265     {
04266       psij2=psij_const2;
04267       gj=g+(l0*n1+(u1+l1))*n2+u2;
04268       for(l2=0; l2<=2*m+1; l2++)
04269         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04270     }
04271       }
04272   }
04273       else/* asserts (u2>o2) */
04274   {
04275     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04276       {
04277         psij1=psij_const1;
04278         for(l1=0; l1<=2*m+1; l1++,psij1++)
04279     {
04280       psij2=psij_const2;
04281       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04282       for(l2=0; l2<2*m+1-o2; l2++)
04283         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04284       gj=g+((u0+l0)*n1+(u1+l1))*n2;
04285       for(l2=0; l2<=o2; l2++)
04286         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04287     }
04288       }
04289 
04290     for(l0=0; l0<=o0; l0++,psij0++)
04291       {
04292         psij1=psij_const1;
04293         for(l1=0; l1<=2*m+1; l1++,psij1++)
04294     {
04295       psij2=psij_const2;
04296       gj=g+(l0*n1+(u1+l1))*n2+u2;
04297       for(l2=0; l2<2*m+1-o2; l2++)
04298         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04299       gj=g+(l0*n1+(u1+l1))*n2;
04300       for(l2=0; l2<=o2; l2++)
04301         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04302     }
04303       }
04304   }
04305     else/* asserts (u1>o1) */
04306       if(u2<o2)
04307   {
04308     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04309       {
04310         psij1=psij_const1;
04311         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04312     {
04313       psij2=psij_const2;
04314       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04315       for(l2=0; l2<=2*m+1; l2++)
04316         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04317     }
04318         for(l1=0; l1<=o1; l1++,psij1++)
04319     {
04320       psij2=psij_const2;
04321       gj=g+((u0+l0)*n1+l1)*n2+u2;
04322       for(l2=0; l2<=2*m+1; l2++)
04323         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04324     }
04325       }
04326     for(l0=0; l0<=o0; l0++,psij0++)
04327       {
04328         psij1=psij_const1;
04329         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04330     {
04331       psij2=psij_const2;
04332       gj=g+(l0*n1+(u1+l1))*n2+u2;
04333       for(l2=0; l2<=2*m+1; l2++)
04334         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04335     }
04336         for(l1=0; l1<=o1; l1++,psij1++)
04337     {
04338       psij2=psij_const2;
04339       gj=g+(l0*n1+l1)*n2+u2;
04340       for(l2=0; l2<=2*m+1; l2++)
04341         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04342     }
04343       }
04344   }
04345       else/* asserts (u2>o2) */
04346   {
04347     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
04348       {
04349         psij1=psij_const1;
04350         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04351     {
04352       psij2=psij_const2;
04353       gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
04354       for(l2=0; l2<2*m+1-o2; l2++)
04355         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04356       gj=g+((u0+l0)*n1+(u1+l1))*n2;
04357       for(l2=0; l2<=o2; l2++)
04358         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04359     }
04360         for(l1=0; l1<=o1; l1++,psij1++)
04361     {
04362       psij2=psij_const2;
04363       gj=g+((u0+l0)*n1+l1)*n2+u2;
04364       for(l2=0; l2<2*m+1-o2; l2++)
04365         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04366       gj=g+((u0+l0)*n1+l1)*n2;
04367       for(l2=0; l2<=o2; l2++)
04368         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04369     }
04370       }
04371 
04372     for(l0=0; l0<=o0; l0++,psij0++)
04373       {
04374         psij1=psij_const1;
04375         for(l1=0; l1<2*m+1-o1; l1++,psij1++)
04376     {
04377       psij2=psij_const2;
04378       gj=g+(l0*n1+(u1+l1))*n2+u2;
04379       for(l2=0; l2<2*m+1-o2; l2++)
04380         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04381       gj=g+(l0*n1+(u1+l1))*n2;
04382       for(l2=0; l2<=o2; l2++)
04383         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04384     }
04385         for(l1=0; l1<=o1; l1++,psij1++)
04386     {
04387       psij2=psij_const2;
04388       gj=g+(l0*n1+l1)*n2+u2;
04389       for(l2=0; l2<2*m+1-o2; l2++)
04390         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04391       gj=g+(l0*n1+l1)*n2;
04392       for(l2=0; l2<=o2; l2++)
04393         (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04394     }
04395       }
04396   }
04397 }
04398 #endif
04399 
04400 static void nfft_trafo_3d_B(nfft_plan *ths)
04401 {
04402   const int N0 = ths->N[0];
04403   const int n0 = ths->n[0];
04404   const int N1 = ths->N[1];
04405   const int n1 = ths->n[1];
04406   const int N2 = ths->N[2];
04407   const int n2 = ths->n[2];
04408   const int M = ths->M_total;
04409   const int m = ths->m;
04410 
04411   const C* g = (C*) ths->g;
04412 
04413   int k;
04414 
04415   if(ths->nfft_flags & PRE_FULL_PSI)
04416   {
04417     const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
04418     #pragma omp parallel for default(shared) private(k)
04419     for (k = 0; k < M; k++)
04420     {
04421       int l;
04422       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04423       ths->f[j] = K(0.0);
04424       for (l = 0; l < lprod; l++)
04425         ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
04426     }
04427     return;
04428   } /* if(PRE_FULL_PSI) */
04429 
04430   if(ths->nfft_flags & PRE_PSI)
04431   {
04432     #pragma omp parallel for default(shared) private(k)
04433     for (k = 0; k < M; k++)
04434     {
04435       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04436       nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04437     }
04438     return;
04439   } /* if(PRE_PSI) */
04440 
04441   if(ths->nfft_flags & PRE_FG_PSI)
04442   {
04443     R fg_exp_l[3*(2*m+2)];
04444 
04445     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04446     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04447     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04448 
04449     #pragma omp parallel for default(shared) private(k)
04450     for (k = 0; k < M; k++)
04451     {
04452       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04453       int l;
04454       R psij_const[3*(2*m+2)];
04455       R fg_psij0 = ths->psi[2*j*3];
04456       R fg_psij1 = ths->psi[2*j*3+1];
04457       R fg_psij2 = K(1.0);
04458 
04459       psij_const[0] = fg_psij0;
04460       for(l=1; l<=2*m+1; l++)
04461       {
04462         fg_psij2 *= fg_psij1;
04463         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04464       }
04465 
04466       fg_psij0 = ths->psi[2*(j*3+1)];
04467       fg_psij1 = ths->psi[2*(j*3+1)+1];
04468       fg_psij2 = K(1.0);
04469       psij_const[2*m+2] = fg_psij0;
04470       for(l=1; l<=2*m+1; l++)
04471       {
04472         fg_psij2 *= fg_psij1;
04473         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04474       }
04475 
04476       fg_psij0 = ths->psi[2*(j*3+2)];
04477       fg_psij1 = ths->psi[2*(j*3+2)+1];
04478       fg_psij2 = K(1.0);
04479       psij_const[2*(2*m+2)] = fg_psij0;
04480       for(l=1; l<=2*m+1; l++)
04481       {
04482         fg_psij2 *= fg_psij1;
04483         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04484       }
04485 
04486       nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04487     }
04488 
04489     return;
04490   } /* if(PRE_FG_PSI) */
04491 
04492   if(ths->nfft_flags & FG_PSI)
04493   {
04494     R fg_exp_l[3*(2*m+2)];
04495 
04496     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04497     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04498     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04499 
04500     nfft_sort_nodes(ths);
04501 
04502     #pragma omp parallel for default(shared) private(k)
04503     for (k = 0; k < M; k++)
04504     {
04505       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04506       int u, o, l;
04507       R psij_const[3*(2*m+2)];
04508       R fg_psij0, fg_psij1, fg_psij2;
04509 
04510       nfft_uo(ths,j,&u,&o,0);
04511       fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0));
04512       fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]);
04513       fg_psij2 = K(1.0);
04514       psij_const[0] = fg_psij0;
04515       for(l=1; l<=2*m+1; l++)
04516       {
04517         fg_psij2 *= fg_psij1;
04518         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04519       }
04520 
04521       nfft_uo(ths,j,&u,&o,1);
04522       fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1));
04523       fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]);
04524       fg_psij2 = K(1.0);
04525       psij_const[2*m+2] = fg_psij0;
04526       for(l=1; l<=2*m+1; l++)
04527       {
04528         fg_psij2 *= fg_psij1;
04529         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04530       }
04531 
04532       nfft_uo(ths,j,&u,&o,2);
04533       fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2));
04534       fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]);
04535       fg_psij2 = K(1.0);
04536       psij_const[2*(2*m+2)] = fg_psij0;
04537       for(l=1; l<=2*m+1; l++)
04538       {
04539         fg_psij2 *= fg_psij1;
04540         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04541       }
04542 
04543       nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04544     }
04545 
04546     return;
04547   } /* if(FG_PSI) */
04548 
04549   if(ths->nfft_flags & PRE_LIN_PSI)
04550   {
04551     const int K = ths->K, ip_s = K / (m + 2);
04552 
04553     nfft_sort_nodes(ths);
04554 
04555     #pragma omp parallel for default(shared) private(k)
04556     for (k = 0; k < M; k++)
04557     {
04558       int u, o, l;
04559       R ip_y, ip_w;
04560       int ip_u;
04561       R psij_const[3*(2*m+2)];
04562       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04563 
04564       nfft_uo(ths,j,&u,&o,0);
04565       ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s);
04566       ip_u = LRINT(FLOOR(ip_y));
04567       ip_w = ip_y-ip_u;
04568       for(l=0; l < 2*m+2; l++)
04569         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04570           ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
04571 
04572       nfft_uo(ths,j,&u,&o,1);
04573       ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s);
04574       ip_u = LRINT(FLOOR(ip_y));
04575       ip_w = ip_y-ip_u;
04576       for(l=0; l < 2*m+2; l++)
04577         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04578           ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
04579 
04580       nfft_uo(ths,j,&u,&o,2);
04581       ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s);
04582       ip_u = LRINT(FLOOR(ip_y));
04583       ip_w = ip_y-ip_u;
04584       for(l=0; l < 2*m+2; l++)
04585         psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04586           ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
04587 
04588       nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04589     }
04590     return;
04591   } /* if(PRE_LIN_PSI) */
04592 
04593   /* no precomputed psi at all */
04594 
04595   nfft_sort_nodes(ths);
04596 
04597   #pragma omp parallel for default(shared) private(k)
04598   for (k = 0; k < M; k++)
04599   {
04600     R psij_const[3*(2*m+2)];
04601     int u, o, l;
04602     int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04603 
04604     nfft_uo(ths,j,&u,&o,0);
04605     for(l=0;l<=2*m+1;l++)
04606       psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0));
04607 
04608     nfft_uo(ths,j,&u,&o,1);
04609     for(l=0;l<=2*m+1;l++)
04610       psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1));
04611 
04612     nfft_uo(ths,j,&u,&o,2);
04613     for(l=0;l<=2*m+1;l++)
04614       psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2));
04615 
04616     nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04617   }
04618 }
04619 
04620 #ifdef OMP_ASSERT
04621 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A                        \
04622 {                                                                             \
04623           assert(ar_x[2*k] >= min_u_a || k == M-1);                           \
04624           if (k > 0)                                                          \
04625             assert(ar_x[2*k-2] < min_u_a);                                    \
04626 }
04627 #else
04628 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
04629 #endif
04630 
04631 #ifdef OMP_ASSERT
04632 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B                        \
04633 {                                                                             \
04634           assert(ar_x[2*k] >= min_u_b || k == M-1);                           \
04635           if (k > 0)                                                          \
04636             assert(ar_x[2*k-2] < min_u_b);                                    \
04637 }
04638 #else
04639 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
04640 #endif
04641 
04642 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI                 \
04643             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g,               \
04644                 ths->psi+j*3*(2*m+2),                                         \
04645                 ths->psi+(j*3+1)*(2*m+2),                                     \
04646                 ths->psi+(j*3+2)*(2*m+2),                                     \
04647                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2,                       \
04648                 n0, n1, n2, m, my_u0, my_o0);
04649 
04650 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI              \
04651 {                                                                             \
04652             int u, o, l;                                                      \
04653             R psij_const[3*(2*m+2)];                                          \
04654             R fg_psij0 = ths->psi[2*j*3];                                     \
04655             R fg_psij1 = ths->psi[2*j*3+1];                                   \
04656             R fg_psij2 = K(1.0);                                              \
04657                                                                               \
04658             psij_const[0] = fg_psij0;                                         \
04659             for(l=1; l<=2*m+1; l++)                                           \
04660             {                                                                 \
04661               fg_psij2 *= fg_psij1;                                           \
04662               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];                  \
04663             }                                                                 \
04664                                                                               \
04665             fg_psij0 = ths->psi[2*(j*3+1)];                                   \
04666             fg_psij1 = ths->psi[2*(j*3+1)+1];                                 \
04667             fg_psij2 = K(1.0);                                                \
04668             psij_const[2*m+2] = fg_psij0;                                     \
04669             for(l=1; l<=2*m+1; l++)                                           \
04670             {                                                                 \
04671               fg_psij2 *= fg_psij1;                                           \
04672               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];      \
04673             }                                                                 \
04674                                                                               \
04675             fg_psij0 = ths->psi[2*(j*3+2)];                                   \
04676             fg_psij1 = ths->psi[2*(j*3+2)+1];                                 \
04677             fg_psij2 = K(1.0);                                                \
04678             psij_const[2*(2*m+2)] = fg_psij0;                                 \
04679             for(l=1; l<=2*m+1; l++)                                           \
04680             {                                                                 \
04681               fg_psij2 *= fg_psij1;                                           \
04682               psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
04683             }                                                                 \
04684                                                                               \
04685             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g,               \
04686                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2,           \
04687                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2,                       \
04688                 n0, n1, n2, m, my_u0, my_o0);                                 \
04689 }
04690 
04691 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI                  \
04692 {                                                                             \
04693             int u, o, l;                                                      \
04694             R psij_const[3*(2*m+2)];                                          \
04695             R fg_psij0, fg_psij1, fg_psij2;                                   \
04696                                                                               \
04697             nfft_uo(ths,j,&u,&o,0);                                           \
04698             fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0));                        \
04699             fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]);          \
04700             fg_psij2 = K(1.0);                                                \
04701             psij_const[0] = fg_psij0;                                         \
04702             for(l=1; l<=2*m+1; l++)                                           \
04703             {                                                                 \
04704               fg_psij2 *= fg_psij1;                                           \
04705               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];                  \
04706             }                                                                 \
04707                                                                               \
04708             nfft_uo(ths,j,&u,&o,1);                                           \
04709             fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1));                      \
04710             fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]);        \
04711             fg_psij2 = K(1.0);                                                \
04712             psij_const[2*m+2] = fg_psij0;                                     \
04713             for(l=1; l<=2*m+1; l++)                                           \
04714             {                                                                 \
04715               fg_psij2 *= fg_psij1;                                           \
04716               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];      \
04717             }                                                                 \
04718                                                                               \
04719             nfft_uo(ths,j,&u,&o,2);                                           \
04720             fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2));                      \
04721             fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]);        \
04722             fg_psij2 = K(1.0);                                                \
04723             psij_const[2*(2*m+2)] = fg_psij0;                                 \
04724             for(l=1; l<=2*m+1; l++)                                           \
04725             {                                                                 \
04726               fg_psij2 *= fg_psij1;                                           \
04727               psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
04728             }                                                                 \
04729                                                                               \
04730             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g,               \
04731                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2,           \
04732                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2,                       \
04733                 n0, n1, n2, m, my_u0, my_o0);                                 \
04734 }
04735 
04736 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI             \
04737 {                                                                             \
04738             int u, o, l;                                                      \
04739             R psij_const[3*(2*m+2)];                                          \
04740             int ip_u;                                                         \
04741             R ip_y, ip_w;                                                     \
04742                                                                               \
04743             nfft_uo(ths,j,&u,&o,0);                                           \
04744             ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s);                      \
04745             ip_u = LRINT(FLOOR(ip_y));                                        \
04746             ip_w = ip_y-ip_u;                                                 \
04747             for(l=0; l < 2*m+2; l++)                                          \
04748               psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +      \
04749                 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);                          \
04750                                                                               \
04751             nfft_uo(ths,j,&u,&o,1);                                           \
04752             ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s);                      \
04753             ip_u = LRINT(FLOOR(ip_y));                                        \
04754             ip_w = ip_y-ip_u;                                                 \
04755             for(l=0; l < 2*m+2; l++)                                          \
04756               psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04757                 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);                    \
04758                                                                               \
04759             nfft_uo(ths,j,&u,&o,2);                                           \
04760             ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s);                      \
04761             ip_u = LRINT(FLOOR(ip_y));                                        \
04762             ip_w = ip_y-ip_u;                                                 \
04763             for(l=0; l < 2*m+2; l++)                                          \
04764               psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04765                 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);                  \
04766                                                                               \
04767             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g,               \
04768                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2,           \
04769                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2,                       \
04770                 n0, n1, n2, m, my_u0, my_o0);                                 \
04771 }
04772 
04773 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI                  \
04774 {                                                                             \
04775             int u, o, l;                                                      \
04776             R psij_const[3*(2*m+2)];                                          \
04777                                                                               \
04778             nfft_uo(ths,j,&u,&o,0);                                           \
04779             for(l=0;l<=2*m+1;l++)                                             \
04780               psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0));             \
04781                                                                               \
04782             nfft_uo(ths,j,&u,&o,1);                                           \
04783             for(l=0;l<=2*m+1;l++)                                             \
04784               psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1));     \
04785                                                                               \
04786             nfft_uo(ths,j,&u,&o,2);                                           \
04787             for(l=0;l<=2*m+1;l++)                                             \
04788               psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
04789                                                                               \
04790             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g,               \
04791                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2,           \
04792                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2,                       \
04793                 n0, n1, n2, m, my_u0, my_o0);                                 \
04794 }
04795 
04796 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(whichone)                       \
04797 {                                                                             \
04798     if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)                         \
04799     {                                                                         \
04800       _Pragma("omp parallel private(k)")                                      \
04801       {                                                                       \
04802         int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;                 \
04803         int *ar_x = ths->index_x;                                             \
04804                                                                               \
04805         nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
04806             &min_u_b, &max_u_b, 3, ths->n, m);                                \
04807                                                                               \
04808         if (min_u_a != -1)                                                    \
04809         {                                                                     \
04810           k = index_x_binary_search(ar_x, M, min_u_a);                        \
04811                                                                               \
04812           MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A                      \
04813                                                                               \
04814           while (k < M)                                                       \
04815           {                                                                   \
04816             int u_prod = ar_x[2*k];                                           \
04817             int j = ar_x[2*k+1];                                              \
04818                                                                               \
04819             if (u_prod < min_u_a || u_prod > max_u_a)                         \
04820               break;                                                          \
04821                                                                               \
04822             MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone         \
04823                                                                               \
04824             k++;                                                              \
04825           }                                                                   \
04826         }                                                                     \
04827                                                                               \
04828         if (min_u_b != -1)                                                    \
04829         {                                                                     \
04830           int k = index_x_binary_search(ar_x, M, min_u_b);                    \
04831                                                                               \
04832           MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B                      \
04833                                                                               \
04834           while (k < M)                                                       \
04835           {                                                                   \
04836             int u_prod = ar_x[2*k];                                           \
04837             int j = ar_x[2*k+1];                                              \
04838                                                                               \
04839             if (u_prod < min_u_b || u_prod > max_u_b)                         \
04840               break;                                                          \
04841                                                                               \
04842             MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone         \
04843                                                                               \
04844             k++;                                                              \
04845           }                                                                   \
04846         }                                                                     \
04847       } /* omp parallel */                                                    \
04848       return;                                                                 \
04849     } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */                                    \
04850 }
04851 
04852 static void nfft_adjoint_3d_B(nfft_plan *ths)
04853 {
04854   int k;
04855   const int N0 = ths->N[0];
04856   const int n0 = ths->n[0];
04857   const int N1 = ths->N[1];
04858   const int n1 = ths->n[1];
04859   const int N2 = ths->N[2];
04860   const int n2 = ths->n[2];
04861   const int M = ths->M_total;
04862   const int m = ths->m;
04863 
04864   C* g = (C*) ths->g;
04865 
04866   memset(g,0,ths->n_total*sizeof(C));
04867 
04868   if(ths->nfft_flags & PRE_FULL_PSI)
04869   {
04870     nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
04871         3, ths->n, m, ths->nfft_flags, ths->index_x);
04872     return;
04873   } /* if(PRE_FULL_PSI) */
04874 
04875   if(ths->nfft_flags & PRE_PSI)
04876   {
04877 #ifdef _OPENMP
04878     MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
04879 #endif
04880 
04881     #pragma omp parallel for default(shared) private(k)
04882     for (k = 0; k < M; k++)
04883     {
04884       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04885 #ifdef _OPENMP
04886       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04887 #else
04888       nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04889 #endif
04890     }
04891     return;
04892   } /* if(PRE_PSI) */
04893 
04894   if(ths->nfft_flags & PRE_FG_PSI)
04895   {
04896     R fg_exp_l[3*(2*m+2)];
04897 
04898     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04899     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04900     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04901 
04902 #ifdef _OPENMP
04903     MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
04904 #endif
04905 
04906     #pragma omp parallel for default(shared) private(k)
04907     for (k = 0; k < M; k++)
04908     {
04909       R psij_const[3*(2*m+2)];
04910       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04911       int l;
04912       R fg_psij0 = ths->psi[2*j*3];
04913       R fg_psij1 = ths->psi[2*j*3+1];
04914       R fg_psij2 = K(1.0);
04915 
04916       psij_const[0] = fg_psij0;
04917       for(l=1; l<=2*m+1; l++)
04918       {
04919         fg_psij2 *= fg_psij1;
04920         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04921       }
04922 
04923       fg_psij0 = ths->psi[2*(j*3+1)];
04924       fg_psij1 = ths->psi[2*(j*3+1)+1];
04925       fg_psij2 = K(1.0);
04926       psij_const[2*m+2] = fg_psij0;
04927       for(l=1; l<=2*m+1; l++)
04928       {
04929         fg_psij2 *= fg_psij1;
04930         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04931       }
04932 
04933       fg_psij0 = ths->psi[2*(j*3+2)];
04934       fg_psij1 = ths->psi[2*(j*3+2)+1];
04935       fg_psij2 = K(1.0);
04936       psij_const[2*(2*m+2)] = fg_psij0;
04937       for(l=1; l<=2*m+1; l++)
04938       {
04939         fg_psij2 *= fg_psij1;
04940         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04941       }
04942 
04943 #ifdef _OPENMP
04944       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04945 #else
04946       nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04947 #endif
04948     }
04949 
04950     return;
04951   } /* if(PRE_FG_PSI) */
04952 
04953   if(ths->nfft_flags & FG_PSI)
04954   {
04955     R fg_exp_l[3*(2*m+2)];
04956 
04957     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04958     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04959     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04960 
04961     nfft_sort_nodes(ths);
04962 
04963 #ifdef _OPENMP
04964     MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
04965 #endif
04966 
04967     #pragma openmp parallel for default(shared) private(k)
04968     for (k = 0; k < M; k++)
04969     {
04970       int u,o,l;
04971       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04972       R psij_const[3*(2*m+2)];
04973       R fg_psij0, fg_psij1, fg_psij2;
04974 
04975       nfft_uo(ths,j,&u,&o,0);
04976       fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0));
04977       fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]);
04978       fg_psij2 = K(1.0);
04979       psij_const[0] = fg_psij0;
04980       for(l=1; l<=2*m+1; l++)
04981       {
04982         fg_psij2 *= fg_psij1;
04983         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04984       }
04985 
04986       nfft_uo(ths,j,&u,&o,1);
04987       fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1));
04988       fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]);
04989       fg_psij2 = K(1.0);
04990       psij_const[2*m+2] = fg_psij0;
04991       for(l=1; l<=2*m+1; l++)
04992       {
04993         fg_psij2 *= fg_psij1;
04994         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04995       }
04996 
04997       nfft_uo(ths,j,&u,&o,2);
04998       fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2));
04999       fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]);
05000       fg_psij2 = K(1.0);
05001       psij_const[2*(2*m+2)] = fg_psij0;
05002       for(l=1; l<=2*m+1; l++)
05003       {
05004         fg_psij2 *= fg_psij1;
05005         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
05006       }
05007 
05008 #ifdef _OPENMP
05009       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05010 #else
05011       nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05012 #endif
05013     }
05014 
05015     return;
05016   } /* if(FG_PSI) */
05017 
05018   if(ths->nfft_flags & PRE_LIN_PSI)
05019   {
05020     const int K = ths->K;
05021     const int ip_s = K / (m + 2);
05022 
05023     nfft_sort_nodes(ths);
05024 
05025 #ifdef _OPENMP
05026     MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
05027 #endif
05028 
05029     #pragma openmp parallel for default(shared) private(k)
05030     for (k = 0; k < M; k++)
05031     {
05032       int u,o,l;
05033       int ip_u;
05034       R ip_y, ip_w;
05035       int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
05036       R psij_const[3*(2*m+2)];
05037 
05038       nfft_uo(ths,j,&u,&o,0);
05039       ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s);
05040       ip_u = LRINT(FLOOR(ip_y));
05041       ip_w = ip_y-ip_u;
05042       for(l=0; l < 2*m+2; l++)
05043         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05044           ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
05045 
05046       nfft_uo(ths,j,&u,&o,1);
05047       ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s);
05048       ip_u = LRINT(FLOOR(ip_y));
05049       ip_w = ip_y-ip_u;
05050       for(l=0; l < 2*m+2; l++)
05051         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05052           ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
05053 
05054       nfft_uo(ths,j,&u,&o,2);
05055       ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s);
05056       ip_u = LRINT(FLOOR(ip_y));
05057       ip_w = ip_y-ip_u;
05058       for(l=0; l < 2*m+2; l++)
05059         psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05060           ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
05061 
05062 #ifdef _OPENMP
05063       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05064 #else
05065       nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05066 #endif
05067     }
05068     return;
05069   } /* if(PRE_LIN_PSI) */
05070 
05071   /* no precomputed psi at all */
05072   nfft_sort_nodes(ths);
05073 
05074 #ifdef _OPENMP
05075   MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
05076 #endif
05077 
05078   #pragma omp parallel for default(shared) private(k)
05079   for (k = 0; k < M; k++)
05080   {
05081     int u,o,l;
05082     R psij_const[3*(2*m+2)];
05083     int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
05084 
05085     nfft_uo(ths,j,&u,&o,0);
05086     for(l=0;l<=2*m+1;l++)
05087       psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0));
05088 
05089     nfft_uo(ths,j,&u,&o,1);
05090     for(l=0;l<=2*m+1;l++)
05091       psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1));
05092 
05093     nfft_uo(ths,j,&u,&o,2);
05094     for(l=0;l<=2*m+1;l++)
05095       psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2));
05096 
05097 #ifdef _OPENMP
05098     nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05099 #else
05100     nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05101 #endif
05102   }
05103 }
05104 
05105 
05106 void nfft_trafo_3d(nfft_plan *ths)
05107 {
05108   int k0,k1,k2,n0,n1,n2,N0,N1,N2;
05109   C *g_hat,*f_hat;
05110   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
05111   R ck01, ck02, ck11, ck12, ck21, ck22;
05112   C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
05113   C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
05114 
05115   ths->g_hat=ths->g1;
05116   ths->g=ths->g2;
05117 
05118   N0=ths->N[0];
05119   N1=ths->N[1];
05120   N2=ths->N[2];
05121   n0=ths->n[0];
05122   n1=ths->n[1];
05123   n2=ths->n[2];
05124 
05125   f_hat=(C*)ths->f_hat;
05126   g_hat=(C*)ths->g_hat;
05127 
05128   TIC(0)
05129 #ifdef _OPENMP
05130   #pragma omp parallel for default(shared) private(k0)
05131   for (k0 = 0; k0 < ths->n_total; k0++)
05132     ths->g_hat[k0] = 0.0;
05133 #else
05134   memset(ths->g_hat,0,ths->n_total*sizeof(C));
05135 #endif
05136 
05137   if(ths->nfft_flags & PRE_PHI_HUT)
05138     {
05139       c_phi_inv01=ths->c_phi_inv[0];
05140       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
05141 
05142       #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
05143       for(k0=0;k0<N0/2;k0++)
05144   {
05145     ck01=c_phi_inv01[k0];
05146     ck02=c_phi_inv02[k0];
05147     c_phi_inv11=ths->c_phi_inv[1];
05148     c_phi_inv12=&ths->c_phi_inv[1][N1/2];
05149 
05150     for(k1=0;k1<N1/2;k1++)
05151       {
05152         ck11=c_phi_inv11[k1];
05153         ck12=c_phi_inv12[k1];
05154         c_phi_inv21=ths->c_phi_inv[2];
05155         c_phi_inv22=&ths->c_phi_inv[2][N2/2];
05156 
05157         g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05158         f_hat111=f_hat + (k0*N1+k1)*N2;
05159         g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05160         f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
05161         g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
05162         f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
05163         g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
05164         f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
05165 
05166         g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
05167         f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
05168         g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
05169         f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
05170         g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
05171         f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
05172         g_hat222=g_hat + (k0*n1+k1)*n2;
05173         f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
05174 
05175         for(k2=0;k2<N2/2;k2++)
05176     {
05177       ck21=c_phi_inv21[k2];
05178       ck22=c_phi_inv22[k2];
05179 
05180       g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
05181       g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
05182       g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
05183       g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
05184 
05185       g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
05186       g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
05187       g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
05188       g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
05189     }
05190       }
05191   }
05192     }
05193   else
05194     #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
05195     for(k0=0;k0<N0/2;k0++)
05196       {
05197   ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
05198   ck02=K(1.0)/(PHI_HUT(k0,0));
05199   for(k1=0;k1<N1/2;k1++)
05200     {
05201       ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
05202       ck12=K(1.0)/(PHI_HUT(k1,1));
05203 
05204       for(k2=0;k2<N2/2;k2++)
05205         {
05206     ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
05207     ck22=K(1.0)/(PHI_HUT(k2,2));
05208 
05209     g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2]                  * ck01 * ck11 * ck21;
05210     g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2]           = f_hat[((N0/2+k0)*N1+k1)*N2+k2]           * ck02 * ck11 * ck21;
05211     g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2]         = f_hat[(k0*N1+N1/2+k1)*N2+k2]             * ck01 * ck12 * ck21;
05212     g_hat[(k0*n1+k1)*n2+n2-N2/2+k2]                   = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2]      * ck02 * ck12 * ck21;
05213 
05214     g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2]         = f_hat[(k0*N1+k1)*N2+N2/2+k2]             * ck01 * ck11 * ck22;
05215     g_hat[(k0*n1+n1-N1/2+k1)*n2+k2]                   = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2]      * ck02 * ck11 * ck22;
05216     g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2]                 = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2]        * ck01 * ck12 * ck22;
05217     g_hat[(k0*n1+k1)*n2+k2]                           = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
05218         }
05219     }
05220       }
05221 
05222   TOC(0)
05223 
05224   TIC_FFTW(1)
05225   fftw_execute(ths->my_fftw_plan1);
05226   TOC_FFTW(1);
05227 
05228   TIC(2);
05229   nfft_trafo_3d_B(ths);
05230   TOC(2);
05231 }
05232 
05233 void nfft_adjoint_3d(nfft_plan *ths)
05234 {
05235   int k0,k1,k2,n0,n1,n2,N0,N1,N2;
05236   C *g_hat,*f_hat;
05237   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
05238   R ck01, ck02, ck11, ck12, ck21, ck22;
05239   C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
05240   C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
05241 
05242   ths->g_hat=ths->g1;
05243   ths->g=ths->g2;
05244 
05245   N0=ths->N[0];
05246   N1=ths->N[1];
05247   N2=ths->N[2];
05248   n0=ths->n[0];
05249   n1=ths->n[1];
05250   n2=ths->n[2];
05251 
05252   f_hat=(C*)ths->f_hat;
05253   g_hat=(C*)ths->g_hat;
05254 
05255   TIC(2);
05256   nfft_adjoint_3d_B(ths);
05257   TOC(2);
05258 
05259   TIC_FFTW(1)
05260   fftw_execute(ths->my_fftw_plan2);
05261   TOC_FFTW(1);
05262 
05263   TIC(0)
05264   if(ths->nfft_flags & PRE_PHI_HUT)
05265     {
05266       c_phi_inv01=ths->c_phi_inv[0];
05267       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
05268 
05269       #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
05270       for(k0=0;k0<N0/2;k0++)
05271   {
05272     ck01=c_phi_inv01[k0];
05273     ck02=c_phi_inv02[k0];
05274     c_phi_inv11=ths->c_phi_inv[1];
05275     c_phi_inv12=&ths->c_phi_inv[1][N1/2];
05276 
05277     for(k1=0;k1<N1/2;k1++)
05278       {
05279         ck11=c_phi_inv11[k1];
05280         ck12=c_phi_inv12[k1];
05281         c_phi_inv21=ths->c_phi_inv[2];
05282         c_phi_inv22=&ths->c_phi_inv[2][N2/2];
05283 
05284         g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05285         f_hat111=f_hat + (k0*N1+k1)*N2;
05286         g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05287         f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
05288         g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
05289         f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
05290         g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
05291         f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
05292 
05293         g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
05294         f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
05295         g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
05296         f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
05297         g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
05298         f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
05299         g_hat222=g_hat + (k0*n1+k1)*n2;
05300         f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
05301 
05302         for(k2=0;k2<N2/2;k2++)
05303     {
05304       ck21=c_phi_inv21[k2];
05305       ck22=c_phi_inv22[k2];
05306 
05307       f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
05308       f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
05309       f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
05310       f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
05311 
05312       f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
05313       f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
05314       f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
05315       f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
05316     }
05317       }
05318   }
05319     }
05320   else
05321     #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
05322     for(k0=0;k0<N0/2;k0++)
05323       {
05324   ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
05325   ck02=K(1.0)/(PHI_HUT(k0,0));
05326   for(k1=0;k1<N1/2;k1++)
05327     {
05328       ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
05329       ck12=K(1.0)/(PHI_HUT(k1,1));
05330 
05331       for(k2=0;k2<N2/2;k2++)
05332         {
05333     ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
05334     ck22=K(1.0)/(PHI_HUT(k2,2));
05335 
05336     f_hat[(k0*N1+k1)*N2+k2]                  = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
05337     f_hat[((N0/2+k0)*N1+k1)*N2+k2]           = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2]           * ck02 * ck11 * ck21;
05338     f_hat[(k0*N1+N1/2+k1)*N2+k2]             = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2]         * ck01 * ck12 * ck21;
05339     f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2]      = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2]                   * ck02 * ck12 * ck21;
05340 
05341     f_hat[(k0*N1+k1)*N2+N2/2+k2]             = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2]         * ck01 * ck11 * ck22;
05342     f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2]      = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2]                   * ck02 * ck11 * ck22;
05343     f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2]        = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2]                 * ck01 * ck12 * ck22;
05344     f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2]                           * ck02 * ck12 * ck22;
05345         }
05346     }
05347       }
05348 
05349   TOC(0)
05350 }
05351 
05354 void nfft_trafo(nfft_plan *ths)
05355 {
05356   switch(ths->d)
05357     {
05358     case 1: nfft_trafo_1d(ths); break;
05359     case 2: nfft_trafo_2d(ths); break;
05360     case 3: nfft_trafo_3d(ths); break;
05361     default:
05362     /* use ths->my_fftw_plan1 */
05363     ths->g_hat=ths->g1;
05364     ths->g=ths->g2;
05365 
05369     TIC(0)
05370     nfft_D_A(ths);
05371     TOC(0)
05372 
05373     
05377     TIC_FFTW(1)
05378     fftw_execute(ths->my_fftw_plan1);
05379     TOC_FFTW(1)
05380 
05381     
05384     TIC(2)
05385     nfft_B_A(ths);
05386     TOC(2)
05387     }
05388 } /* nfft_trafo */
05389 
05390 void nfft_adjoint(nfft_plan *ths)
05391 {
05392   switch(ths->d)
05393     {
05394     case 1: nfft_adjoint_1d(ths); break;
05395     case 2: nfft_adjoint_2d(ths); break;
05396     case 3: nfft_adjoint_3d(ths); break;
05397     default:
05398       /* use ths->my_fftw_plan2 */
05399       ths->g_hat=ths->g1;
05400       ths->g=ths->g2;
05401       
05405       TIC(2)
05406       nfft_B_T(ths);
05407       TOC(2)
05408 
05413       TIC_FFTW(1)
05414       fftw_execute(ths->my_fftw_plan2);
05415       TOC_FFTW(1)
05416 
05420       TIC(0)
05421       nfft_D_T(ths);
05422       TOC(0)
05423     }
05424 } /* nfft_adjoint */
05425 
05426 
05429 static void nfft_precompute_phi_hut(nfft_plan *ths)
05430 {
05431   int ks[ths->d];                       
05432   int t;                                
05434   ths->c_phi_inv = (R**) nfft_malloc(ths->d*sizeof(R*));
05435 
05436   for(t=0; t<ths->d; t++)
05437     {
05438       ths->c_phi_inv[t]= (R*)nfft_malloc(ths->N[t]*sizeof(R));
05439       for(ks[t]=0; ks[t]<ths->N[t]; ks[t]++)
05440   ths->c_phi_inv[t][ks[t]]= K(1.0)/(PHI_HUT(ks[t]-ths->N[t]/2,t));
05441     }
05442 } /* nfft_phi_hut */
05443 
05449 void nfft_precompute_lin_psi(nfft_plan *ths)
05450 {
05451   int t;                                
05452   int j;                                
05453   R step;                          
05455   for (t=0; t<ths->d; t++)
05456     {
05457       step=((R)(ths->m+2))/(((R)ths->K)*ths->n[t]);
05458       for(j=0;j<=ths->K;j++)
05459   {
05460     ths->psi[(ths->K+1)*t + j] = PHI(j*step,t);
05461   } /* for(j) */
05462     } /* for(t) */
05463 }
05464 
05465 static void nfft_precompute_fg_psi(nfft_plan *ths)
05466 {
05467   int t;                                
05468   int u, o;                             
05470   nfft_sort_nodes(ths);
05471 
05472   for (t=0; t<ths->d; t++)
05473   {
05474     int j;
05475     #pragma omp parallel for default(shared) private(j,u,o)
05476     for (j = 0; j < ths->M_total; j++)
05477       {
05478   nfft_uo(ths,j,&u,&o,t);
05479 
05480         ths->psi[2*(j*ths->d+t)]=
05481             (PHI((ths->x[j*ths->d+t]-((R)u)/ths->n[t]),t));
05482 
05483         ths->psi[2*(j*ths->d+t)+1]=
05484             EXP(K(2.0)*(ths->n[t]*ths->x[j*ths->d+t] - u) / ths->b[t]);
05485       } /* for(j) */
05486   }
05487   /* for(t) */
05488 } /* nfft_precompute_fg_psi */
05489 
05490 void nfft_precompute_psi(nfft_plan *ths)
05491 {
05492   int t;                                
05493   int l;                                
05494   int lj;                               
05495   int u, o;                             
05497   nfft_sort_nodes(ths);
05498 
05499   for (t=0; t<ths->d; t++)
05500   {
05501     int j;
05502     #pragma omp parallel for default(shared) private(j,l,lj,u,o)
05503     for (j = 0; j < ths->M_total; j++)
05504       {
05505   nfft_uo(ths,j,&u,&o,t);
05506 
05507   for(l=u, lj=0; l <= o; l++, lj++)
05508     ths->psi[(j*ths->d+t)*(2*ths->m+2)+lj]=
05509       (PHI((ths->x[j*ths->d+t]-((R)l)/ths->n[t]),t));
05510       } /* for(j) */
05511   }
05512   /* for(t) */
05513 } /* nfft_precompute_psi */
05514 
05515 #ifdef _OPENMP
05516 static void nfft_precompute_full_psi_omp(nfft_plan *ths)
05517 {
05518   int j;                                
05519   int lprod;                            
05521   {
05522     int t;
05523     for(t=0,lprod = 1; t<ths->d; t++)
05524         lprod *= 2*ths->m+2;
05525   }
05526 
05527   #pragma omp parallel for default(shared) private(j)
05528   for(j=0; j<ths->M_total; j++)
05529     {
05530       int t,t2;                             
05531       int l_L;                              
05532       int l[ths->d];                        
05533       int lj[ths->d];                       
05534       int ll_plain[ths->d+1];               
05536       int u[ths->d], o[ths->d];             
05538       R phi_prod[ths->d+1];
05539       int ix = j*lprod;
05540 
05541       phi_prod[0]=1;
05542       ll_plain[0]=0;
05543 
05544       MACRO_init_uo_l_lj_t;
05545 
05546       for(l_L=0; l_L<lprod; l_L++, ix++)
05547       {
05548         MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
05549 
05550         ths->psi_index_g[ix]=ll_plain[ths->d];
05551         ths->psi[ix]=phi_prod[ths->d];
05552 
05553         MACRO_count_uo_l_lj_t;
05554       } /* for(l_L) */
05555 
05556       ths->psi_index_f[j]=lprod;
05557     } /* for(j) */
05558 }
05559 #endif
05560 
05561 void nfft_precompute_full_psi(nfft_plan *ths)
05562 {
05563 #ifdef _OPENMP
05564   nfft_sort_nodes(ths);
05565 
05566   nfft_precompute_full_psi_omp(ths);
05567 #else
05568   int t,t2;                             
05569   int j;                                
05570   int l_L;                              
05571   int l[ths->d];                        
05572   int lj[ths->d];                       
05573   int ll_plain[ths->d+1];               
05574   int lprod;                            
05575   int u[ths->d], o[ths->d];             
05577   R phi_prod[ths->d+1];
05578 
05579   int ix,ix_old;
05580 
05581   nfft_sort_nodes(ths);
05582 
05583   phi_prod[0]=1;
05584   ll_plain[0]=0;
05585 
05586   for(t=0,lprod = 1; t<ths->d; t++)
05587       lprod *= 2*ths->m+2;
05588 
05589   for(j=0,ix=0,ix_old=0; j<ths->M_total; j++)
05590     {
05591       MACRO_init_uo_l_lj_t;
05592 
05593       for(l_L=0; l_L<lprod; l_L++, ix++)
05594   {
05595     MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
05596 
05597     ths->psi_index_g[ix]=ll_plain[ths->d];
05598     ths->psi[ix]=phi_prod[ths->d];
05599 
05600     MACRO_count_uo_l_lj_t;
05601   } /* for(l_L) */
05602 
05603 
05604       ths->psi_index_f[j]=ix-ix_old;
05605       ix_old=ix;
05606     } /* for(j) */
05607 #endif
05608 }
05609 
05610 void nfft_precompute_one_psi(nfft_plan *ths)
05611 {
05612   if(ths->nfft_flags & PRE_LIN_PSI)
05613     nfft_precompute_lin_psi(ths);
05614   if(ths->nfft_flags & PRE_FG_PSI)
05615     nfft_precompute_fg_psi(ths);
05616   if(ths->nfft_flags & PRE_PSI)
05617     nfft_precompute_psi(ths);
05618   if(ths->nfft_flags & PRE_FULL_PSI)
05619     nfft_precompute_full_psi(ths);
05620 }
05621 
05622 
05623 static void nfft_init_help(nfft_plan *ths)
05624 {
05625   int t;                                
05626   int lprod;                            
05628   if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
05629     ths->nfft_flags |= NFFT_SORT_NODES;
05630 
05631   ths->N_total=nfft_prod_int(ths->N, ths->d);
05632   ths->n_total=nfft_prod_int(ths->n, ths->d);
05633 
05634   ths->sigma = (R*) nfft_malloc(ths->d*sizeof(R));
05635   for(t = 0;t < ths->d; t++)
05636     ths->sigma[t] = ((R)ths->n[t])/ths->N[t];
05637 
05638   WINDOW_HELP_INIT;
05639 
05640   if(ths->nfft_flags & MALLOC_X)
05641     ths->x = (R*)nfft_malloc(ths->d*ths->M_total*sizeof(R));
05642 
05643   if(ths->nfft_flags & MALLOC_F_HAT)
05644     ths->f_hat = (fftw_complex*)nfft_malloc(ths->N_total*sizeof(C));
05645 
05646   if(ths->nfft_flags & MALLOC_F)
05647     ths->f = (fftw_complex*)nfft_malloc(ths->M_total*sizeof(C));
05648 
05649   if(ths->nfft_flags & PRE_PHI_HUT)
05650     nfft_precompute_phi_hut(ths);
05651 
05652   if(ths->nfft_flags & PRE_LIN_PSI)
05653   {
05654       ths->K=(1U<< 10)*(ths->m+2);
05655       ths->psi = (R*) nfft_malloc((ths->K+1)*ths->d*sizeof(R));
05656   }
05657 
05658   if(ths->nfft_flags & PRE_FG_PSI)
05659     ths->psi = (R*) nfft_malloc(ths->M_total*ths->d*2*sizeof(R));
05660 
05661   if(ths->nfft_flags & PRE_PSI)
05662     ths->psi = (R*) nfft_malloc(ths->M_total*ths->d*
05663              (2*ths->m+2)*sizeof(R));
05664 
05665   if(ths->nfft_flags & PRE_FULL_PSI)
05666   {
05667       for(t=0,lprod = 1; t<ths->d; t++)
05668     lprod *= 2*ths->m+2;
05669 
05670       ths->psi = (R*) nfft_malloc(ths->M_total*lprod*sizeof(R));
05671 
05672       ths->psi_index_f = (int*) nfft_malloc(ths->M_total*sizeof(int));
05673       ths->psi_index_g = (int*) nfft_malloc(ths->M_total*lprod*sizeof(int));
05674   }
05675 
05676   if(ths->nfft_flags & FFTW_INIT)
05677   {
05678 #ifdef _OPENMP
05679     int nthreads = nfft_get_omp_num_threads();
05680 #endif
05681 
05682     ths->g1=(fftw_complex*)nfft_malloc(ths->n_total*sizeof(C));
05683 
05684     if(ths->nfft_flags & FFT_OUT_OF_PLACE)
05685       ths->g2 = (fftw_complex*) nfft_malloc(ths->n_total*sizeof(C));
05686     else
05687       ths->g2 = ths->g1;
05688 
05689 #ifdef _OPENMP
05690 #pragma omp critical (nfft_omp_critical_fftw_plan)
05691 {
05692     fftw_plan_with_nthreads(nthreads);
05693 #endif
05694     ths->my_fftw_plan1 = fftw_plan_dft(ths->d, ths->n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
05695     ths->my_fftw_plan2 = fftw_plan_dft(ths->d, ths->n, ths->g2, ths->g1,
05696       FFTW_BACKWARD, ths->fftw_flags);
05697 #ifdef _OPENMP
05698 }
05699 #endif
05700   }
05701 
05702   if(ths->nfft_flags & NFFT_SORT_NODES)
05703     ths->index_x = (int*) nfft_malloc(sizeof(int)*2*ths->M_total);
05704   else
05705     ths->index_x = NULL;
05706 
05707   ths->mv_trafo = (void (*) (void* ))nfft_trafo;
05708   ths->mv_adjoint = (void (*) (void* ))nfft_adjoint;
05709 }
05710 
05711 void nfft_init(nfft_plan *ths, int d, int *N, int M_total)
05712 {
05713   int t;                                
05715   ths->d = d;
05716 
05717   ths->N=(int*) nfft_malloc(d*sizeof(int));
05718 
05719   for (t = 0;t < d; t++)
05720     ths->N[t] = N[t];
05721 
05722   ths->M_total = M_total;
05723 
05724   ths->n = (int*) nfft_malloc(d*sizeof(int));
05725   for (t = 0;t < d; t++)
05726     ths->n[t] = 2*X(next_power_of_2)(ths->N[t]);
05727 
05728   ths->m = WINDOW_HELP_ESTIMATE_m;
05729 
05730   if (d > 1)
05731   {
05732 #ifdef _OPENMP
05733     ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05734                       FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
05735           NFFT_OMP_BLOCKWISE_ADJOINT;
05736 #else
05737     ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05738                       FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
05739 #endif
05740   }
05741   else
05742     ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05743                       FFTW_INIT | FFT_OUT_OF_PLACE;
05744 
05745   ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
05746 
05747   nfft_init_help(ths);
05748 }
05749 
05750 void nfft_init_guru(nfft_plan *ths, int d, int *N, int M_total, int *n,
05751       int m, unsigned nfft_flags, unsigned fftw_flags)
05752 {
05753   int t;                                
05755   ths->d =d;
05756   ths->N= (int*) nfft_malloc(ths->d*sizeof(int));
05757   for(t=0; t<d; t++)
05758     ths->N[t]= N[t];
05759   ths->M_total= M_total;
05760   ths->n= (int*) nfft_malloc(ths->d*sizeof(int));
05761   for(t=0; t<d; t++)
05762     ths->n[t]= n[t];
05763   ths->m= m;
05764   ths->nfft_flags= nfft_flags;
05765   ths->fftw_flags= fftw_flags;
05766 
05767   nfft_init_help(ths);
05768 }
05769 
05770 void nfft_init_1d(nfft_plan *ths, int N1, int M_total)
05771 {
05772   int N[1];
05773 
05774   N[0]=N1;
05775 
05776   nfft_init(ths, 1, N, M_total);
05777 }
05778 
05779 void nfft_init_2d(nfft_plan *ths, int N1, int N2, int M_total)
05780 {
05781   int N[2];
05782 
05783   N[0]=N1;
05784   N[1]=N2;
05785   nfft_init(ths,2,N,M_total);
05786 }
05787 
05788 void nfft_init_3d(nfft_plan *ths, int N1, int N2, int N3, int M_total)
05789 {
05790   int N[3];
05791 
05792   N[0]=N1;
05793   N[1]=N2;
05794   N[2]=N3;
05795   nfft_init(ths,3,N,M_total);
05796 }
05797 
05798 const char* nfft_check(nfft_plan *ths)
05799 {
05800   int j;
05801 
05802   for(j=0;j<ths->M_total*ths->d;j++)
05803     if((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
05804       return "ths->x out of range [-0.5,0.5)";
05805 
05806   for(j=0;j<ths->d;j++)
05807   {
05808     if(ths->sigma[j]<=1)
05809       return "nfft_check: oversampling factor too small";
05810 
05811     if(ths->N[j]<=ths->m)
05812       return "Polynomial degree N is smaller than cut-off m";
05813 
05814     if(ths->N[j]%2==1)
05815       return "polynomial degree N has to be even";
05816   }
05817   return 0;
05818 }
05819 
05820 void nfft_finalize(nfft_plan *ths)
05821 {
05822   int t; /* index over dimensions */
05823 
05824   if(ths->nfft_flags & NFFT_SORT_NODES)
05825     nfft_free(ths->index_x);
05826 
05827   if(ths->nfft_flags & FFTW_INIT)
05828   {
05829 #pragma omp critical (nfft_omp_critical_fftw_plan)
05830 {
05831     fftw_destroy_plan(ths->my_fftw_plan2);
05832     fftw_destroy_plan(ths->my_fftw_plan1);
05833 }
05834 
05835     if(ths->nfft_flags & FFT_OUT_OF_PLACE)
05836       nfft_free(ths->g2);
05837 
05838     nfft_free(ths->g1);
05839   }
05840 
05841   if(ths->nfft_flags & PRE_FULL_PSI)
05842     {
05843       nfft_free(ths->psi_index_g);
05844       nfft_free(ths->psi_index_f);
05845       nfft_free(ths->psi);
05846     }
05847 
05848   if(ths->nfft_flags & PRE_PSI)
05849     nfft_free(ths->psi);
05850 
05851   if(ths->nfft_flags & PRE_FG_PSI)
05852     nfft_free(ths->psi);
05853 
05854   if(ths->nfft_flags & PRE_LIN_PSI)
05855     nfft_free(ths->psi);
05856 
05857   if(ths->nfft_flags & PRE_PHI_HUT)
05858     {
05859       for(t=0; t<ths->d; t++)
05860         nfft_free(ths->c_phi_inv[t]);
05861       nfft_free(ths->c_phi_inv);
05862     }
05863 
05864   if(ths->nfft_flags & MALLOC_F)
05865     nfft_free(ths->f);
05866 
05867   if(ths->nfft_flags & MALLOC_F_HAT)
05868     nfft_free(ths->f_hat);
05869 
05870   if(ths->nfft_flags & MALLOC_X)
05871   nfft_free(ths->x);
05872 
05873   WINDOW_HELP_FINALIZE;
05874 
05875   nfft_free(ths->sigma);
05876   nfft_free(ths->n);
05877   nfft_free(ths->N);
05878 }

Generated on Fri Oct 12 2012 by Doxygen 1.8.0-20120409