00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00027 #include "config.h"
00028
00029 #include <stdlib.h>
00030 #include <math.h>
00031 #ifdef HAVE_COMPLEX_H
00032 #include <complex.h>
00033 #endif
00034
00035 #include "nfft3util.h"
00036 #include "nfft3.h"
00037 #include "fastsum.h"
00038 #include "infft.h"
00039
00041 #include "kernels.h"
00042
00049 int max_i(int a, int b)
00050 {
00051 return a >= b ? a : b;
00052 }
00053
00055 double fak(int n)
00056 {
00057 if (n<=1) return 1.0;
00058 else return (double)n*fak(n-1);
00059 }
00060
00062 double binom(int n, int m)
00063 {
00064 return fak(n)/fak(m)/fak(n-m);
00065 }
00066
00068 double BasisPoly(int m, int r, double xx)
00069 {
00070 int k;
00071 double sum=0.0;
00072
00073 for (k=0; k<=m-r; k++) {
00074 sum+=binom(m+k,k)*pow((xx+1.0)/2.0,(double)k);
00075 }
00076 return sum*pow((xx+1.0),(double)r)*pow(1.0-xx,(double)(m+1))/(1<<(m+1))/fak(r);
00077 }
00078
00080 double _Complex regkern(kernel k, double xx, int p, const double *param, double a, double b)
00081 {
00082 int r;
00083 double _Complex sum=0.0;
00084
00085 if (xx<-0.5)
00086 xx=-0.5;
00087 if (xx>0.5)
00088 xx=0.5;
00089 if ((xx>=-0.5+b && xx<=-a) || (xx>=a && xx<=0.5-b)) {
00090 return k(xx,0,param);
00091 }
00092 else if (xx<-0.5+b) {
00093 sum=(k(-0.5,0,param)+k(0.5,0,param))/2.0
00094 *BasisPoly(p-1,0,2.0*xx/b+(1.0-b)/b);
00095 for (r=0; r<p; r++) {
00096 sum+=pow(-b/2.0,(double)r)
00097 *k(-0.5+b,r,param)
00098 *BasisPoly(p-1,r,-2.0*xx/b+(b-1.0)/b);
00099 }
00100 return sum;
00101 }
00102 else if ((xx>-a) && (xx<a)) {
00103 for (r=0; r<p; r++) {
00104 sum+=pow(a,(double)r)
00105 *( k(-a,r,param)*BasisPoly(p-1,r,xx/a)
00106 +k( a,r,param)*BasisPoly(p-1,r,-xx/a)*(r & 1 ? -1 : 1));
00107 }
00108 return sum;
00109 }
00110 else if (xx>0.5-b) {
00111 sum=(k(-0.5,0,param)+k(0.5,0,param))/2.0
00112 *BasisPoly(p-1,0,-2.0*xx/b+(1.0-b)/b);
00113 for (r=0; r<p; r++) {
00114 sum+=pow(b/2.0,(double)r)
00115 *k(0.5-b,r,param)
00116 *BasisPoly(p-1,r,2.0*xx/b-(1.0-b)/b);
00117 }
00118 return sum;
00119 }
00120 return k(xx,0,param);
00121 }
00122
00126 double _Complex regkern1(kernel k, double xx, int p, const double *param, double a, double b)
00127 {
00128 int r;
00129 double _Complex sum=0.0;
00130
00131 if (xx<-0.5)
00132 xx=-0.5;
00133 if (xx>0.5)
00134 xx=0.5;
00135 if ((xx>=-0.5+b && xx<=-a) || (xx>=a && xx<=0.5-b))
00136 {
00137 return k(xx,0,param);
00138 }
00139 else if ((xx>-a) && (xx<a))
00140 {
00141 for (r=0; r<p; r++) {
00142 sum+=pow(a,(double)r)
00143 *( k(-a,r,param)*BasisPoly(p-1,r,xx/a)
00144 +k( a,r,param)*BasisPoly(p-1,r,-xx/a)*(r & 1 ? -1 : 1));
00145 }
00146 return sum;
00147 }
00148 else if (xx<-0.5+b)
00149 {
00150 for (r=0; r<p; r++) {
00151 sum+=pow(b,(double)r)
00152 *( k(0.5-b,r,param)*BasisPoly(p-1,r,(xx+0.5)/b)
00153 +k(-0.5+b,r,param)*BasisPoly(p-1,r,-(xx+0.5)/b)*(r & 1 ? -1 : 1));
00154 }
00155 return sum;
00156 }
00157 else if (xx>0.5-b)
00158 {
00159 for (r=0; r<p; r++) {
00160 sum+=pow(b,(double)r)
00161 *( k(0.5-b,r,param)*BasisPoly(p-1,r,(xx-0.5)/b)
00162 +k(-0.5+b,r,param)*BasisPoly(p-1,r,-(xx-0.5)/b)*(r & 1 ? -1 : 1));
00163 }
00164 return sum;
00165 }
00166 return k(xx,0,param);
00167 }
00168
00170 double _Complex regkern2(kernel k, double xx, int p, const double *param, double a, double b)
00171 {
00172 int r;
00173 double _Complex sum=0.0;
00174
00175 xx=fabs(xx);
00176
00177 if (xx>0.5) {
00178 for (r=0; r<p; r++) {
00179 sum+=pow(b,(double)r)*k(0.5-b,r,param)
00180 *(BasisPoly(p-1,r,0)+BasisPoly(p-1,r,0));
00181 }
00182 return sum;
00183 }
00184 else if ((a<=xx) && (xx<=0.5-b)) {
00185 return k(xx,0,param);
00186 }
00187 else if (xx<a) {
00188 for (r=0; r<p; r++) {
00189 sum+=pow(-a,(double)r)*k(a,r,param)
00190 *(BasisPoly(p-1,r,xx/a)+BasisPoly(p-1,r,-xx/a));
00191 }
00192 return sum;
00193 }
00194 else if ((0.5-b<xx) && (xx<=0.5)) {
00195 for (r=0; r<p; r++) {
00196 sum+=pow(b,(double)r)*k(0.5-b,r,param)
00197 *(BasisPoly(p-1,r,(xx-0.5)/b)+BasisPoly(p-1,r,-(xx-0.5)/b));
00198 }
00199 return sum;
00200 }
00201 return 0.0;
00202 }
00203
00207 double _Complex regkern3(kernel k, double xx, int p, const double *param, double a, double b)
00208 {
00209 int r;
00210 double _Complex sum=0.0;
00211
00212 xx=fabs(xx);
00213
00214 if (xx>=0.5) {
00215
00216 xx=0.5;
00217 }
00218
00219 if ((a<=xx) && (xx<=0.5-b)) {
00220 return k(xx,0,param);
00221 }
00222 else if (xx<a) {
00223 for (r=0; r<p; r++) {
00224 sum+=pow(-a,(double)r)*k(a,r,param)
00225 *(BasisPoly(p-1,r,xx/a)+BasisPoly(p-1,r,-xx/a));
00226 }
00227
00228 return sum;
00229 }
00230 else if ((0.5-b<xx) && (xx<=0.5)) {
00231 sum=k(0.5,0,param)*BasisPoly(p-1,0,-2.0*xx/b+(1.0-b)/b);
00232
00233 for (r=0; r<p; r++) {
00234 sum+=pow(b/2.0,(double)r)
00235 *k(0.5-b,r,param)
00236 *BasisPoly(p-1,r,2.0*xx/b-(1.0-b)/b);
00237 }
00238 return sum;
00239 }
00240 return 0.0;
00241 }
00242
00244 double _Complex linintkern(const double x, const double _Complex *Add,
00245 const int Ad, const double a)
00246 {
00247 double c,c1,c3;
00248 int r;
00249 double _Complex f1,f2;
00250
00251 c=x*Ad/a;
00252 r=c; r=abs(r);
00253 f1=Add[r];f2=Add[r+1];
00254 c=fabs(c);
00255 c1=c-r;
00256 c3=c1-1.0;
00257 return (-f1*c3+f2*c1);
00258 }
00259
00260 double _Complex quadrintkern(const double x, const double _Complex *Add,
00261 const int Ad, const double a)
00262 {
00263 double c,c1,c2,c3;
00264 int r;
00265 double _Complex f0,f1,f2;
00266
00267 c=x*Ad/a;
00268 r=c; r=abs(r);
00269 if (r==0) {f0=Add[r+1];f1=Add[r];f2=Add[r+1];}
00270 else { f0=Add[r-1];f1=Add[r];f2=Add[r+1];}
00271 c=fabs(c);
00272 c1=c-r;
00273 c2=c1+1.0;
00274 c3=c1-1.0;
00275 return (f0*c1*c3/2.0-f1*c2*c3+f2*c2*c1/2.0);
00276 }
00277
00279 double _Complex kubintkern(const double x, const double _Complex *Add,
00280 const int Ad, const double a)
00281 {
00282 double c,c1,c2,c3,c4;
00283 int r;
00284 double _Complex f0,f1,f2,f3;
00285 c=x*Ad/a;
00286 r=c; r=abs(r);
00287 if (r==0) {f0=Add[r+1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
00288 else { f0=Add[r-1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
00289 c=fabs(c);
00290 c1=c-r;
00291 c2=c1+1.0;
00292 c3=c1-1.0;
00293 c4=c1-2.0;
00294
00295
00296 return(-f0*c1*c3*c4/6.0+f1*c2*c3*c4/2.0-f2*c2*c1*c4/2.0+f3*c2*c1*c3/6.0);
00297 }
00298
00300 double _Complex kubintkern1(const double x, const double _Complex *Add,
00301 const int Ad, const double a)
00302 {
00303 double c,c1,c2,c3,c4;
00304 int r;
00305 double _Complex f0,f1,f2,f3;
00306 Add+=2;
00307 c=(x+a)*Ad/2/a;
00308 r=c; r=abs(r);
00309
00310
00311 { f0=Add[r-1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
00312 c=fabs(c);
00313 c1=c-r;
00314 c2=c1+1.0;
00315 c3=c1-1.0;
00316 c4=c1-2.0;
00317
00318
00319 return(-f0*c1*c3*c4/6.0+f1*c2*c3*c4/2.0-f2*c2*c1*c4/2.0+f3*c2*c1*c3/6.0);
00320 }
00321
00323 void quicksort(int d, int t, double *x, double _Complex *alpha, int N)
00324 {
00325 int lpos=0;
00326 int rpos=N-1;
00327
00328 double pivot=x[(N/2)*d+t];
00329
00330 int k;
00331 double temp1;
00332 double _Complex temp2;
00333
00334 while (lpos<=rpos)
00335 {
00336 while (x[lpos*d+t]<pivot)
00337 lpos++;
00338 while (x[rpos*d+t]>pivot)
00339 rpos--;
00340 if (lpos<=rpos)
00341 {
00342 for (k=0; k<d; k++)
00343 {
00344 temp1=x[lpos*d+k];
00345 x[lpos*d+k]=x[rpos*d+k];
00346 x[rpos*d+k]=temp1;
00347 }
00348 temp2=alpha[lpos];
00349 alpha[lpos]=alpha[rpos];
00350 alpha[rpos]=temp2;
00351
00352 lpos++;
00353 rpos--;
00354 }
00355 }
00356 if (0<rpos)
00357 quicksort(d,t,x,alpha,rpos+1);
00358 if (lpos<N-1)
00359 quicksort(d,t,x+lpos*d,alpha+lpos,N-lpos);
00360 }
00361
00363 static void BuildBox(fastsum_plan *ths)
00364 {
00365 int t, l;
00366 int *box_index;
00367 double val[ths->d];
00368
00369 box_index = (int *) malloc(ths->box_count * sizeof(int));
00370 for (t=0; t < ths->box_count; t++)
00371 box_index[t] = 0;
00372
00373 for (l=0; l < ths->N_total; l++)
00374 {
00375 int ind = 0;
00376 for (t=0; t < ths->d; t++)
00377 {
00378 val[t] = ths->x[ths->d * l + t] + 0.25 - ths->eps_B/2.0;
00379 ind *= ths->box_count_per_dim;
00380 ind += (int) (val[t] / ths->eps_I);
00381 }
00382 box_index[ind]++;
00383 }
00384
00385 ths->box_offset[0] = 0;
00386 for (t=1; t<=ths->box_count; t++)
00387 {
00388 ths->box_offset[t] = ths->box_offset[t-1] + box_index[t-1];
00389 box_index[t-1] = ths->box_offset[t-1];
00390 }
00391
00392 for (l=0; l < ths->N_total; l++)
00393 {
00394 int ind = 0;
00395 for (t=0; t < ths->d; t++)
00396 {
00397 val[t] = ths->x[ths->d * l + t] + 0.25 - ths->eps_B/2.0;
00398 ind *= ths->box_count_per_dim;
00399 ind += (int) (val[t] / ths->eps_I);
00400 }
00401
00402 ths->box_alpha[box_index[ind]] = ths->alpha[l];
00403
00404 for (t=0; t < ths->d; t++)
00405 {
00406 ths->box_x[ths->d * box_index[ind] + t] = ths->x[ths->d * l + t];
00407 }
00408 box_index[ind]++;
00409 }
00410 free(box_index);
00411 }
00412
00414 static inline double _Complex calc_SearchBox(int d, double *y, double *x, double _Complex *alpha, int start, int end_lt, const double _Complex *Add, const int Ad, int p, double a, const kernel k, const double *param, const unsigned flags)
00415 {
00416 double _Complex result = 0.0;
00417
00418 int m, l;
00419 double r;
00420
00421 for (m = start; m < end_lt; m++)
00422 {
00423 if (d==1)
00424 {
00425 r = y[0]-x[m];
00426 }
00427 else
00428 {
00429 r=0.0;
00430 for (l=0; l<d; l++)
00431 r+=(y[l]-x[m*d+l])*(y[l]-x[m*d+l]);
00432 r=sqrt(r);
00433 }
00434 if (fabs(r)<a)
00435 {
00436 result += alpha[m]*k(r,0,param);
00437 if (d==1)
00438 {
00439 if (flags & EXACT_NEARFIELD)
00440 result -= alpha[m]*regkern1(k,r,p,param,a,1.0/16.0);
00441 else
00442 result -= alpha[m]*kubintkern1(r,Add,Ad,a);
00443 }
00444 else
00445 {
00446 if (flags & EXACT_NEARFIELD)
00447 result -= alpha[m]*regkern(k,r,p,param,a,1.0/16.0);
00448 else
00449 #if defined(NF_KUB)
00450 result -= alpha[m]*kubintkern(r,Add,Ad,a);
00451 #elif defined(NF_QUADR)
00452 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
00453 #elif defined(NF_LIN)
00454 result -= alpha[m]*linintkern(r,Add,Ad,a);
00455 #else
00456 #error define interpolation method
00457 #endif
00458 }
00459 }
00460 }
00461 return result;
00462 }
00463
00465 static double _Complex SearchBox(double *y, fastsum_plan *ths)
00466 {
00467 double _Complex val = 0.0;
00468 int t, l;
00469 int y_multiind[ths->d];
00470 int multiindex[ths->d];
00471 int y_ind;
00472
00473 for (t=0; t < ths->d; t++)
00474 {
00475 y_multiind[t] = ((y[t] + 0.25 - ths->eps_B/2.0) / ths->eps_I);
00476 }
00477
00478 if (ths->d==1)
00479 {
00480 for (y_ind = max_i(0, y_multiind[0]-1); y_ind < ths->box_count_per_dim && y_ind <= y_multiind[0]+1; y_ind++){
00481 val += calc_SearchBox(ths->d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->Add, ths->Ad, ths->p, ths->eps_I, ths->k, ths->kernel_param, ths->flags);
00482 }
00483 }
00484 else if (ths->d==2)
00485 {
00486 for (multiindex[0] = max_i(0, y_multiind[0]-1); multiindex[0] < ths->box_count_per_dim && multiindex[0] <= y_multiind[0]+1; multiindex[0]++)
00487 for (multiindex[1] = max_i(0, y_multiind[1]-1); multiindex[1] < ths->box_count_per_dim && multiindex[1] <= y_multiind[1]+1; multiindex[1]++)
00488 {
00489 y_ind = (ths->box_count_per_dim * multiindex[0]) + multiindex[1];
00490 val += calc_SearchBox(ths->d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->Add, ths->Ad, ths->p, ths->eps_I, ths->k, ths->kernel_param, ths->flags);
00491 }
00492 }
00493 else if(ths->d==3)
00494 {
00495 for (multiindex[0] = max_i(0, y_multiind[0]-1); multiindex[0] < ths->box_count_per_dim && multiindex[0] <= y_multiind[0]+1; multiindex[0]++)
00496 for (multiindex[1] = max_i(0, y_multiind[1]-1); multiindex[1] < ths->box_count_per_dim && multiindex[1] <= y_multiind[1]+1; multiindex[1]++)
00497 for (multiindex[2] = max_i(0, y_multiind[2]-1); multiindex[2] < ths->box_count_per_dim && multiindex[2] <= y_multiind[2]+1; multiindex[2]++)
00498 {
00499 y_ind = ((ths->box_count_per_dim * multiindex[0]) + multiindex[1]) * ths->box_count_per_dim + multiindex[2];
00500 val += calc_SearchBox(ths->d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->Add, ths->Ad, ths->p, ths->eps_I, ths->k, ths->kernel_param, ths->flags);
00501 }
00502 }
00503 else {
00504 exit(-1);
00505 }
00506 return val;
00507 }
00508
00510 void BuildTree(int d, int t, double *x, double _Complex *alpha, int N)
00511 {
00512 if (N>1)
00513 {
00514 int m=N/2;
00515
00516 quicksort(d,t,x,alpha,N);
00517
00518 BuildTree(d, (t+1)%d, x, alpha, m);
00519 BuildTree(d, (t+1)%d, x+(m+1)*d, alpha+(m+1), N-m-1);
00520 }
00521 }
00522
00524 double _Complex SearchTree(const int d, const int t, const double *x,
00525 const double _Complex *alpha, const double *xmin, const double *xmax,
00526 const int N, const kernel k, const double *param, const int Ad,
00527 const double _Complex *Add, const int p, const unsigned flags)
00528 {
00529 int m=N/2;
00530 double Min=xmin[t], Max=xmax[t], Median=x[m*d+t];
00531 double a=fabs(Max-Min)/2;
00532 int l;
00533 int E=0;
00534 double r;
00535
00536 if (N==0)
00537 return 0.0;
00538 if (Min>Median)
00539 return SearchTree(d,(t+1)%d,x+(m+1)*d,alpha+(m+1),xmin,xmax,N-m-1,k,param,Ad,Add,p,flags);
00540 else if (Max<Median)
00541 return SearchTree(d,(t+1)%d,x,alpha,xmin,xmax,m,k,param,Ad,Add,p,flags);
00542 else
00543 {
00544 double _Complex result = 0.0;
00545 E=0;
00546
00547 for (l=0; l<d; l++)
00548 {
00549 if ( x[m*d+l]>xmin[l] && x[m*d+l]<xmax[l] )
00550 E++;
00551 }
00552
00553 if (E==d)
00554 {
00555 if (d==1)
00556 {
00557 r = xmin[0]+a-x[m];
00558 }
00559 else
00560 {
00561 r=0.0;
00562 for (l=0; l<d; l++)
00563 r+=(xmin[l]+a-x[m*d+l])*(xmin[l]+a-x[m*d+l]);
00564 r=sqrt(r);
00565 }
00566 if (fabs(r)<a)
00567 {
00568 result += alpha[m]*k(r,0,param);
00569 if (d==1)
00570 {
00571 if (flags & EXACT_NEARFIELD)
00572 result -= alpha[m]*regkern1(k,r,p,param,a,1.0/16.0);
00573 else
00574 result -= alpha[m]*kubintkern1(r,Add,Ad,a);
00575 }
00576 else
00577 {
00578 if (flags & EXACT_NEARFIELD)
00579 result -= alpha[m]*regkern(k,r,p,param,a,1.0/16.0);
00580 else
00581 #if defined(NF_KUB)
00582 result -= alpha[m]*kubintkern(r,Add,Ad,a);
00583 #elif defined(NF_QUADR)
00584 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
00585 #elif defined(NF_LIN)
00586 result -= alpha[m]*linintkern(r,Add,Ad,a);
00587 #else
00588 #error define interpolation method
00589 #endif
00590 }
00591 }
00592 }
00593 result += SearchTree(d,(t+1)%d,x+(m+1)*d,alpha+(m+1),xmin,xmax,N-m-1,k,param,Ad,Add,p,flags)
00594 + SearchTree(d,(t+1)%d,x,alpha,xmin,xmax,m,k,param,Ad,Add,p,flags);
00595 return result;
00596 }
00597 }
00598
00600 void fastsum_init_guru(fastsum_plan *ths, int d, int N_total, int M_total, kernel k, double *param, unsigned flags, int nn, int m, int p, double eps_I, double eps_B)
00601 {
00602 int t;
00603 int N[d], n[d];
00604 int n_total;
00605 int sort_flags_trafo = 0;
00606 int sort_flags_adjoint = 0;
00607 #ifdef _OPENMP
00608 int nthreads = nfft_get_omp_num_threads();
00609 #endif
00610
00611 if (d > 1)
00612 {
00613 sort_flags_trafo = NFFT_SORT_NODES;
00614 #ifdef _OPENMP
00615 sort_flags_adjoint = NFFT_SORT_NODES | NFFT_OMP_BLOCKWISE_ADJOINT;
00616 #else
00617 sort_flags_adjoint = NFFT_SORT_NODES;
00618 #endif
00619 }
00620
00621 ths->d = d;
00622
00623 ths->N_total = N_total;
00624 ths->M_total = M_total;
00625
00626 ths->x = (double *)nfft_malloc(d*N_total*(sizeof(double)));
00627 ths->alpha = (double _Complex *)nfft_malloc(N_total*(sizeof(double _Complex)));
00628
00629 ths->y = (double *)nfft_malloc(d*M_total*(sizeof(double)));
00630 ths->f = (double _Complex *)nfft_malloc(M_total*(sizeof(double _Complex)));
00631
00632 ths->k = k;
00633 ths->kernel_param = param;
00634
00635 ths->flags = flags;
00636
00637 ths->p = p;
00638 ths->eps_I = eps_I;
00639 ths->eps_B = eps_B;
00642 if (!(ths->flags & EXACT_NEARFIELD))
00643 {
00644 if (ths->d==1)
00645 {
00646 ths->Ad = 4*(ths->p)*(ths->p);
00647 ths->Add = (double _Complex *)nfft_malloc((ths->Ad+5)*(sizeof(double _Complex)));
00648 }
00649 else
00650 {
00651 if (ths->k == one_over_x)
00652 {
00653 double delta = 1e-8;
00654 switch(p)
00655 {
00656 case 2: delta = 1e-3;
00657 break;
00658 case 3: delta = 1e-4;
00659 break;
00660 case 4: delta = 1e-5;
00661 break;
00662 case 5: delta = 1e-6;
00663 break;
00664 case 6: delta = 1e-6;
00665 break;
00666 case 7: delta = 1e-7;
00667 break;
00668 default: delta = 1e-8;
00669 }
00670
00671 #if defined(NF_KUB)
00672 ths->Ad = max_i(10, (int) ceil(1.4/pow(delta,1.0/4.0)));
00673 ths->Add = (double _Complex *)nfft_malloc((ths->Ad+3)*(sizeof(double _Complex)));
00674 #elif defined(NF_QUADR)
00675 ths->Ad = (int) ceil(2.2/pow(delta,1.0/3.0));
00676 ths->Add = (double _Complex *)nfft_malloc((ths->Ad+3)*(sizeof(double _Complex)));
00677 #elif defined(NF_LIN)
00678 ths->Ad = (int) ceil(1.7/pow(delta,1.0/2.0));
00679 ths->Add = (double _Complex *)nfft_malloc((ths->Ad+3)*(sizeof(double _Complex)));
00680 #else
00681 #error define NF_LIN or NF_QUADR or NF_KUB
00682 #endif
00683 }
00684 else
00685 {
00686 ths->Ad = 2*(ths->p)*(ths->p);
00687 ths->Add = (double _Complex *)nfft_malloc((ths->Ad+3)*(sizeof(double _Complex)));
00688 }
00689 }
00690 }
00691
00693 ths->n = nn;
00694 for (t=0; t<d; t++)
00695 {
00696 N[t] = nn;
00697 n[t] = 2*nn;
00698 }
00699 nfft_init_guru(&(ths->mv1), d, N, N_total, n, m,
00700 sort_flags_adjoint |
00701 PRE_PHI_HUT| PRE_PSI| MALLOC_X | MALLOC_F_HAT| MALLOC_F| FFTW_INIT | FFT_OUT_OF_PLACE,
00702 FFTW_MEASURE| FFTW_DESTROY_INPUT);
00703 nfft_init_guru(&(ths->mv2), d, N, M_total, n, m,
00704 sort_flags_trafo |
00705 PRE_PHI_HUT| PRE_PSI| MALLOC_X | MALLOC_F_HAT| MALLOC_F| FFTW_INIT | FFT_OUT_OF_PLACE,
00706 FFTW_MEASURE| FFTW_DESTROY_INPUT);
00707
00709 n_total = 1;
00710 for (t=0; t<d; t++)
00711 n_total *= nn;
00712
00713 ths->b = (fftw_complex *)nfft_malloc(n_total*sizeof(fftw_complex));
00714 #ifdef _OPENMP
00715 #pragma omp critical (nfft_omp_critical_fftw_plan)
00716 {
00717 fftw_plan_with_nthreads(nthreads);
00718 #endif
00719
00720 ths->fft_plan = fftw_plan_dft(d,N,ths->b,ths->b,FFTW_FORWARD,FFTW_ESTIMATE);
00721
00722 #ifdef _OPENMP
00723 }
00724 #endif
00725
00726 if (ths->flags & NEARFIELD_BOXES)
00727 {
00728 ths->box_count_per_dim = floor((0.5 - ths->eps_B) / ths->eps_I) + 1;
00729 ths->box_count = 1;
00730 for (t=0; t<ths->d; t++)
00731 ths->box_count *= ths->box_count_per_dim;
00732
00733 ths->box_offset = (int *) nfft_malloc((ths->box_count+1) * sizeof(int));
00734
00735 ths->box_alpha = (double _Complex *)nfft_malloc(ths->N_total*(sizeof(double _Complex)));
00736
00737 ths->box_x = (double *) nfft_malloc(ths->d * ths->N_total * sizeof(double));
00738 }
00739 }
00740
00742 void fastsum_finalize(fastsum_plan *ths)
00743 {
00744 nfft_free(ths->x);
00745 nfft_free(ths->alpha);
00746 nfft_free(ths->y);
00747 nfft_free(ths->f);
00748
00749 if (!(ths->flags & EXACT_NEARFIELD))
00750 nfft_free(ths->Add);
00751
00752 nfft_finalize(&(ths->mv1));
00753 nfft_finalize(&(ths->mv2));
00754
00755 #ifdef _OPENMP
00756 #pragma omp critical (nfft_omp_critical_fftw_plan)
00757 {
00758 #endif
00759 fftw_destroy_plan(ths->fft_plan);
00760 #ifdef _OPENMP
00761 }
00762 #endif
00763
00764 nfft_free(ths->b);
00765
00766 if (ths->flags & NEARFIELD_BOXES)
00767 {
00768 nfft_free(ths->box_offset);
00769 nfft_free(ths->box_alpha);
00770 nfft_free(ths->box_x);
00771 }
00772 }
00773
00775 void fastsum_exact(fastsum_plan *ths)
00776 {
00777 int j,k;
00778 int t;
00779 double r;
00780
00781 #pragma omp parallel for default(shared) private(j,k,t,r)
00782 for (j=0; j<ths->M_total; j++)
00783 {
00784 ths->f[j]=0.0;
00785 for (k=0; k<ths->N_total; k++)
00786 {
00787 if (ths->d==1)
00788 r = ths->y[j] - ths->x[k];
00789 else
00790 {
00791 r=0.0;
00792 for (t=0; t<ths->d; t++)
00793 r += (ths->y[j*ths->d+t]-ths->x[k*ths->d+t])*(ths->y[j*ths->d+t]-ths->x[k*ths->d+t]);
00794 r=sqrt(r);
00795 }
00796 ths->f[j] += ths->alpha[k] * ths->k(r,0,ths->kernel_param);
00797 }
00798 }
00799 }
00800
00802 void fastsum_precompute(fastsum_plan *ths)
00803 {
00804 int j,k,t;
00805 int n_total;
00806 ticks t0, t1;
00807
00808 ths->MEASURE_TIME_t[0] = 0.0;
00809 ths->MEASURE_TIME_t[1] = 0.0;
00810 ths->MEASURE_TIME_t[2] = 0.0;
00811 ths->MEASURE_TIME_t[3] = 0.0;
00812
00813 #ifdef MEASURE_TIME
00814 t0 = getticks();
00815 #endif
00816
00817
00818 if (ths->flags & NEARFIELD_BOXES)
00819 {
00820 BuildBox(ths);
00821 }
00822 else
00823 {
00825 BuildTree(ths->d,0,ths->x,ths->alpha,ths->N_total);
00826 }
00827
00828 #ifdef MEASURE_TIME
00829 t1 = getticks();
00830 ths->MEASURE_TIME_t[3] += nfft_elapsed_seconds(t1,t0);
00831 #endif
00832
00833
00834 #ifdef MEASURE_TIME
00835 t0 = getticks();
00836 #endif
00837
00838 if (!(ths->flags & EXACT_NEARFIELD))
00839 {
00840 if (ths->d==1)
00841 #pragma omp parallel for default(shared) private(k)
00842 for (k=-ths->Ad/2-2; k <= ths->Ad/2+2; k++)
00843 ths->Add[k+ths->Ad/2+2] = regkern1(ths->k, ths->eps_I*(double)k/ths->Ad*2, ths->p, ths->kernel_param, ths->eps_I, ths->eps_B);
00844 else
00845 #pragma omp parallel for default(shared) private(k)
00846 for (k=0; k <= ths->Ad+2; k++)
00847 ths->Add[k] = regkern3(ths->k, ths->eps_I*(double)k/ths->Ad, ths->p, ths->kernel_param, ths->eps_I, ths->eps_B);
00848 }
00849 #ifdef MEASURE_TIME
00850 t1 = getticks();
00851 ths->MEASURE_TIME_t[0] += nfft_elapsed_seconds(t1,t0);
00852 #endif
00853
00854
00855 #ifdef MEASURE_TIME
00856 t0 = getticks();
00857 #endif
00858
00859 for (k=0; k<ths->mv1.M_total; k++)
00860 for (t=0; t<ths->mv1.d; t++)
00861 ths->mv1.x[ths->mv1.d*k+t] = - ths->x[ths->mv1.d*k+t];
00862
00864 if(ths->mv1.nfft_flags & PRE_LIN_PSI)
00865 nfft_precompute_lin_psi(&(ths->mv1));
00866
00867 if(ths->mv1.nfft_flags & PRE_PSI)
00868 nfft_precompute_psi(&(ths->mv1));
00869
00870 if(ths->mv1.nfft_flags & PRE_FULL_PSI)
00871 nfft_precompute_full_psi(&(ths->mv1));
00872 #ifdef MEASURE_TIME
00873 t1 = getticks();
00874 ths->MEASURE_TIME_t[1] += nfft_elapsed_seconds(t1,t0);
00875 #endif
00876
00878 for(k=0; k<ths->mv1.M_total;k++)
00879 ths->mv1.f[k] = ths->alpha[k];
00880
00881 #ifdef MEASURE_TIME
00882 t0 = getticks();
00883 #endif
00884
00885 for (j=0; j<ths->mv2.M_total; j++)
00886 for (t=0; t<ths->mv2.d; t++)
00887 ths->mv2.x[ths->mv2.d*j+t] = - ths->y[ths->mv2.d*j+t];
00888
00890 if(ths->mv2.nfft_flags & PRE_LIN_PSI)
00891 nfft_precompute_lin_psi(&(ths->mv2));
00892
00893 if(ths->mv2.nfft_flags & PRE_PSI)
00894 nfft_precompute_psi(&(ths->mv2));
00895
00896 if(ths->mv2.nfft_flags & PRE_FULL_PSI)
00897 nfft_precompute_full_psi(&(ths->mv2));
00898 #ifdef MEASURE_TIME
00899 t1 = getticks();
00900 ths->MEASURE_TIME_t[2] += nfft_elapsed_seconds(t1,t0);
00901 #endif
00902
00903
00904 #ifdef MEASURE_TIME
00905 t0 = getticks();
00906 #endif
00907
00908 n_total = 1;
00909 for (t=0; t<ths->d; t++)
00910 n_total *= ths->n;
00911
00912 #pragma omp parallel for default(shared) private(j,k,t)
00913 for (j=0; j<n_total; j++)
00914 {
00915 if (ths->d==1)
00916 ths->b[j] = regkern1(ths->k, (double)j / (ths->n) - 0.5, ths->p, ths->kernel_param, ths->eps_I, ths->eps_B)/n_total;
00917 else
00918 {
00919 k=j;
00920 ths->b[j]=0.0;
00921 for (t=0; t<ths->d; t++)
00922 {
00923 ths->b[j] += ((double)(k % (ths->n)) / (ths->n) - 0.5) * ((double)(k % (ths->n)) / (ths->n) - 0.5);
00924 k = k / (ths->n);
00925 }
00926 ths->b[j] = regkern3(ths->k, sqrt(ths->b[j]), ths->p, ths->kernel_param, ths->eps_I, ths->eps_B)/n_total;
00927 }
00928 }
00929
00930 nfft_fftshift_complex(ths->b, ths->mv1.d, ths->mv1.N);
00931 fftw_execute(ths->fft_plan);
00932 nfft_fftshift_complex(ths->b, ths->mv1.d, ths->mv1.N);
00933 #ifdef MEASURE_TIME
00934 t1 = getticks();
00935 ths->MEASURE_TIME_t[0] += nfft_elapsed_seconds(t1,t0);
00936 #endif
00937 }
00938
00940 void fastsum_trafo(fastsum_plan *ths)
00941 {
00942 int j,k,t;
00943 ticks t0, t1;
00944
00945 ths->MEASURE_TIME_t[4] = 0.0;
00946 ths->MEASURE_TIME_t[5] = 0.0;
00947 ths->MEASURE_TIME_t[6] = 0.0;
00948 ths->MEASURE_TIME_t[7] = 0.0;
00949
00950 #ifdef MEASURE_TIME
00951 t0 = getticks();
00952 #endif
00953
00954 nfft_adjoint(&(ths->mv1));
00955 #ifdef MEASURE_TIME
00956 t1 = getticks();
00957 ths->MEASURE_TIME_t[4] += nfft_elapsed_seconds(t1,t0);
00958 #endif
00959
00960
00961 #ifdef MEASURE_TIME
00962 t0 = getticks();
00963 #endif
00964
00965 #pragma omp parallel for default(shared) private(k)
00966 for (k=0; k<ths->mv2.N_total; k++)
00967 ths->mv2.f_hat[k] = ths->b[k] * ths->mv1.f_hat[k];
00968 #ifdef MEASURE_TIME
00969 t1 = getticks();
00970 ths->MEASURE_TIME_t[5] += nfft_elapsed_seconds(t1,t0);
00971 #endif
00972
00973
00974 #ifdef MEASURE_TIME
00975 t0 = getticks();
00976 #endif
00977
00978 nfft_trafo(&(ths->mv2));
00979 #ifdef MEASURE_TIME
00980 t1 = getticks();
00981 ths->MEASURE_TIME_t[6] += nfft_elapsed_seconds(t1,t0);
00982 #endif
00983
00984
00985 #ifdef MEASURE_TIME
00986 t0 = getticks();
00987 #endif
00988
00989 #pragma omp parallel for default(shared) private(j,k,t)
00990 for (j=0; j<ths->M_total; j++)
00991 {
00992 double ymin[ths->d], ymax[ths->d];
00994 if (ths->flags & NEARFIELD_BOXES)
00995 {
00996 ths->f[j] = ths->mv2.f[j] + SearchBox(ths->y + ths->d*j, ths);
00997 }
00998 else
00999 {
01000 for (t=0; t<ths->d; t++)
01001 {
01002 ymin[t] = ths->y[ths->d*j+t] - ths->eps_I;
01003 ymax[t] = ths->y[ths->d*j+t] + ths->eps_I;
01004 }
01005 ths->f[j] = ths->mv2.f[j] + SearchTree(ths->d,0, ths->x, ths->alpha, ymin, ymax, ths->N_total, ths->k, ths->kernel_param, ths->Ad, ths->Add, ths->p, ths->flags);
01006 }
01007
01008
01009 }
01010
01011 #ifdef MEASURE_TIME
01012 t1 = getticks();
01013 ths->MEASURE_TIME_t[7] += nfft_elapsed_seconds(t1,t0);
01014 #endif
01015 }
01016
01017
01018