AccFFT
accfft.cpp
Go to the documentation of this file.
1 
5 /*
6  * Copyright (c) 2014-2015, Amir Gholami, George Biros
7  * All rights reserved.
8  * This file is part of AccFFT library.
9  *
10  * AccFFT is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation, either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * AccFFT is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with AccFFT. If not, see <http://www.gnu.org/licenses/>.
22  *
23 */
24 #include <mpi.h>
25 #include <fftw3.h>
26 #include <omp.h>
27 #include <iostream>
28 #include <cmath>
29 #include <math.h>
30 #include "transpose.h"
31 #include <string.h>
32 #include "accfft.h"
33 #include "accfft_common.h"
34 #define VERBOSE 0
35 #define PCOUT if(procid==0) std::cout
36 typedef double Complex[2];
37 
43 int accfft_init(int nthreads){
44  int threads_ok;
45  if (threads_ok) threads_ok = fftw_init_threads();
46  if (threads_ok) fftw_plan_with_nthreads(nthreads);
47  return (!threads_ok);
48 }
49 
54  fftw_cleanup_threads();
55  fftw_cleanup();
56 }
57 
58 int dfft_get_local_size(int N0, int N1, int N2, int * isize, int * istart,MPI_Comm c_comm ){
59  int nprocs, procid;
60  MPI_Comm_rank(c_comm, &procid);
61 
62  int coords[2],np[2],periods[2];
63  MPI_Cart_get(c_comm,2,np,periods,coords);
64  isize[2]=N2;
65  isize[0]=ceil(N0/(double)np[0]);
66  isize[1]=ceil(N1/(double)np[1]);
67 
68  istart[0]=isize[0]*(coords[0]);
69  istart[1]=isize[1]*(coords[1]);
70  istart[2]=0;
71 
72  if((N0-isize[0]*coords[0])<isize[0]) {isize[0]=N0-isize[0]*coords[0]; isize[0]*=(int) isize[0]>0; istart[0]=N0-isize[0];}
73  if((N1-isize[1]*coords[1])<isize[1]) {isize[1]=N1-isize[1]*coords[1]; isize[1]*=(int) isize[1]>0; istart[1]=N1-isize[1];}
74 
75 
76  if(VERBOSE>=2){
77  MPI_Barrier(c_comm);
78  for(int r=0;r<np[0];r++)
79  for(int c=0;c<np[1];c++){
80  MPI_Barrier(c_comm);
81  if((coords[0]==r) && (coords[1]==c))
82  std::cout<<coords[0]<<","<<coords[1]<<" isize[0]= "<<isize[0]<<" isize[1]= "<<isize[1]<<" isize[2]= "<<isize[2]<<" istart[0]= "<<istart[0]<<" istart[1]= "<<istart[1]<<" istart[2]= "<<istart[2]<<std::endl;
83  MPI_Barrier(c_comm);
84  }
85  MPI_Barrier(c_comm);
86  }
87  int alloc_local=isize[0]*isize[1]*isize[2]*sizeof(double);
88 
89 
90  return alloc_local;
91 }
92 
93 
106 int accfft_local_size_dft_r2c( int * n,int * isize, int * istart, int * osize, int *ostart,MPI_Comm c_comm){
107 
108  //1D & 2D Decomp
109  int osize_0[3]={0}, ostart_0[3]={0};
110  int osize_1[3]={0}, ostart_1[3]={0};
111  int osize_2[3]={0}, ostart_2[3]={0};
112 
113  int alloc_local;
114  int alloc_max=0,n_tuples;
115  //inplace==true ? n_tuples=(n[2]/2+1)*2: n_tuples=n[2]*2;
116  n_tuples=(n[2]/2+1)*2;
117  alloc_local=dfft_get_local_size(n[0],n[1],n_tuples,osize_0,ostart_0,c_comm);
118  alloc_max=std::max(alloc_max, alloc_local);
119  alloc_local=dfft_get_local_size(n[0],n_tuples/2,n[1],osize_1,ostart_1,c_comm);
120  alloc_max=std::max(alloc_max, alloc_local*2);
121  alloc_local=dfft_get_local_size(n[1],n_tuples/2,n[0],osize_2,ostart_2,c_comm);
122  alloc_max=std::max(alloc_max, alloc_local*2);
123 
124  std::swap(osize_1[1],osize_1[2]);
125  std::swap(ostart_1[1],ostart_1[2]);
126 
127  std::swap(ostart_2[1],ostart_2[2]);
128  std::swap(ostart_2[0],ostart_2[1]);
129  std::swap(osize_2[1],osize_2[2]);
130  std::swap(osize_2[0],osize_2[1]);
131 
132  //isize[0]=osize_0[0];
133  //isize[1]=osize_0[1];
134  //isize[2]=n[2];//osize_0[2];
135  dfft_get_local_size(n[0],n[1],n[2],isize,istart,c_comm);
136 
137  osize[0]=osize_2[0];
138  osize[1]=osize_2[1];
139  osize[2]=osize_2[2];
140 
141  ostart[0]=ostart_2[0];
142  ostart[1]=ostart_2[1];
143  ostart[2]=ostart_2[2];
144 
145  return alloc_max;
146 
147 }
148 
149 
160 accfft_plan* accfft_plan_dft_3d_r2c(int * n, double * data, double * data_out, MPI_Comm c_comm,unsigned flags){
161  accfft_plan *plan=new accfft_plan;
162  int nprocs, procid;
163  MPI_Comm_rank(c_comm, &procid);
164  plan->procid=procid;
165  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
166  plan->c_comm=c_comm;
167  int *coord=plan->coord;
168  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
169  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
170 
171  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
172 
173  plan->data=data;
174  plan->data_out=data_out;
175  if(data_out==data){
176  plan->inplace=true;}
177  else{plan->inplace=false;}
178 
179  // 1D Decomposition
180  if(plan->np[1]==1){
181  unsigned fftw_flags=FFTW_MEASURE;
182  int N0=n[0], N1=n[1], N2=n[2];
183 
184  int n_tuples_o,n_tuples_i;
185  plan->inplace==true ? n_tuples_i=(N2/2+1)*2: n_tuples_i=N2;
186  n_tuples_o=(N2/2+1)*2;
187 
188  int isize[3],osize[3],istart[3],ostart[3];
189  int alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm,plan->inplace);
190  plan->alloc_max=alloc_max;
191 
192  plan->Mem_mgr= new Mem_Mgr(N0,N1,n_tuples_o,c_comm);
193  plan->T_plan_1= new T_Plan(N0,N1,n_tuples_o, plan->Mem_mgr, c_comm);
194  plan->T_plan_1i= new T_Plan(N1,N0,n_tuples_o,plan->Mem_mgr, c_comm);
195 
196  plan->T_plan_1->alloc_local=alloc_max;
197  plan->T_plan_1i->alloc_local=alloc_max;
198 
199  long int BATCH=plan->T_plan_1->local_n0;
200  plan->fplan_0= fftw_plan_many_dft_r2c(2, &n[1],plan->T_plan_1->local_n0, //int rank, const int *n, int howmany
201  data, NULL, //double *in, const int *inembed,
202  1, N1*n_tuples_i, //int istride, int idist,
203  (fftw_complex*)data_out, NULL, //fftw_complex *out, const int *onembed,
204  1, N1*n_tuples_o/2, // int ostride, int odist,
205  fftw_flags);
206  if(plan->fplan_0==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
207 
208  ptrdiff_t local_n0=plan->T_plan_1->local_n0;
209  ptrdiff_t local_n1=plan->T_plan_1->local_n1;
210  plan->fplan_1= fftw_plan_many_dft(1, &n[0],n_tuples_o/2*local_n1, //int rank, const int *n, int howmany
211  (fftw_complex*)data_out, NULL, //double *in, const int *inembed,
212  local_n1*n_tuples_o/2,1, //int istride, int idist,
213  (fftw_complex*)data_out, NULL, //fftw_complex *out, const int *onembed,
214  local_n1*n_tuples_o/2,1, // int ostride, int odist,
215  FFTW_FORWARD,fftw_flags);
216  if(plan->fplan_1==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
217 
218  plan->iplan_0=fftw_plan_many_dft_c2r(2, &n[1],plan->T_plan_1->local_n0, //int rank, const int *n, int howmany,
219  (fftw_complex*)data_out, NULL, //fftw_complex *in, const int *inembed,
220  1, N1*n_tuples_o/2, //int istride, int idist,
221  data, NULL, //double *out, const int *onembed,
222  1, N1*n_tuples_i, //int ostride, int odist,
223  fftw_flags);
224  if(plan->iplan_0==NULL) std::cout<<"!!! Inverse Plan not Created !!!"<<std::endl;
225 
226  plan->iplan_1= fftw_plan_many_dft(1, &n[0],n_tuples_o/2*local_n1, //int rank, const int *n, int howmany
227  (fftw_complex*)data_out, NULL, //double *in, const int *inembed,
228  local_n1*n_tuples_o/2,1, //int istride, int idist,
229  (fftw_complex*)data_out, NULL, //fftw_complex *out, const int *onembed,
230  local_n1*n_tuples_o/2,1, // int ostride, int odist,
231  FFTW_BACKWARD,fftw_flags);
232  if(plan->iplan_1==NULL) std::cout<<"!!! Inverse Plan2 not Created !!!"<<std::endl;
233 
234  static int method_static=0;
235  static int kway_static_2=0;
236  if(method_static==0){
237  plan->T_plan_1->which_fast_method(plan->T_plan_1,data_out);
238  method_static=plan->T_plan_1->method;
239  kway_static_2=plan->T_plan_1->kway;
240  }
241  else{
242  plan->T_plan_1->method=method_static;
243  plan->T_plan_1->kway=kway_static_2;
244  }
245  plan->T_plan_1->method=plan->T_plan_1->method;
246  plan->T_plan_1->kway=kway_static_2;
247  plan->T_plan_1i->method=plan->T_plan_1->method;
248  plan->T_plan_1i->kway=kway_static_2;
249  plan->data=data;
250 
251  // Make unused parts of plan NULL
252  plan->T_plan_2=NULL;
253  plan->T_plan_2i=NULL;
254  plan->fplan_2=NULL;
255  plan->iplan_2=NULL;
256 
257  }
258 
259  // 2D Decomposition
260  if (plan->np[1]!=1){
261 
262  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
263  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
264  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
265  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
266  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
267 
268  int alloc_local;
269  int alloc_max=0;
270  int n_tuples_o,n_tuples_i;
271  plan->inplace==true ? n_tuples_i=(n[2]/2+1)*2: n_tuples_i=n[2];
272  n_tuples_o=(n[2]/2+1)*2;
273 
274  int isize[3],osize[3],istart[3],ostart[3];
275  alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm,plan->inplace);
276 
277  dfft_get_local_size(n[0],n[1],n_tuples_o,osize_0,ostart_0,c_comm);
278  dfft_get_local_size(n[0],n_tuples_o/2,n[1],osize_1,ostart_1,c_comm);
279  dfft_get_local_size(n[1],n_tuples_o/2,n[0],osize_2,ostart_2,c_comm);
280 
281  std::swap(osize_1[1],osize_1[2]);
282  std::swap(ostart_1[1],ostart_1[2]);
283 
284  std::swap(ostart_2[1],ostart_2[2]);
285  std::swap(ostart_2[0],ostart_2[1]);
286  std::swap(osize_2[1],osize_2[2]);
287  std::swap(osize_2[0],osize_2[1]);
288 
289  for(int i=0;i<3;i++){
290  osize_1i[i]=osize_1[i];
291  osize_2i[i]=osize_2[i];
292  ostart_1i[i]=ostart_1[i];
293  ostart_2i[i]=ostart_2[i];
294  }
295 
296  // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
297  plan->Mem_mgr= new Mem_Mgr(n[1],n_tuples_o/2,2,plan->row_comm,osize_0[0],alloc_max);
298  plan->T_plan_1= new T_Plan(n[1],n_tuples_o/2,2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
299  plan->T_plan_2= new T_Plan(n[0],n[1],osize_2[2]*2, plan->Mem_mgr, plan->col_comm);
300  plan->T_plan_2i= new T_Plan(n[1],n[0],osize_2i[2]*2, plan->Mem_mgr, plan->col_comm);
301  plan->T_plan_1i= new T_Plan(n_tuples_o/2,n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);
302 
303  plan->T_plan_1->alloc_local=plan->alloc_max;
304  plan->T_plan_2->alloc_local=plan->alloc_max;
305  plan->T_plan_2i->alloc_local=plan->alloc_max;
306  plan->T_plan_1i->alloc_local=plan->alloc_max;
307 
308  {
309  unsigned fftw_flags=FFTW_MEASURE;
310  plan->fplan_0= fftw_plan_many_dft_r2c(1, &n[2],osize_0[0]*osize_0[1], //int rank, const int *n, int howmany
311  data, NULL, //double *in, const int *inembed,
312  1, n_tuples_i, //int istride, int idist,
313  (fftw_complex*)data_out, NULL, //fftw_complex *out, const int *onembed,
314  1, n_tuples_o/2, // int ostride, int odist,
315  fftw_flags);
316  if(plan->fplan_0==NULL) std::cout<<"!!! fplan_0 not Created !!!"<<std::endl;
317 
318  plan->iplan_0= fftw_plan_many_dft_c2r(1, &n[2],osize_0[0]*osize_0[1], //int rank, const int *n, int howmany
319  (fftw_complex*)data_out, NULL, //double *in, const int *inembed,
320  1, n_tuples_o/2, //int istride, int idist,
321  data, NULL, //fftw_complex *out, const int *onembed,
322  1, n_tuples_i, // int ostride, int odist,
323  fftw_flags);
324  if(plan->iplan_0==NULL) std::cout<<"!!! iplan_0 not Created !!!"<<std::endl;
325 
326  // ----
327 
328  fftw_iodim dims, howmany_dims[2];
329  dims.n=osize_1[1];
330  dims.is=osize_1[2];
331  dims.os=osize_1[2];
332 
333  howmany_dims[0].n=osize_1[2];
334  howmany_dims[0].is=1;
335  howmany_dims[0].os=1;
336 
337  howmany_dims[1].n=osize_1[0];
338  howmany_dims[1].is=osize_1[1]*osize_1[2];
339  howmany_dims[1].os=osize_1[1]*osize_1[2];
340 
341  plan->fplan_1=fftw_plan_guru_dft(
342  1, &dims,
343  2,howmany_dims,
344  (fftw_complex*)data_out, (fftw_complex*)data_out,
345  -1,fftw_flags);
346  if(plan->fplan_1==NULL) std::cout<<"!!! fplan1 not Created !!!"<<std::endl;
347  plan->iplan_1=fftw_plan_guru_dft(
348  1, &dims,
349  2,howmany_dims,
350  (fftw_complex*)data_out, (fftw_complex*)data_out,
351  1,fftw_flags);
352  if(plan->iplan_1==NULL) std::cout<<"!!! fplan1 not Created !!!"<<std::endl;
353 
354  // ----
355  dims.n=n[0];
356  dims.is=osize_2[2];
357  dims.os=osize_2[2];
358 
359  howmany_dims[0].n=osize_2[2];
360  howmany_dims[0].is=1;
361  howmany_dims[0].os=1;
362 
363  howmany_dims[1].n=osize_2[0];
364  howmany_dims[1].is=osize_2[1]*osize_2[2];
365  howmany_dims[1].os=osize_2[1]*osize_2[2];
366 
367  plan->fplan_2= fftw_plan_many_dft(1, &n[0],osize_2[2]*osize_2[1], //int rank, const int *n, int howmany
368  (fftw_complex*)data_out, NULL, //double *in, const int *inembed,
369  osize_2[2]*osize_2[1],1, //int istride, int idist,
370  (fftw_complex*)data_out, NULL, //fftw_complex *out, const int *onembed,
371  osize_2[2]*osize_2[1],1, // int ostride, int odist,
372  FFTW_FORWARD,fftw_flags);
373  if(plan->fplan_2==NULL) std::cout<<"!!! fplan2 not Created !!!"<<std::endl;
374 
375  plan->iplan_2= fftw_plan_many_dft(1, &n[0],osize_2[2]*osize_2[1], //int rank, const int *n, int howmany
376  (fftw_complex*)data_out, NULL, //double *in, const int *inembed,
377  osize_2[2]*osize_2[1],1, //int istride, int idist,
378  (fftw_complex*)data_out, NULL, //fftw_complex *out, const int *onembed,
379  osize_2[2]*osize_2[1],1, // int ostride, int odist,
380  FFTW_BACKWARD,fftw_flags);
381  if(plan->iplan_2==NULL) std::cout<<"!!! fplan2 not Created !!!"<<std::endl;
382  }
383 
384  static int method_static_2=0;
385  static int kway_static_2=0;
386  if(method_static_2==0){
387  if(coord[0]==0){
388  plan->T_plan_1->which_fast_method(plan->T_plan_1,data_out);
389  method_static_2=plan->T_plan_1->method;
390  kway_static_2=plan->T_plan_1->kway;
391  }
392  MPI_Bcast(&method_static_2,1, MPI_INT,0, c_comm );
393  MPI_Bcast(&kway_static_2,1, MPI_INT,0, c_comm );
394  MPI_Barrier(c_comm);
395  }
396  plan->T_plan_1->method=method_static_2;
397  plan->T_plan_2->method=method_static_2;
398  plan->T_plan_2i->method=method_static_2;
399  plan->T_plan_1i->method=method_static_2;
400  plan->T_plan_1->kway=kway_static_2;
401  plan->T_plan_2->kway=kway_static_2;
402  plan->T_plan_2i->kway=kway_static_2;
403  plan->T_plan_1i->kway=kway_static_2;
404  plan->data=data;
405  }
406  return plan;
407 
408 }
409 
422 int accfft_local_size_dft_c2c( int * n,int * isize, int * istart, int * osize, int *ostart,MPI_Comm c_comm){
423 
424  int osize_0[3]={0}, ostart_0[3]={0};
425  int osize_1[3]={0}, ostart_1[3]={0};
426  int osize_2[3]={0}, ostart_2[3]={0};
427  int osize_1i[3]={0}, ostart_1i[3]={0};
428  int osize_2i[3]={0}, ostart_2i[3]={0};
429 
430  int alloc_local;
431  int alloc_max=0,n_tuples=n[2]*2;
432  alloc_local=dfft_get_local_size(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
433  alloc_max=std::max(alloc_max, alloc_local);
434  alloc_local=dfft_get_local_size(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
435  alloc_max=std::max(alloc_max, alloc_local);
436  alloc_local=dfft_get_local_size(n[1],n[2],n[0],osize_2,ostart_2,c_comm);
437  alloc_max=std::max(alloc_max, alloc_local);
438  alloc_max*=2; // because of c2c
439 
440  std::swap(osize_1[1],osize_1[2]);
441  std::swap(ostart_1[1],ostart_1[2]);
442 
443  std::swap(ostart_2[1],ostart_2[2]);
444  std::swap(ostart_2[0],ostart_2[1]);
445  std::swap(osize_2[1],osize_2[2]);
446  std::swap(osize_2[0],osize_2[1]);
447 
448  for(int i=0;i<3;i++){
449  osize_1i[i]=osize_1[i];
450  osize_2i[i]=osize_2[i];
451  ostart_1i[i]=ostart_1[i];
452  ostart_2i[i]=ostart_2[i];
453  }
454 
455  //isize[0]=osize_0[0];
456  //isize[1]=osize_0[1];
457  //isize[2]=n[2];//osize_0[2];
458  dfft_get_local_size(n[0],n[1],n[2],isize,istart,c_comm);
459 
460  osize[0]=osize_2[0];
461  osize[1]=osize_2[1];
462  osize[2]=osize_2[2];
463 
464  ostart[0]=ostart_2[0];
465  ostart[1]=ostart_2[1];
466  ostart[2]=ostart_2[2];
467 
468  return alloc_max;
469 
470 }
471 
482 accfft_plan* accfft_plan_dft_3d_c2c(int * n, Complex * data, Complex * data_out, MPI_Comm c_comm,unsigned flags){
483  accfft_plan *plan=new accfft_plan;
484  int nprocs, procid;
485  MPI_Comm_rank(c_comm, &procid);
486  plan->procid=procid;
487  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
488  plan->c_comm=c_comm;
489  int *coord=plan->coord;
490  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
491  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
492  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
493 
494  plan->data_c=data;
495  plan->data_out_c=data_out;
496  if(data_out==data){
497  plan->inplace=true;}
498  else{plan->inplace=false;}
499 
500  // 1D Decomposition
501  if (plan->np[1]==1){
502  int NX=n[0],NY=n[1],NZ=n[2];
503  plan->Mem_mgr= new Mem_Mgr(NX,NY,(NZ)*2,c_comm);
504  plan->T_plan_1= new T_Plan(NX,NY,(NZ)*2, plan->Mem_mgr,c_comm);
505  plan->T_plan_1i= new T_Plan(NY,NX,NZ*2, plan->Mem_mgr,c_comm);
506 
507  int isize[3],osize[3],istart[3],ostart[3];
508  int alloc_max=accfft_local_size_dft_c2c(n,isize,istart,osize,ostart,c_comm);
509  plan->alloc_max=alloc_max;
510  plan->T_plan_1->alloc_local=alloc_max;
511  plan->T_plan_1i->alloc_local=alloc_max;
512 
513 
514  ptrdiff_t local_n0=plan->T_plan_1->local_n0;
515  ptrdiff_t local_n1=plan->T_plan_1->local_n1;
516  int N0=NX, N1=NY, N2=NZ;
517 
518  unsigned fftw_flags=FFTW_MEASURE;
519  plan->fplan_0= fftw_plan_many_dft(2, &n[1],plan->T_plan_1->local_n0, //int rank, const int *n, int howmany
520  data, NULL, //double *in, const int *inembed,
521  1, N1*N2, //int istride, int idist,
522  data_out, NULL, //fftw_complex *out, const int *onembed,
523  1, N1*N2, // int ostride, int odist,
524  FFTW_FORWARD,fftw_flags);
525  if(plan->fplan_0==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
526  plan->iplan_0= fftw_plan_many_dft(2, &n[1],plan->T_plan_1->local_n0, //int rank, const int *n, int howmany
527  data_out, NULL, //double *in, const int *inembed,
528  1, N1*N2, //int istride, int idist,
529  data, NULL, //fftw_complex *out, const int *onembed,
530  1, N1*N2, // int ostride, int odist,
531  FFTW_BACKWARD,fftw_flags);
532  if(plan->iplan_0==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
533 
534  MPI_Barrier(c_comm);
535 
536  plan->fplan_1= fftw_plan_many_dft(1, &n[0],plan->T_plan_1->local_n1*NZ, //int rank, const int *n, int howmany
537  data_out, NULL, //double *in, const int *inembed,
538  plan->T_plan_1->local_n1*NZ, 1, //int istride, int idist,
539  data_out, NULL, //fftw_complex *out, const int *onembed,
540  plan->T_plan_1->local_n1*NZ, 1, // int ostride, int odist,
541  FFTW_FORWARD,fftw_flags);
542  if(plan->fplan_1==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
543 
544  plan->iplan_1= fftw_plan_many_dft(1, &n[0],plan->T_plan_1->local_n1*NZ, //int rank, const int *n, int howmany
545  data_out, NULL, //double *in, const int *inembed,
546  plan->T_plan_1->local_n1*NZ, 1, //int istride, int idist,
547  data_out, NULL, //fftw_complex *out, const int *onembed,
548  plan->T_plan_1->local_n1*NZ, 1, // int ostride, int odist,
549  FFTW_BACKWARD,fftw_flags);
550  if(plan->iplan_1==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
551 
552 
553  plan->T_plan_1->which_fast_method(plan->T_plan_1,(double*)data_out);
554  plan->T_plan_1i->method=plan->T_plan_1->method;
555  plan->T_plan_1i->kway=plan->T_plan_1->kway;
556 
557  // Make unused parts of plan NULL
558  plan->T_plan_2=NULL;
559  plan->T_plan_2i=NULL;
560  plan->fplan_2=NULL;
561  plan->iplan_2=NULL;
562  }
563 
564  // 2D Decomposition
565  if (plan->np[1]!=1){
566 
567  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
568  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
569  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
570  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
571  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
572 
573  int alloc_local;
574  int alloc_max=0,n_tuples=(n[2]/2+1)*2;
575 
576  int isize[3],osize[3],istart[3],ostart[3];
577  alloc_max=accfft_local_size_dft_c2c(n,isize,istart,osize,ostart,c_comm);
578  plan->alloc_max=alloc_max;
579 
580  dfft_get_local_size(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
581  dfft_get_local_size(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
582  dfft_get_local_size(n[1],n[2],n[0],osize_2,ostart_2,c_comm);
583 
584  std::swap(osize_1[1],osize_1[2]);
585  std::swap(ostart_1[1],ostart_1[2]);
586 
587  std::swap(ostart_2[1],ostart_2[2]);
588  std::swap(ostart_2[0],ostart_2[1]);
589  std::swap(osize_2[1],osize_2[2]);
590  std::swap(osize_2[0],osize_2[1]);
591 
592  for(int i=0;i<3;i++){
593  osize_1i[i]=osize_1[i];
594  osize_2i[i]=osize_2[i];
595  ostart_1i[i]=ostart_1[i];
596  ostart_2i[i]=ostart_2[i];
597  }
598 
599 
600 
601 
602  // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
603 
604  plan->Mem_mgr= new Mem_Mgr(n[1],n[2],2,plan->row_comm,osize_0[0],alloc_max);
605  plan->T_plan_1= new T_Plan(n[1],n[2],2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
606  plan->T_plan_2= new T_Plan(n[0],n[1],2*osize_2[2], plan->Mem_mgr, plan->col_comm);
607  plan->T_plan_2i=new T_Plan(n[1],n[0],2*osize_2i[2], plan->Mem_mgr, plan->col_comm);
608  plan->T_plan_1i=new T_Plan(n[2],n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);
609 
610  plan->T_plan_1->alloc_local=plan->alloc_max;
611  plan->T_plan_2->alloc_local=plan->alloc_max;
612  plan->T_plan_2i->alloc_local=plan->alloc_max;
613  plan->T_plan_1i->alloc_local=plan->alloc_max;
614 
615  {
616  // fplan_0
617  unsigned fftw_flags=FFTW_MEASURE;
618  plan->fplan_0= fftw_plan_many_dft(1, &n[2],osize_0[0]*osize_0[1], //int rank, const int *n, int howmany
619  data, NULL, //double *in, const int *inembed,
620  1, n[2], //int istride, int idist,
621  data_out, NULL, //fftw_complex *out, const int *onembed,
622  1, n[2], // int ostride, int odist,
623  FFTW_FORWARD,fftw_flags);
624  if(plan->fplan_0==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
625  plan->iplan_0= fftw_plan_many_dft(1, &n[2],osize_0[0]*osize_0[1], //int rank, const int *n, int howmany
626  data_out, NULL, //double *in, const int *inembed,
627  1, n[2], //int istride, int idist,
628  data, NULL, //fftw_complex *out, const int *onembed,
629  1, n[2], // int ostride, int odist,
630  FFTW_BACKWARD,fftw_flags);
631  if(plan->iplan_0==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
632 
633 
634  // fplan_1
635  fftw_iodim dims, howmany_dims[2];
636  dims.n=osize_1[1];
637  dims.is=osize_1[2];
638  dims.os=osize_1[2];
639 
640  howmany_dims[0].n=osize_1[2];
641  howmany_dims[0].is=1;
642  howmany_dims[0].os=1;
643 
644  howmany_dims[1].n=osize_1[0];
645  howmany_dims[1].is=osize_1[1]*osize_1[2];
646  howmany_dims[1].os=osize_1[1]*osize_1[2];
647 
648  plan->fplan_1=fftw_plan_guru_dft(
649  1, &dims,
650  2,howmany_dims,
651  data_out, data_out,
652  -1,fftw_flags);
653  if(plan->fplan_1==NULL) std::cout<<"!!! fplan1 not Created !!!"<<std::endl;
654  plan->iplan_1=fftw_plan_guru_dft(
655  1, &dims,
656  2,howmany_dims,
657  (fftw_complex*)data_out, (fftw_complex*)data_out,
658  1,fftw_flags);
659  if(plan->iplan_1==NULL) std::cout<<"!!! iplan1 not Created !!!"<<std::endl;
660 
661 
662  // fplan_2
663  plan->fplan_2= fftw_plan_many_dft(1, &n[0],osize_2[1]*osize_2[2], //int rank, const int *n, int howmany
664  data_out, NULL, //double *in, const int *inembed,
665  osize_2[1]*osize_2[2],1 , //int istride, int idist,
666  data_out, NULL, //fftw_complex *out, const int *onembed,
667  osize_2[1]*osize_2[2],1 , // int ostride, int odist,
668  FFTW_FORWARD,fftw_flags);
669  if(plan->fplan_2==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
670  plan->iplan_2= fftw_plan_many_dft(1, &n[0],osize_2[1]*osize_2[2], //int rank, const int *n, int howmany
671  data_out, NULL, //double *in, const int *inembed,
672  osize_2[1]*osize_2[2],1 , //int istride, int idist,
673  data_out, NULL, //fftw_complex *out, const int *onembed,
674  osize_2[1]*osize_2[2],1 , // int ostride, int odist,
675  FFTW_BACKWARD,fftw_flags);
676  if(plan->iplan_2==NULL) std::cout<<"!!! Forward Plan not Created !!!"<<std::endl;
677 
678  }
679 
680  static int method_static_2=0;
681  static int kway_static_2=0;
682  if(method_static_2==0){
683  if(coord[0]==0){
684  plan->T_plan_1->which_fast_method(plan->T_plan_1,(double*)data_out);
685  method_static_2=plan->T_plan_1->method;
686  kway_static_2=plan->T_plan_1->kway;
687  }
688  MPI_Bcast(&method_static_2,1, MPI_INT,0, c_comm );
689  MPI_Bcast(&kway_static_2,1, MPI_INT,0, c_comm );
690  MPI_Barrier(c_comm);
691  }
692  plan->T_plan_1->method=method_static_2;
693  plan->T_plan_2->method=method_static_2;
694  plan->T_plan_2i->method=method_static_2;
695  plan->T_plan_1i->method=method_static_2;
696  plan->T_plan_1->kway=kway_static_2;
697  plan->T_plan_2->kway=kway_static_2;
698  plan->T_plan_2i->kway=kway_static_2;
699  plan->T_plan_1i->kway=kway_static_2;
700  }
701  return plan;
702 
703 }
704 
714 void accfft_execute_r2c(accfft_plan* plan, double * data,Complex * data_out, double * timer){
715  accfft_execute(plan,-1,data,(double*)data_out,timer);
716 
717  return;
718 }
719 
720 
730 void accfft_execute_c2r(accfft_plan* plan, Complex * data,double * data_out, double * timer){
731  accfft_execute(plan,1,(double*)data,data_out,timer);
732 
733  return;
734 }
735 
736 void accfft_execute(accfft_plan* plan, int direction,double * data,double * data_out, double * timer){
737 
738  if(data==NULL)
739  data=plan->data;
740  if(data_out==NULL)
741  data_out=plan->data_out;
742  int * coords=plan->coord;
743  int procid=plan->procid;
744  double fft_time=0;
745  double * timings;
746  if(timer==NULL){
747  timings=new double[5];
748  memset(timings,0,sizeof(double)*5);
749  }
750  else{
751  timings=timer;
752  }
753 
754 
755  // 1D Decomposition
756  if(plan->np[1]==1){
757  if(direction==-1){
758 
759  /**************************************************************/
760  /******************* N0/P0 x N1 x N2 *************************/
761  /**************************************************************/
762  fft_time-=MPI_Wtime();
763  fftw_execute_dft_r2c(plan->fplan_0,(double*)data,(fftw_complex*)data_out);
764  fft_time+=MPI_Wtime();
765 
766  MPI_Barrier(plan->c_comm);
767  plan->T_plan_1->execute(plan->T_plan_1,data_out,timings,2);
768  /**************************************************************/
769  /******************* N1 x N0/P0 x N2 *************************/
770  /**************************************************************/
771  fft_time-=MPI_Wtime();
772  fftw_execute_dft(plan->fplan_1,(fftw_complex*)data_out,(fftw_complex*)data_out);
773  fft_time+=MPI_Wtime();
774  }
775 
776  if(direction==1){
777  /* Now Perform the inverse transform */
778  fft_time-=MPI_Wtime();
779  fftw_execute_dft(plan->iplan_1,(fftw_complex*)data,(fftw_complex*)data);
780  fft_time+=MPI_Wtime();
781 
782  plan->T_plan_1i->execute(plan->T_plan_1i,data,timings,1);
783  /**************************************************************/
784  /******************* N0/P0 x N1 x N2 *************************/
785  /**************************************************************/
786 
787  fft_time-=MPI_Wtime();
788  fftw_execute_dft_c2r(plan->iplan_0,(fftw_complex*)data,(double*)data_out);
789  fft_time+=MPI_Wtime();
790 
791  }
792  }
793 
794  // 2D Decomposition
795  if(plan->np[1]!=1){
796  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
797  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
798  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
799  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
800  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
801 
802  if(direction==-1){
803  /**************************************************************/
804  /******************* N0/P0 x N1/P1 x N2 **********************/
805  /**************************************************************/
806  // FFT in Z direction
807  fft_time-=MPI_Wtime();
808  fftw_execute_dft_r2c(plan->fplan_0,(double*)data,(fftw_complex*)data_out);
809  fft_time+=MPI_Wtime();
810 
811  // Perform N0/P0 transpose
812 
813 
814  plan->T_plan_1->execute(plan->T_plan_1,data_out,timings,2,osize_0[0],coords[0]);
815  /**************************************************************/
816  /******************* N0/P0 x N1 x N2/P1 **********************/
817  /**************************************************************/
818  fft_time-=MPI_Wtime();
819  fftw_execute_dft(plan->fplan_1,(fftw_complex*)data_out,(fftw_complex*)data_out);
820  fft_time+=MPI_Wtime();
821 
822 
823 
824  plan->T_plan_2->execute(plan->T_plan_2,plan->data_out,timings,2,1,coords[1]);
825  /**************************************************************/
826  /******************* N0 x N1/P0 x N2/P1 **********************/
827  /**************************************************************/
828  fft_time-=MPI_Wtime();
829  fftw_execute_dft(plan->fplan_2,(fftw_complex*)data_out,(fftw_complex*)data_out);
830  fft_time+=MPI_Wtime();
831  }
832  else if (direction==1){
833  fft_time-=MPI_Wtime();
834  fftw_execute_dft(plan->iplan_2,(fftw_complex*)data,(fftw_complex*)data);
835  fft_time+=MPI_Wtime();
836 
837 
838  plan->T_plan_2i->execute(plan->T_plan_2i,data,timings,1,1,coords[1]);
839  MPI_Barrier(plan->c_comm);
840  /**************************************************************/
841  /******************* N0/P0 x N1 x N2/P1 **********************/
842  /**************************************************************/
843  fft_time-=MPI_Wtime();
844  fftw_execute_dft(plan->iplan_1,(fftw_complex*)data,(fftw_complex*)data);
845  fft_time+=MPI_Wtime();
846 
847 
848  plan->T_plan_1i->execute(plan->T_plan_1i,data,timings,1,osize_1i[0],coords[0]);
849  MPI_Barrier(plan->c_comm);
850  /**************************************************************/
851  /******************* N0/P0 x N1/P1 x N2 **********************/
852  /**************************************************************/
853 
854  // IFFT in Z direction
855  fft_time-=MPI_Wtime();
856  fftw_execute_dft_c2r(plan->iplan_0,(fftw_complex*)data,(double*)data_out);
857  fft_time+=MPI_Wtime();
858  MPI_Barrier(plan->c_comm);
859 
860  }
861  }
862  timings[4]=fft_time;
863  if(timer==NULL){
864  delete [] timings;
865  }
866  MPI_Barrier(plan->c_comm);
867 
868  return;
869 }
870 
880 void accfft_execute_c2c(accfft_plan* plan, int direction,Complex * data, Complex * data_out, double * timer){
881 
882  if(data==NULL)
883  data=plan->data_c;
884  if(data_out==NULL)
885  data_out=plan->data_out_c;
886  int * coords=plan->coord;
887  int procid=plan->procid;
888  double fft_time=0;
889  double * timings;
890  if(timer==NULL){
891  timings=new double[5];
892  memset(timings,0,sizeof(double)*5);
893  }
894  else{
895  timings=timer;
896  }
897 
898 
899 
900  // 1D Decomposition
901  if(plan->np[1]==1){
902  if(direction==-1){
903 
904  /**************************************************************/
905  /******************* N0/P0 x N1 x N2 *************************/
906  /**************************************************************/
907  fft_time-=MPI_Wtime();
908  fftw_execute_dft(plan->fplan_0,data,data_out);
909  fft_time+=MPI_Wtime();
910 
911  MPI_Barrier(plan->c_comm);
912  plan->T_plan_1->execute(plan->T_plan_1,(double*)data_out,timings,2);
913  /**************************************************************/
914  /******************* N1 x N0/P0 x N2 *************************/
915  /**************************************************************/
916  fft_time-=MPI_Wtime();
917  fftw_execute_dft(plan->fplan_1,(fftw_complex*)data_out,(fftw_complex*)data_out);
918  fft_time+=MPI_Wtime();
919  }
920 
921  if(direction==1){
922  /* Now Perform the inverse transform */
923  fft_time-=MPI_Wtime();
924  fftw_execute_dft(plan->iplan_1,(fftw_complex*)data,(fftw_complex*)data);
925  fft_time+=MPI_Wtime();
926 
927  plan->T_plan_1i->execute(plan->T_plan_1i,(double*)data,timings,1);
928  /**************************************************************/
929  /******************* N0/P0 x N1 x N2 *************************/
930  /**************************************************************/
931 
932  fft_time-=MPI_Wtime();
933  fftw_execute_dft(plan->iplan_0,data,data_out);
934  fft_time+=MPI_Wtime();
935 
936  }
937  }
938 
939  // 2D Decomposition
940  if(plan->np[1]!=1){
941  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
942  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
943  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
944  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
945  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
946 
947  if(direction==-1){
948  /**************************************************************/
949  /******************* N0/P0 x N1/P1 x N2 **********************/
950  /**************************************************************/
951  // FFT in Z direction
952  fft_time-=MPI_Wtime();
953  fftw_execute_dft(plan->fplan_0,data,data_out);
954  fft_time+=MPI_Wtime();
955 
956 
957 
958  plan->T_plan_1->execute(plan->T_plan_1,(double*)data_out,timings,2,osize_0[0],coords[0]);
959  /**************************************************************/
960  /******************* N0/P0 x N1 x N2/P1 **********************/
961  /**************************************************************/
962  fft_time-=MPI_Wtime();
963  fftw_execute_dft(plan->fplan_1,(fftw_complex*)data_out,(fftw_complex*)data_out);
964  fft_time+=MPI_Wtime();
965 
966 
967 
968  plan->T_plan_2->execute(plan->T_plan_2,(double*)data_out,timings,2,1,coords[1]);
969  /**************************************************************/
970  /******************* N0 x N1/P0 x N2/P1 **********************/
971  /**************************************************************/
972  fft_time-=MPI_Wtime();
973  fftw_execute_dft(plan->fplan_2,(fftw_complex*)data_out,(fftw_complex*)data_out);
974  fft_time+=MPI_Wtime();
975  }
976  else if (direction==1){
977  fft_time-=MPI_Wtime();
978  fftw_execute_dft(plan->iplan_2,(fftw_complex*)data,(fftw_complex*)data);
979  fft_time+=MPI_Wtime();
980 
981 
982  plan->T_plan_2i->execute(plan->T_plan_2i,(double*)data,timings,1,1,coords[1]);
983  /**************************************************************/
984  /******************* N0/P0 x N1 x N2/P1 **********************/
985  /**************************************************************/
986  fft_time-=MPI_Wtime();
987  fftw_execute_dft(plan->iplan_1,(fftw_complex*)data,(fftw_complex*)data);
988  fft_time+=MPI_Wtime();
989 
990 
991  plan->T_plan_1i->execute(plan->T_plan_1i,(double*)data,timings,1,osize_1i[0],coords[0]);
992  MPI_Barrier(plan->c_comm);
993  /**************************************************************/
994  /******************* N0/P0 x N1/P1 x N2 **********************/
995  /**************************************************************/
996 
997  // IFFT in Z direction
998  fft_time-=MPI_Wtime();
999  fftw_execute_dft(plan->iplan_0,data,data_out);
1000  fft_time+=MPI_Wtime();
1001 
1002  }
1003  }
1004  timings[4]=fft_time;
1005  if(timer==NULL){
1006  delete [] timings;
1007  }
1008  MPI_Barrier(plan->c_comm);
1009 
1010  return;
1011 }
1012 
1017 void accfft_destroy_plan(accfft_plan * plan){
1018 
1019  if(plan->T_plan_1!=NULL)delete(plan->T_plan_1);
1020  if(plan->T_plan_1i!=NULL)delete(plan->T_plan_1i);
1021  if(plan->T_plan_2!=NULL)delete(plan->T_plan_2);
1022  if(plan->T_plan_2i!=NULL)delete(plan->T_plan_2i);
1023  if(plan->Mem_mgr!=NULL)delete(plan->Mem_mgr);
1024  if(plan->fplan_0!=NULL)fftw_destroy_plan(plan->fplan_0);
1025  if(plan->fplan_1!=NULL)fftw_destroy_plan(plan->fplan_1);
1026  if(plan->fplan_2!=NULL)fftw_destroy_plan(plan->fplan_2);
1027  if(plan->iplan_0!=NULL)fftw_destroy_plan(plan->iplan_0);
1028  if(plan->iplan_1!=NULL)fftw_destroy_plan(plan->iplan_1);
1029  if(plan->iplan_2!=NULL)fftw_destroy_plan(plan->iplan_2);
1030 
1031  MPI_Comm_free(&plan->row_comm);
1032  MPI_Comm_free(&plan->col_comm);
1033 }
void accfft_execute_c2c(accfft_plan *plan, int direction, Complex *data, Complex *data_out, double *timer)
Definition: accfft.cpp:880
void accfft_execute_c2r(accfft_plan *plan, Complex *data, double *data_out, double *timer)
Definition: accfft.cpp:730
void accfft_cleanup()
Definition: accfft.cpp:53
void accfft_destroy_plan(accfft_plan *plan)
Definition: accfft.cpp:1017
void accfft_execute_r2c(accfft_plan *plan, double *data, Complex *data_out, double *timer)
Definition: accfft.cpp:714
int accfft_init(int nthreads)
Definition: accfft.cpp:43
int accfft_local_size_dft_r2c(int *n, int *isize, int *istart, int *osize, int *ostart, MPI_Comm c_comm)
Definition: accfft.cpp:106
accfft_plan * accfft_plan_dft_3d_c2c(int *n, Complex *data, Complex *data_out, MPI_Comm c_comm, unsigned flags)
Definition: accfft.cpp:482
accfft_plan * accfft_plan_dft_3d_r2c(int *n, double *data, double *data_out, MPI_Comm c_comm, unsigned flags)
Definition: accfft.cpp:160
int accfft_local_size_dft_c2c(int *n, int *isize, int *istart, int *osize, int *ostart, MPI_Comm c_comm)
Definition: accfft.cpp:422