AccFFT
accfft_gpu.cpp
Go to the documentation of this file.
1 
5 /*
6  * Copyright (c) 2014-2015, Amir Gholami, George Biros
7  * All rights reserved.
8  * This file is part of AccFFT library.
9  *
10  * AccFFT is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation, either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * AccFFT is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with AccFFT. If not, see <http://www.gnu.org/licenses/>.
22  *
23 */
24 
25 #include "accfft_gpu.h"
26 #include <mpi.h>
27 #include <omp.h>
28 #include <iostream>
29 #include <cmath>
30 #include <math.h>
31 #include "transpose_cuda.h"
32 #include <cuda_runtime_api.h>
33 #include <string.h>
34 #include <cuda.h>
35 #include <cufft.h>
36 #include "accfft_common.h"
37 #define VERBOSE 0
38 #define PCOUT if(procid==0) std::cout
39 typedef double Complex[2];
40 
45  // empty for now
46 }
47 
48 int dfft_get_local_size_gpu(int N0, int N1, int N2, int * isize, int * istart,MPI_Comm c_comm ){
49  int procid;
50  MPI_Comm_rank(c_comm, &procid);
51 
52  int coords[2],np[2],periods[2];
53  MPI_Cart_get(c_comm,2,np,periods,coords);
54  isize[2]=N2;
55  isize[0]=ceil(N0/(double)np[0]);
56  isize[1]=ceil(N1/(double)np[1]);
57 
58  istart[0]=isize[0]*(coords[0]);
59  istart[1]=isize[1]*(coords[1]);
60  istart[2]=0;
61 
62  if((N0-isize[0]*coords[0])<isize[0]) {isize[0]=N0-isize[0]*coords[0]; isize[0]*=(int) isize[0]>0; istart[0]=N0-isize[0];}
63  if((N1-isize[1]*coords[1])<isize[1]) {isize[1]=N1-isize[1]*coords[1]; isize[1]*=(int) isize[1]>0; istart[1]=N1-isize[1];}
64 
65 
66  if(VERBOSE>=2){
67  MPI_Barrier(c_comm);
68  for(int r=0;r<np[0];r++)
69  for(int c=0;c<np[1];c++){
70  MPI_Barrier(c_comm);
71  if((coords[0]==r) && (coords[1]==c))
72  std::cout<<coords[0]<<","<<coords[1]<<" isize[0]= "<<isize[0]<<" isize[1]= "<<isize[1]<<" isize[2]= "<<isize[2]<<" istart[0]= "<<istart[0]<<" istart[1]= "<<istart[1]<<" istart[2]= "<<istart[2]<<std::endl;
73  MPI_Barrier(c_comm);
74  }
75  MPI_Barrier(c_comm);
76  }
77  int alloc_local=isize[0]*isize[1]*isize[2]*sizeof(double);
78 
79 
80  return alloc_local;
81 }
82 
95 int accfft_local_size_dft_r2c_gpu( int * n,int * isize, int * istart, int * osize, int *ostart,MPI_Comm c_comm, bool inplace){
96 
97  //1D & 2D Decomp
98  int osize_0[3]={0}, ostart_0[3]={0};
99  int osize_1[3]={0}, ostart_1[3]={0};
100  int osize_2[3]={0}, ostart_2[3]={0};
101 
102  int alloc_local;
103  int alloc_max=0,n_tuples;
104  //inplace==true ? n_tuples=(n[2]/2+1)*2: n_tuples=n[2];
105  n_tuples=(n[2]/2+1)*2;//SNAFU
106  alloc_local=dfft_get_local_size_gpu(n[0],n[1],n_tuples,osize_0,ostart_0,c_comm);
107  alloc_max=std::max(alloc_max, alloc_local);
108  alloc_local=dfft_get_local_size_gpu(n[0],n_tuples/2,n[1],osize_1,ostart_1,c_comm);
109  alloc_max=std::max(alloc_max, alloc_local*2);
110  alloc_local=dfft_get_local_size_gpu(n[1],n_tuples/2,n[0],osize_2,ostart_2,c_comm);
111  alloc_max=std::max(alloc_max, alloc_local*2);
112 
113  std::swap(osize_1[1],osize_1[2]);
114  std::swap(ostart_1[1],ostart_1[2]);
115 
116  std::swap(ostart_2[1],ostart_2[2]);
117  std::swap(ostart_2[0],ostart_2[1]);
118  std::swap(osize_2[1],osize_2[2]);
119  std::swap(osize_2[0],osize_2[1]);
120 
121  //isize[0]=osize_0[0];
122  //isize[1]=osize_0[1];
123  //isize[2]=n[2];//osize_0[2];
124  dfft_get_local_size_gpu(n[0],n[1],n[2],isize,istart,c_comm);
125 
126  osize[0]=osize_2[0];
127  osize[1]=osize_2[1];
128  osize[2]=osize_2[2];
129 
130  ostart[0]=ostart_2[0];
131  ostart[1]=ostart_2[1];
132  ostart[2]=ostart_2[2];
133 
134  return alloc_max;
135 
136 }
137 
138 
149 accfft_plan_gpu* accfft_plan_dft_3d_r2c_gpu(int * n, double * data_d, double * data_out_d, MPI_Comm c_comm,unsigned flags){
150  accfft_plan_gpu *plan=new accfft_plan_gpu;
151  int procid;
152  MPI_Comm_rank(c_comm, &procid);
153  plan->procid=procid;
154  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
155  plan->c_comm=c_comm;
156  int *coord=plan->coord;
157  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
158  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
159  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
160  plan->data=data_d;
161  plan->data_out=data_out_d;
162 
163  if(data_out_d==data_d){
164  plan->inplace=true;}
165  else{plan->inplace=false;}
166 
167  // 1D Decomposition
168  if(plan->np[1]==1){
169  int N0=n[0], N1=n[1], N2=n[2];
170  int n_tuples_o,n_tuples_i;
171 // plan->inplace==true ? n_tuples=(N2/2+1)*2: n_tuples=N2*2;
172  plan->inplace==true ? n_tuples_i=(N2/2+1)*2: n_tuples_i=N2;
173  n_tuples_o=(N2/2+1)*2;
174  plan->Mem_mgr= new Mem_Mgr_gpu(N0,N1,n_tuples_o,c_comm);
175  plan->T_plan_1= new T_Plan_gpu(N0,N1,n_tuples_o, plan->Mem_mgr, c_comm);
176  plan->T_plan_1i= new T_Plan_gpu(N1,N0,n_tuples_o,plan->Mem_mgr, c_comm);
177 
178  int isize[3],osize[3],istart[3],ostart[3];
179  int alloc_max=accfft_local_size_dft_r2c_gpu(n,isize,istart,osize,ostart,c_comm,plan->inplace);
180  plan->alloc_max=alloc_max;
181  plan->T_plan_1->alloc_local=alloc_max;
182  plan->T_plan_1i->alloc_local=alloc_max;
183 
184 
185  /* Forward Transform Plan. */
186  {
187  int NX=n[0], NY=n[1], NZ=n[2];
188  int f_inembed[2]={NY,n_tuples_i};
189  int f_onembed[2]={NY,n_tuples_o/2};
190  int idist=NY*n_tuples_i;
191  int odist=NY*n_tuples_o/2;
192  int istride=1;
193  int ostride=1;
194  int batch=plan->T_plan_1->local_n0;//NX;
195 
196  cufftResult_t cufft_error;
197  if(batch!=0)
198  {
199  cufft_error=cufftPlanMany(&plan->fplan_0, 2, &n[1],
200  f_inembed, istride, idist, // *inembed, istride, idist
201  f_onembed, ostride, odist, // *onembed, ostride, odist
202  CUFFT_D2Z, batch);
203  if(cufft_error!= CUFFT_SUCCESS){
204  fprintf(stderr, "CUFFT error: fplan creation failed %d \n",cufft_error); return NULL;
205  }
206  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
207  }
208 
209 
210  int local_n0=plan->T_plan_1->local_n0;
211  int local_n1=plan->T_plan_1->local_n1;
212  int f_inembed2[1]={NX};
213  int f_onembed2[1]={NX};
214  int idist2=1;
215  int odist2=1;
216  int istride2=local_n1*n_tuples_o/2;
217  int ostride2=local_n1*n_tuples_o/2;
218  if(plan->T_plan_1->local_n1*n_tuples_o/2!=0)
219  {
220  cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[0],
221  f_inembed2, istride2, idist2, // *inembed, istride, idist
222  f_onembed2, ostride2, odist2, // *onembed, ostride, odist
223  CUFFT_Z2Z,plan->T_plan_1->local_n1*n_tuples_o/2);
224  if(cufft_error!= CUFFT_SUCCESS){
225  fprintf(stderr, "CUFFT error: fplan2 creation failed %d\n",cufft_error); return NULL;
226  }
227  //cufftSetCompatibilityMode(fplan2,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan2 cuda compatibility\n"); return;}
228  }
229 
230  /* Backward Transform Plan */
231  if(batch!=0)
232  {
233  cufft_error=cufftPlanMany(&plan->iplan_0, 2, &n[1],
234  f_onembed, ostride,odist , // *inembed, istride, idist
235  f_inembed, istride,idist, // *onembed, ostride, odist
236  CUFFT_Z2D, batch);
237  if(cufft_error!= CUFFT_SUCCESS){
238  fprintf(stderr, "CUFFT error: iplan creation failed %d\n",cufft_error); return NULL;
239  }
240  //cufftSetCompatibilityMode(iplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at cuda compatibility\n"); return;}
241  }
242  }
243 
244 
245  static int method_static=0;
246  static int kway_static_2=0;
247  if(method_static==0){
248  plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d);
249  method_static=plan->T_plan_1->method;
250  kway_static_2=plan->T_plan_1->kway;//-4;
251  }
252  else{
253  plan->T_plan_1->method=method_static;
254  plan->T_plan_1->kway=kway_static_2;
255  }
256 
257  checkCuda_accfft (cudaDeviceSynchronize());
258  MPI_Barrier(plan->c_comm);
259 
260  plan->T_plan_1->method=plan->T_plan_1->method;
261  plan->T_plan_1->kway=kway_static_2;
262  plan->T_plan_1i->method=plan->T_plan_1->method;
263  plan->T_plan_1i->kway=kway_static_2;
264 
265  // Make unused parts of plan NULL
266  plan->T_plan_2=NULL;
267  plan->T_plan_2i=NULL;
268  plan->fplan_2=NULL;
269  plan->iplan_2=NULL;
270 
271  }
272 
273  // 2D Decomposition
274  if (plan->np[1]!=1){
275 
276  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
277  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
278  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
279  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
280  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
281 
282  int alloc_max=0;
283  int n_tuples_i, n_tuples_o;
284  //plan->inplace==true ? n_tuples=(n[2]/2+1)*2: n_tuples=n[2]*2;
285  plan->inplace==true ? n_tuples_i=(n[2]/2+1)*2: n_tuples_i=n[2];
286  n_tuples_o=(n[2]/2+1)*2;
287 
288  int isize[3],osize[3],istart[3],ostart[3];
289  alloc_max=accfft_local_size_dft_r2c_gpu(n,isize,istart,osize,ostart,c_comm,plan->inplace);
290  plan->alloc_max=alloc_max;
291 
292  dfft_get_local_size_gpu(n[0],n[1],n_tuples_o,osize_0,ostart_0,c_comm);
293  dfft_get_local_size_gpu(n[0],n_tuples_o/2,n[1],osize_1,ostart_1,c_comm);
294  dfft_get_local_size_gpu(n[1],n_tuples_o/2,n[0],osize_2,ostart_2,c_comm);
295 
296  std::swap(osize_1[1],osize_1[2]);
297  std::swap(ostart_1[1],ostart_1[2]);
298 
299  std::swap(ostart_2[1],ostart_2[2]);
300  std::swap(ostart_2[0],ostart_2[1]);
301  std::swap(osize_2[1],osize_2[2]);
302  std::swap(osize_2[0],osize_2[1]);
303 
304  for(int i=0;i<3;i++){
305  osize_1i[i]=osize_1[i];
306  osize_2i[i]=osize_2[i];
307  ostart_1i[i]=ostart_1[i];
308  ostart_2i[i]=ostart_2[i];
309  }
310 
311  // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
312  plan->Mem_mgr= new Mem_Mgr_gpu(n[1],n_tuples_o/2,2,plan->row_comm,osize_0[0],alloc_max);
313  plan->T_plan_1= new T_Plan_gpu(n[1],n_tuples_o/2,2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
314  plan->T_plan_2= new T_Plan_gpu(n[0],n[1],osize_2[2]*2,plan->Mem_mgr, plan->col_comm);
315  plan->T_plan_2i= new T_Plan_gpu(n[1],n[0],osize_2i[2]*2, plan->Mem_mgr, plan->col_comm);
316  plan->T_plan_1i= new T_Plan_gpu(n_tuples_o/2,n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);
317 
318 
319  plan->T_plan_1->alloc_local=plan->alloc_max;
320  plan->T_plan_2->alloc_local=plan->alloc_max;
321  plan->T_plan_2i->alloc_local=plan->alloc_max;
322  plan->T_plan_1i->alloc_local=plan->alloc_max;
323 
324  // fplan_0
325  int NX=n[0], NY=n[1], NZ=n[2];
326  cufftResult_t cufft_error;
327  {
328  int f_inembed[1]={n_tuples_i};
329  int f_onembed[1]={n_tuples_o/2};
330  int idist=(n_tuples_i);
331  int odist=n_tuples_o/2;
332  int istride=1;
333  int ostride=1;
334  int batch=osize_0[0]*osize_0[1];//NX;
335 
336  if(batch!=0)
337  {
338  cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
339  f_inembed, istride, idist, // *inembed, istride, idist
340  f_onembed, ostride, odist, // *onembed, ostride, odist
341  CUFFT_D2Z, batch);
342  if(cufft_error!= CUFFT_SUCCESS)
343  {
344  fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
345  }
346  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
347  }
348  if(batch!=0)
349  {
350  cufft_error=cufftPlanMany(&plan->iplan_0, 1, &n[2],
351  f_onembed, ostride, odist, // *onembed, ostride, odist
352  f_inembed, istride, idist, // *inembed, istride, idist
353  CUFFT_Z2D, batch);
354  if(cufft_error!= CUFFT_SUCCESS)
355  {
356  fprintf(stderr, "CUFFT error: iplan_0 creation failed %d \n",cufft_error); return NULL;
357  }
358  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
359  }
360  }
361  // fplan_1
362  {
363  int f_inembed[1]={NY};
364  int f_onembed[1]={NY};
365  int idist=1;
366  int odist=1;
367  int istride=osize_1[2];
368  int ostride=osize_1[2];
369  int batch=osize_1[2];
370 
371  if(batch!=0)
372  {
373  cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
374  f_inembed, istride, idist, // *inembed, istride, idist
375  f_onembed, ostride, odist, // *onembed, ostride, odist
376  CUFFT_Z2Z, batch);
377  if(cufft_error!= CUFFT_SUCCESS)
378  {
379  fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
380  }
381  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
382  }
383  }
384  // fplan_2
385  {
386  int f_inembed[1]={NX};
387  int f_onembed[1]={NX};
388  int idist=1;
389  int odist=1;
390  int istride=osize_2[1]*osize_2[2];
391  int ostride=osize_2[1]*osize_2[2];
392  int batch=osize_2[1]*osize_2[2];;
393 
394  if(batch!=0)
395  {
396  cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
397  f_inembed, istride, idist, // *inembed, istride, idist
398  f_onembed, ostride, odist, // *onembed, ostride, odist
399  CUFFT_Z2Z, batch);
400  if(cufft_error!= CUFFT_SUCCESS)
401  {
402  fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
403  }
404  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
405  }
406  }
407 
408 
409 
410  static int method_static_2=0;
411  static int kway_static_2=0;
412  if(method_static_2==0){
413  if(coord[0]==0){
414  plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d);
415  method_static_2=plan->T_plan_1->method;//-4;
416  kway_static_2=plan->T_plan_1->kway;//-4;
417  }
418  MPI_Bcast(&method_static_2,1, MPI_INT,0, c_comm );
419  MPI_Bcast(&kway_static_2,1, MPI_INT,0, c_comm );
420  MPI_Barrier(c_comm);
421  }
422  checkCuda_accfft (cudaDeviceSynchronize());
423  MPI_Barrier(plan->c_comm);
424  plan->T_plan_1->method=method_static_2;
425  plan->T_plan_2->method=method_static_2;
426  plan->T_plan_2i->method=method_static_2;
427  plan->T_plan_1i->method=method_static_2;
428  plan->T_plan_1->kway=kway_static_2;
429  plan->T_plan_2->kway=kway_static_2;
430  plan->T_plan_2i->kway=kway_static_2;
431  plan->T_plan_1i->kway=kway_static_2;
432 
433  plan->iplan_1=-1;
434  plan->iplan_2=-1;
435 
436  }
437  return plan;
438 
439 }
440 
441 
442 void accfft_execute_gpu(accfft_plan_gpu* plan, int direction,double * data_d, double * data_out_d, double * timer){
443 
444  if(data_d==NULL)
445  data_d=plan->data;
446  if(data_out_d==NULL)
447  data_out_d=plan->data_out;
448 
449  int * coords=plan->coord;
450  int procid=plan->procid;
451  double fft_time=0;
452  double * timings;
453  if(timer==NULL){
454  timings=new double[5];
455  memset(timings,0,sizeof(double)*5);
456  }
457  else{
458  timings=timer;
459  }
460 
461  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
462  cudaEvent_t fft_startEvent, fft_stopEvent;
463  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
464  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
465  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
466  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
467  int NY=plan->N[1];
468  float dummy_time=0;
469 
470  // 1D Decomposition
471  if(plan->np[1]==1){
472  if(direction==-1){
473 
474  /**************************************************************/
475  /******************* N0/P0 x N1 x N2 *************************/
476  /**************************************************************/
477  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
478  checkCuda_accfft(cufftExecD2Z(plan->fplan_0, (cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d));
479  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
480  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
481  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
482  fft_time+=dummy_time/1000;
483 
484  MPI_Barrier(plan->c_comm);
485  plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2);
486  /**************************************************************/
487  /******************* N1 x N0/P0 x N2 *************************/
488  /**************************************************************/
489  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
490  checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
491  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
492  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
493  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
494  fft_time+=dummy_time/1000;
495 
496  }
497 
498  if(direction==1){
499  /* Now Perform the inverse transform */
500  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
501  checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_d,CUFFT_INVERSE));
502  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
503  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
504  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
505  fft_time+=dummy_time/1000;
506 
507  plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1);
508  /**************************************************************/
509  /******************* N0/P0 x N1 x N2 *************************/
510  /**************************************************************/
511  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
512  checkCuda_accfft(cufftExecZ2D(plan->iplan_0, (cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d));
513  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
514  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
515  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
516  fft_time+=dummy_time/1000;
517 
518  }
519  }
520 
521  // 2D Decomposition
522  if(plan->np[1]!=1){
523  int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0;
524  int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1;
525  //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
526  int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i;
527  //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
528 
529  if(direction==-1){
530  /**************************************************************/
531  /******************* N0/P0 x N1/P1 x N2 **********************/
532  /**************************************************************/
533  // FFT in Z direction
534  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
535  checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d));
536  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
537  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
538  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
539  fft_time+=dummy_time/1000;
540 
541  // Perform N0/P0 transpose
542 
543 
544  plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
545  /**************************************************************/
546  /******************* N0/P0 x N1 x N2/P1 **********************/
547  /**************************************************************/
548  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
549  for (int i=0;i<osize_1[0];++i){
550  checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
551  }
552  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
553  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
554  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
555  fft_time+=dummy_time/1000;
556  MPI_Barrier(plan->c_comm);
557 
558  plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
559  /**************************************************************/
560  /******************* N0 x N1/P0 x N2/P1 **********************/
561  /**************************************************************/
562  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
563  checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
564  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
565  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
566  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
567  fft_time+=dummy_time/1000;
568  }
569  else if (direction==1){
570  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
571  checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
572  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
573  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
574  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
575  fft_time+=dummy_time/1000;
576 
577  MPI_Barrier(plan->c_comm);
578 
579 
580  plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]);
581  /**************************************************************/
582  /******************* N0/P0 x N1 x N2/P1 **********************/
583  /**************************************************************/
584  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
585  for (int i=0;i<osize_1i[0];++i){
586  checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
587  }
588  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
589  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
590  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
591  fft_time+=dummy_time/1000;
592  MPI_Barrier(plan->c_comm);
593 
594 
595 
596  plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]);
597  MPI_Barrier(plan->c_comm);
598  /**************************************************************/
599  /******************* N0/P0 x N1/P1 x N2 **********************/
600  /**************************************************************/
601 
602  // IFFT in Z direction
603  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
604  checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d));
605  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
606  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
607  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
608  fft_time+=dummy_time/1000;
609 
610  }
611  }
612  timings[4]=fft_time;
613  if(timer==NULL){
614  delete [] timings;
615  }
616  MPI_Barrier(plan->c_comm);
617 
618  return;
619 }
620 
621 
631 void accfft_execute_r2c_gpu(accfft_plan_gpu* plan, double * data,Complex * data_out, double * timer){
632  accfft_execute_gpu(plan,-1,data,(double*)data_out,timer);
633 
634  return;
635 }
636 
637 
647 void accfft_execute_c2r_gpu(accfft_plan_gpu* plan, Complex * data,double * data_out, double * timer){
648  accfft_execute_gpu(plan,1,(double*)data,data_out,timer);
649 
650  return;
651 }
652 
653 
666 int accfft_local_size_dft_c2c_gpu( int * n,int * isize, int * istart, int * osize, int *ostart,MPI_Comm c_comm){
667 
668  int osize_0[3]={0}, ostart_0[3]={0};
669  int osize_1[3]={0}, ostart_1[3]={0};
670  int osize_2[3]={0}, ostart_2[3]={0};
671  //int osize_1i[3]={0}, ostart_1i[3]={0};
672  //int osize_2i[3]={0}, ostart_2i[3]={0};
673 
674  int alloc_local;
675  int alloc_max=0;//,n_tuples=n[2]*2;
676  alloc_local=dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
677  alloc_max=std::max(alloc_max, alloc_local);
678  alloc_local=dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
679  alloc_max=std::max(alloc_max, alloc_local);
680  alloc_local=dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm);
681  alloc_max=std::max(alloc_max, alloc_local);
682  alloc_max*=2; // because of c2c
683 
684  std::swap(osize_1[1],osize_1[2]);
685  std::swap(ostart_1[1],ostart_1[2]);
686 
687  std::swap(ostart_2[1],ostart_2[2]);
688  std::swap(ostart_2[0],ostart_2[1]);
689  std::swap(osize_2[1],osize_2[2]);
690  std::swap(osize_2[0],osize_2[1]);
691 
692  //isize[0]=osize_0[0];
693  //isize[1]=osize_0[1];
694  //isize[2]=n[2];//osize_0[2];
695  dfft_get_local_size_gpu(n[0],n[1],n[2],isize,istart,c_comm);
696 
697  osize[0]=osize_2[0];
698  osize[1]=osize_2[1];
699  osize[2]=osize_2[2];
700 
701  ostart[0]=ostart_2[0];
702  ostart[1]=ostart_2[1];
703  ostart[2]=ostart_2[2];
704 
705  return alloc_max;
706 
707 }
708 
709 
720 accfft_plan_gpu* accfft_plan_dft_3d_c2c_gpu(int * n, Complex * data_d, Complex * data_out_d, MPI_Comm c_comm,unsigned flags){
721  accfft_plan_gpu *plan=new accfft_plan_gpu;
722  int nprocs, procid;
723  MPI_Comm_rank(c_comm, &procid);
724  plan->procid=procid;
725  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
726  plan->c_comm=c_comm;
727  int *coord=plan->coord;
728  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
729  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
730  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
731  int NX=n[0], NY=n[1], NZ=n[2];
732  cufftResult_t cufft_error;
733 
734  plan->data_c=data_d;
735  plan->data_out_c=data_out_d;
736  if(data_out_d==data_d){
737  plan->inplace=true;}
738  else{plan->inplace=false;}
739 
740  // 1D Decomposition
741  if (plan->np[1]==1){
742  int NX=n[0],NY=n[1],NZ=n[2];
743 
744 
745  int isize[3],osize[3],istart[3],ostart[3];
746  int alloc_max=accfft_local_size_dft_c2c_gpu(n,isize,istart,osize,ostart,c_comm);
747  plan->alloc_max=alloc_max;
748 
749  plan->Mem_mgr= new Mem_Mgr_gpu(NX,NY,(NZ)*2,c_comm);
750  plan->T_plan_1= new T_Plan_gpu(NX,NY,(NZ)*2, plan->Mem_mgr,c_comm);
751  plan->T_plan_1i= new T_Plan_gpu(NY,NX,NZ*2, plan->Mem_mgr,c_comm);
752 
753  plan->T_plan_1->alloc_local=alloc_max;
754  plan->T_plan_1i->alloc_local=alloc_max;
755 
756  ptrdiff_t local_n0=plan->T_plan_1->local_n0;
757  ptrdiff_t local_n1=plan->T_plan_1->local_n1;
758  int N0=NX, N1=NY, N2=NZ;
759 
760  /* Forward Transform Plan. */
761  int n[3] = {NX, NY, NZ};
762  int f_inembed[2]={NY,(NZ)};
763  int f_onembed[2]={NY,NZ};
764  int idist=NY*(NZ);
765  int odist=NY*(NZ);
766  int istride=1;
767  int ostride=1;
768  int batch=plan->T_plan_1->local_n0;//NX;
769 
770  cufftResult_t cufft_error;
771  if(batch!=0)
772  {
773  cufft_error=cufftPlanMany(&plan->fplan_0, 2, &n[1],
774  f_inembed, istride, idist, // *inembed, istride, idist
775  f_onembed, ostride, odist, // *onembed, ostride, odist
776  CUFFT_Z2Z, batch);
777  if(cufft_error!= CUFFT_SUCCESS)
778  {
779  fprintf(stderr, "CUFFT error: fplan creation failed %d \n",cufft_error); return NULL;
780  }
781  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
782  }
783  int f_inembed2[1]={NX};
784  int f_onembed2[1]={NX};
785  int idist2=1;
786  int odist2=1;
787  int istride2=local_n1*(NZ);
788  int ostride2=local_n1*(NZ);
789  MPI_Barrier(c_comm);
790  if(local_n1*NZ!=0){
791  cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[0],
792  f_inembed2, istride2, idist2, // *inembed, istride, idist
793  f_onembed2, ostride2, odist2, // *onembed, ostride, odist
794  CUFFT_Z2Z, local_n1*(NZ));
795  if(cufft_error!= CUFFT_SUCCESS){
796  fprintf(stderr, "CUFFT error: fplan2 creation failed %d\n",cufft_error); return NULL;
797  }
798  //cufftSetCompatibilityMode(fplan2,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan2 cuda compatibility\n"); return;}
799  }
800 
801  plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(double*)data_out_d);
802  plan->T_plan_1i->method=plan->T_plan_1->method;
803  plan->T_plan_1i->kway=plan->T_plan_1->kway;
804 
805  // Make unused parts of plan NULL
806  plan->T_plan_2=NULL;
807  plan->T_plan_2i=NULL;
808  plan->fplan_2=NULL;
809  plan->iplan_2=NULL;
810  }
811 
812  // 2D Decomposition
813  if (plan->np[1]!=1){
814 
815  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
816  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
817  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
818  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
819  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
820 
821  int alloc_local;
822  int alloc_max=0,n_tuples=n[2]*2;
823 
824  int isize[3],osize[3],istart[3],ostart[3];
825  alloc_max=accfft_local_size_dft_c2c_gpu(n,isize,istart,osize,ostart,c_comm);
826  plan->alloc_max=alloc_max;
827 
828  dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
829  dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
830  dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm);
831 
832 
833  std::swap(osize_1[1],osize_1[2]);
834  std::swap(ostart_1[1],ostart_1[2]);
835 
836  std::swap(ostart_2[1],ostart_2[2]);
837  std::swap(ostart_2[0],ostart_2[1]);
838  std::swap(osize_2[1],osize_2[2]);
839  std::swap(osize_2[0],osize_2[1]);
840 
841  for(int i=0;i<3;i++){
842  osize_1i[i]=osize_1[i];
843  osize_2i[i]=osize_2[i];
844  ostart_1i[i]=ostart_1[i];
845  ostart_2i[i]=ostart_2[i];
846  }
847 
848 
849 
850  // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
851  plan->Mem_mgr= new Mem_Mgr_gpu(n[1],n[2],2,plan->row_comm,osize_0[0],alloc_max);
852  plan->T_plan_1= new T_Plan_gpu(n[1],n[2],2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
853  plan->T_plan_2= new T_Plan_gpu(n[0],n[1],2*osize_2[2], plan->Mem_mgr, plan->col_comm);
854  plan->T_plan_2i= new T_Plan_gpu(n[1],n[0],2*osize_2i[2], plan->Mem_mgr, plan->col_comm);
855  plan->T_plan_1i= new T_Plan_gpu(n[2],n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);
856 
857  plan->T_plan_1->alloc_local=plan->alloc_max;
858  plan->T_plan_2->alloc_local=plan->alloc_max;
859  plan->T_plan_2i->alloc_local=plan->alloc_max;
860  plan->T_plan_1i->alloc_local=plan->alloc_max;
861 
862 
863  // fplan_0
864  {
865  int f_inembed[1]={NZ};
866  int f_onembed[1]={NZ};
867  int idist=(NZ);
868  int odist=(NZ);
869  int istride=1;
870  int ostride=1;
871  int batch=osize_0[0]*osize_0[1];//NX;
872 
873  if(batch!=0)
874  {
875  cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
876  f_inembed, istride, idist, // *inembed, istride, idist
877  f_onembed, ostride, odist, // *onembed, ostride, odist
878  CUFFT_Z2Z, batch);
879  if(cufft_error!= CUFFT_SUCCESS)
880  {
881  fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
882  }
883  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
884  }
885  }
886  // fplan_1
887  {
888  int f_inembed[1]={NY};
889  int f_onembed[1]={NY};
890  int idist=1;
891  int odist=1;
892  int istride=osize_1[2];
893  int ostride=osize_1[2];
894  int batch=osize_1[2];
895 
896  if(batch!=0)
897  {
898  cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
899  f_inembed, istride, idist, // *inembed, istride, idist
900  f_onembed, ostride, odist, // *onembed, ostride, odist
901  CUFFT_Z2Z, batch);
902  if(cufft_error!= CUFFT_SUCCESS)
903  {
904  fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
905  }
906  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
907  }
908  }
909  // fplan_2
910  {
911  int f_inembed[1]={NX};
912  int f_onembed[1]={NX};
913  int idist=1;
914  int odist=1;
915  int istride=osize_2[1]*osize_2[2];
916  int ostride=osize_2[1]*osize_2[2];
917  int batch=osize_2[1]*osize_2[2];;
918 
919  if(batch!=0)
920  {
921  cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
922  f_inembed, istride, idist, // *inembed, istride, idist
923  f_onembed, ostride, odist, // *onembed, ostride, odist
924  CUFFT_Z2Z, batch);
925  if(cufft_error!= CUFFT_SUCCESS)
926  {
927  fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
928  }
929  //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
930  }
931  }
932 
933  plan->iplan_0=NULL;
934  plan->iplan_1=NULL;
935  plan->iplan_2=NULL;
936 
937  int coords[2],np[2],periods[2];
938  MPI_Cart_get(c_comm,2,np,periods,coords);
939  int transpose_method=0;
940  int kway_method=0;
941  if(coords[0]==0){
942  plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(double*)data_out_d);
943  transpose_method=plan->T_plan_1->method;
944  kway_method=plan->T_plan_1->kway;
945  }
946  checkCuda_accfft (cudaDeviceSynchronize());
947  MPI_Barrier(plan->c_comm);
948 
949 
950  MPI_Bcast(&transpose_method,1, MPI_INT,0, c_comm);
951  MPI_Bcast(&kway_method,1, MPI_INT,0, c_comm);
952  MPI_Barrier(c_comm);
953  plan->T_plan_1->method=transpose_method;
954  plan->T_plan_2->method= transpose_method;
955  plan->T_plan_2i->method=transpose_method;
956  plan->T_plan_1i->method=transpose_method;
957 
958  plan->T_plan_1->kway=kway_method;
959  plan->T_plan_2->kway= kway_method;
960  plan->T_plan_2i->kway=kway_method;
961  plan->T_plan_1i->kway=kway_method;
962 
963 
964  }
965  return plan;
966 }
967 
968 
978 void accfft_execute_c2c_gpu(accfft_plan_gpu* plan, int direction,Complex * data_d, Complex * data_out_d, double * timer){
979 
980  if(data_d==NULL)
981  data_d=plan->data_c;
982  if(data_out_d==NULL)
983  data_out_d=plan->data_out_c;
984  int * coords=plan->coord;
985  int procid=plan->procid;
986  double fft_time=0;
987  double * timings;
988  if(timer==NULL){
989  timings=new double[5];
990  memset(timings,0,sizeof(double)*5);
991  }
992  else{
993  timings=timer;
994  }
995 
996  int NY=plan->N[1];
997  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
998  cudaEvent_t fft_startEvent, fft_stopEvent;
999  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
1000  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
1001  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
1002  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
1003  cufftResult_t cufft_error;
1004  float dummy_time=0;
1005 
1006  // 1D Decomposition
1007  if(plan->np[1]==1){
1008  if(direction==-1){
1009 
1010  /**************************************************************/
1011  /******************* N0/P0 x N1 x N2 *************************/
1012  /**************************************************************/
1013  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1014  checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1015  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1016  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1017  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1018  fft_time+=dummy_time/1000;
1019 
1020 
1021 
1022  MPI_Barrier(plan->c_comm);
1023  plan->T_plan_1->execute_gpu(plan->T_plan_1,(double*)data_out_d,timings,2);
1024  /**************************************************************/
1025  /******************* N1 x N0/P0 x N2 *************************/
1026  /**************************************************************/
1027  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1028  checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1029  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1030  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1031  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1032  fft_time+=dummy_time/1000;
1033  MPI_Barrier(plan->c_comm);
1034 
1035  }
1036 
1037  if(direction==1){
1038  /* Now Perform the inverse transform */
1039  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1040  checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_d,CUFFT_INVERSE));
1041  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1042  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1043  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1044  fft_time+=dummy_time/1000;
1045 
1046  plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(double*)data_d,timings,1);
1047  /**************************************************************/
1048  /******************* N0/P0 x N1 x N2 *************************/
1049  /**************************************************************/
1050 
1051  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1052  checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE));
1053  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1054  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1055  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1056  fft_time+=dummy_time/1000;
1057 
1058  }
1059  }
1060 
1061  // 2D Decomposition
1062  if(plan->np[1]!=1){
1063  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
1064  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
1065  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
1066  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
1067  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
1068 
1069  if(direction==-1){
1070  /**************************************************************/
1071  /******************* N0/P0 x N1/P1 x N2 **********************/
1072  /**************************************************************/
1073  // FFT in Z direction
1074  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1075  checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1076  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1077  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1078  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1079  fft_time+=dummy_time/1000;
1080 
1081  plan->T_plan_1->execute_gpu(plan->T_plan_1,(double*)data_out_d,timings,2,osize_0[0],coords[0]);
1082  checkCuda_accfft (cudaDeviceSynchronize());
1083  MPI_Barrier(plan->c_comm);
1084  /**************************************************************/
1085  /******************* N0/P0 x N1 x N2/P1 **********************/
1086  /**************************************************************/
1087  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1088  for (int i=0;i<osize_1[0];++i){
1089  checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
1090  }
1091  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1092  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1093  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1094  fft_time+=dummy_time/1000;
1095  MPI_Barrier(plan->c_comm);
1096 
1097 
1098 
1099  plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2,1,coords[1]);
1100  checkCuda_accfft (cudaDeviceSynchronize());
1101  MPI_Barrier(plan->c_comm);
1102  /**************************************************************/
1103  /******************* N0 x N1/P0 x N2/P1 **********************/
1104  /**************************************************************/
1105  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1106  checkCuda_accfft(cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1107  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1108  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1109  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1110  fft_time+=dummy_time/1000;
1111 
1112  }
1113  else if (direction==1){
1114  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1115  checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
1116  checkCuda_accfft (cudaDeviceSynchronize());
1117  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1118  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1119  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1120  fft_time+=dummy_time/1000;
1121  MPI_Barrier(plan->c_comm);
1122 
1123 
1124  plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1,1,coords[1]);
1125  checkCuda_accfft (cudaDeviceSynchronize());
1126  MPI_Barrier(plan->c_comm);
1127  /**************************************************************/
1128  /******************* N0/P0 x N1 x N2/P1 **********************/
1129  /**************************************************************/
1130  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1131  for (int i=0;i<osize_1i[0];++i){
1132  checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE));
1133  }
1134  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1135  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1136  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1137  fft_time+=dummy_time/1000;
1138  MPI_Barrier(plan->c_comm);
1139 
1140  plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(double*)data_d,timings,1,osize_1i[0],coords[0]);
1141  checkCuda_accfft (cudaDeviceSynchronize());
1142  MPI_Barrier(plan->c_comm);
1143  /**************************************************************/
1144  /******************* N0/P0 x N1/P1 x N2 **********************/
1145  /**************************************************************/
1146 
1147  // IFFT in Z direction
1148  checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1149  checkCuda_accfft (cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE));
1150  checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1151  checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
1152  checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1153  fft_time+=dummy_time/1000;
1154 
1155  }
1156 
1157  }
1158  timings[4]=fft_time;
1159  if(timer==NULL){
1160  delete [] timings;
1161  }
1162  MPI_Barrier(plan->c_comm);
1163 
1164  return;
1165 }
1166 
1167 
1172 void accfft_destroy_plan(accfft_plan_gpu * plan){
1173  return (accfft_destroy_plan_gpu(plan));
1174 }
1175 
1180 void accfft_destroy_plan_gpu(accfft_plan_gpu * plan){
1181 
1182  if(plan->T_plan_1!=NULL)delete(plan->T_plan_1);
1183  if(plan->T_plan_1i!=NULL)delete(plan->T_plan_1i);
1184  if(plan->T_plan_2!=NULL)delete(plan->T_plan_2);
1185  if(plan->T_plan_2i!=NULL)delete(plan->T_plan_2i);
1186  if(plan->Mem_mgr!=NULL)delete(plan->Mem_mgr);
1187 
1188  if(plan->fplan_0!=-1)cufftDestroy(plan->fplan_0);
1189  if(plan->fplan_1!=-1)cufftDestroy(plan->fplan_1);
1190  if(plan->fplan_2!=-1)cufftDestroy(plan->fplan_2);
1191 
1192  if(plan->iplan_0!=-1)cufftDestroy(plan->iplan_0);
1193  if(plan->iplan_1!=-1)cufftDestroy(plan->iplan_1);
1194  if(plan->iplan_2!=-1)cufftDestroy(plan->iplan_2);
1195 
1196  MPI_Comm_free(&plan->row_comm);
1197  MPI_Comm_free(&plan->col_comm);
1198  return;
1199 }
void accfft_cleanup_gpu()
Definition: accfft_gpu.cpp:44
void accfft_execute_c2c_gpu(accfft_plan_gpu *plan, int direction, Complex *data_d, Complex *data_out_d, double *timer)
Definition: accfft_gpu.cpp:978
void accfft_execute_c2r_gpu(accfft_plan_gpu *plan, Complex *data, double *data_out, double *timer)
Definition: accfft_gpu.cpp:647
accfft_plan_gpu * accfft_plan_dft_3d_r2c_gpu(int *n, double *data_d, double *data_out_d, MPI_Comm c_comm, unsigned flags)
Definition: accfft_gpu.cpp:149
void accfft_execute_r2c_gpu(accfft_plan_gpu *plan, double *data, Complex *data_out, double *timer)
Definition: accfft_gpu.cpp:631
void accfft_destroy_plan_gpu(accfft_plan_gpu *plan)
int accfft_local_size_dft_c2c_gpu(int *n, int *isize, int *istart, int *osize, int *ostart, MPI_Comm c_comm)
Definition: accfft_gpu.cpp:666
accfft_plan_gpu * accfft_plan_dft_3d_c2c_gpu(int *n, Complex *data_d, Complex *data_out_d, MPI_Comm c_comm, unsigned flags)
Definition: accfft_gpu.cpp:720
int accfft_local_size_dft_r2c_gpu(int *n, int *isize, int *istart, int *osize, int *ostart, MPI_Comm c_comm, bool inplace)
Definition: accfft_gpu.cpp:95
void accfft_destroy_plan(accfft_plan_gpu *plan)