25 #include "accfft_gpu.h"
31 #include "transpose_cuda.h"
32 #include <cuda_runtime_api.h>
36 #include "accfft_common.h"
38 #define PCOUT if(procid==0) std::cout
39 typedef double Complex[2];
48 int dfft_get_local_size_gpu(
int N0,
int N1,
int N2,
int * isize,
int * istart,MPI_Comm c_comm ){
50 MPI_Comm_rank(c_comm, &procid);
52 int coords[2],np[2],periods[2];
53 MPI_Cart_get(c_comm,2,np,periods,coords);
55 isize[0]=ceil(N0/(
double)np[0]);
56 isize[1]=ceil(N1/(
double)np[1]);
58 istart[0]=isize[0]*(coords[0]);
59 istart[1]=isize[1]*(coords[1]);
62 if((N0-isize[0]*coords[0])<isize[0]) {isize[0]=N0-isize[0]*coords[0]; isize[0]*=(int) isize[0]>0; istart[0]=N0-isize[0];}
63 if((N1-isize[1]*coords[1])<isize[1]) {isize[1]=N1-isize[1]*coords[1]; isize[1]*=(int) isize[1]>0; istart[1]=N1-isize[1];}
68 for(
int r=0;r<np[0];r++)
69 for(
int c=0;c<np[1];c++){
71 if((coords[0]==r) && (coords[1]==c))
72 std::cout<<coords[0]<<
","<<coords[1]<<
" isize[0]= "<<isize[0]<<
" isize[1]= "<<isize[1]<<
" isize[2]= "<<isize[2]<<
" istart[0]= "<<istart[0]<<
" istart[1]= "<<istart[1]<<
" istart[2]= "<<istart[2]<<std::endl;
77 int alloc_local=isize[0]*isize[1]*isize[2]*
sizeof(double);
98 int osize_0[3]={0}, ostart_0[3]={0};
99 int osize_1[3]={0}, ostart_1[3]={0};
100 int osize_2[3]={0}, ostart_2[3]={0};
103 int alloc_max=0,n_tuples;
105 n_tuples=(n[2]/2+1)*2;
106 alloc_local=dfft_get_local_size_gpu(n[0],n[1],n_tuples,osize_0,ostart_0,c_comm);
107 alloc_max=std::max(alloc_max, alloc_local);
108 alloc_local=dfft_get_local_size_gpu(n[0],n_tuples/2,n[1],osize_1,ostart_1,c_comm);
109 alloc_max=std::max(alloc_max, alloc_local*2);
110 alloc_local=dfft_get_local_size_gpu(n[1],n_tuples/2,n[0],osize_2,ostart_2,c_comm);
111 alloc_max=std::max(alloc_max, alloc_local*2);
113 std::swap(osize_1[1],osize_1[2]);
114 std::swap(ostart_1[1],ostart_1[2]);
116 std::swap(ostart_2[1],ostart_2[2]);
117 std::swap(ostart_2[0],ostart_2[1]);
118 std::swap(osize_2[1],osize_2[2]);
119 std::swap(osize_2[0],osize_2[1]);
124 dfft_get_local_size_gpu(n[0],n[1],n[2],isize,istart,c_comm);
130 ostart[0]=ostart_2[0];
131 ostart[1]=ostart_2[1];
132 ostart[2]=ostart_2[2];
150 accfft_plan_gpu *plan=
new accfft_plan_gpu;
152 MPI_Comm_rank(c_comm, &procid);
154 MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
156 int *coord=plan->coord;
157 MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
158 MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
159 plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
161 plan->data_out=data_out_d;
163 if(data_out_d==data_d){
165 else{plan->inplace=
false;}
169 int N0=n[0], N1=n[1], N2=n[2];
170 int n_tuples_o,n_tuples_i;
172 plan->inplace==
true ? n_tuples_i=(N2/2+1)*2: n_tuples_i=N2;
173 n_tuples_o=(N2/2+1)*2;
174 plan->Mem_mgr=
new Mem_Mgr_gpu(N0,N1,n_tuples_o,c_comm);
175 plan->T_plan_1=
new T_Plan_gpu(N0,N1,n_tuples_o, plan->Mem_mgr, c_comm);
176 plan->T_plan_1i=
new T_Plan_gpu(N1,N0,n_tuples_o,plan->Mem_mgr, c_comm);
178 int isize[3],osize[3],istart[3],ostart[3];
180 plan->alloc_max=alloc_max;
181 plan->T_plan_1->alloc_local=alloc_max;
182 plan->T_plan_1i->alloc_local=alloc_max;
187 int NX=n[0], NY=n[1], NZ=n[2];
188 int f_inembed[2]={NY,n_tuples_i};
189 int f_onembed[2]={NY,n_tuples_o/2};
190 int idist=NY*n_tuples_i;
191 int odist=NY*n_tuples_o/2;
194 int batch=plan->T_plan_1->local_n0;
196 cufftResult_t cufft_error;
199 cufft_error=cufftPlanMany(&plan->fplan_0, 2, &n[1],
200 f_inembed, istride, idist,
201 f_onembed, ostride, odist,
203 if(cufft_error!= CUFFT_SUCCESS){
204 fprintf(stderr,
"CUFFT error: fplan creation failed %d \n",cufft_error);
return NULL;
210 int local_n0=plan->T_plan_1->local_n0;
211 int local_n1=plan->T_plan_1->local_n1;
212 int f_inembed2[1]={NX};
213 int f_onembed2[1]={NX};
216 int istride2=local_n1*n_tuples_o/2;
217 int ostride2=local_n1*n_tuples_o/2;
218 if(plan->T_plan_1->local_n1*n_tuples_o/2!=0)
220 cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[0],
221 f_inembed2, istride2, idist2,
222 f_onembed2, ostride2, odist2,
223 CUFFT_Z2Z,plan->T_plan_1->local_n1*n_tuples_o/2);
224 if(cufft_error!= CUFFT_SUCCESS){
225 fprintf(stderr,
"CUFFT error: fplan2 creation failed %d\n",cufft_error);
return NULL;
233 cufft_error=cufftPlanMany(&plan->iplan_0, 2, &n[1],
234 f_onembed, ostride,odist ,
235 f_inembed, istride,idist,
237 if(cufft_error!= CUFFT_SUCCESS){
238 fprintf(stderr,
"CUFFT error: iplan creation failed %d\n",cufft_error);
return NULL;
245 static int method_static=0;
246 static int kway_static_2=0;
247 if(method_static==0){
248 plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d);
249 method_static=plan->T_plan_1->method;
250 kway_static_2=plan->T_plan_1->kway;
253 plan->T_plan_1->method=method_static;
254 plan->T_plan_1->kway=kway_static_2;
257 checkCuda_accfft (cudaDeviceSynchronize());
258 MPI_Barrier(plan->c_comm);
260 plan->T_plan_1->method=plan->T_plan_1->method;
261 plan->T_plan_1->kway=kway_static_2;
262 plan->T_plan_1i->method=plan->T_plan_1->method;
263 plan->T_plan_1i->kway=kway_static_2;
267 plan->T_plan_2i=NULL;
276 int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
277 int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
278 int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
279 int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
280 int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
283 int n_tuples_i, n_tuples_o;
285 plan->inplace==
true ? n_tuples_i=(n[2]/2+1)*2: n_tuples_i=n[2];
286 n_tuples_o=(n[2]/2+1)*2;
288 int isize[3],osize[3],istart[3],ostart[3];
290 plan->alloc_max=alloc_max;
292 dfft_get_local_size_gpu(n[0],n[1],n_tuples_o,osize_0,ostart_0,c_comm);
293 dfft_get_local_size_gpu(n[0],n_tuples_o/2,n[1],osize_1,ostart_1,c_comm);
294 dfft_get_local_size_gpu(n[1],n_tuples_o/2,n[0],osize_2,ostart_2,c_comm);
296 std::swap(osize_1[1],osize_1[2]);
297 std::swap(ostart_1[1],ostart_1[2]);
299 std::swap(ostart_2[1],ostart_2[2]);
300 std::swap(ostart_2[0],ostart_2[1]);
301 std::swap(osize_2[1],osize_2[2]);
302 std::swap(osize_2[0],osize_2[1]);
304 for(
int i=0;i<3;i++){
305 osize_1i[i]=osize_1[i];
306 osize_2i[i]=osize_2[i];
307 ostart_1i[i]=ostart_1[i];
308 ostart_2i[i]=ostart_2[i];
312 plan->Mem_mgr=
new Mem_Mgr_gpu(n[1],n_tuples_o/2,2,plan->row_comm,osize_0[0],alloc_max);
313 plan->T_plan_1=
new T_Plan_gpu(n[1],n_tuples_o/2,2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
314 plan->T_plan_2=
new T_Plan_gpu(n[0],n[1],osize_2[2]*2,plan->Mem_mgr, plan->col_comm);
315 plan->T_plan_2i=
new T_Plan_gpu(n[1],n[0],osize_2i[2]*2, plan->Mem_mgr, plan->col_comm);
316 plan->T_plan_1i=
new T_Plan_gpu(n_tuples_o/2,n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);
319 plan->T_plan_1->alloc_local=plan->alloc_max;
320 plan->T_plan_2->alloc_local=plan->alloc_max;
321 plan->T_plan_2i->alloc_local=plan->alloc_max;
322 plan->T_plan_1i->alloc_local=plan->alloc_max;
325 int NX=n[0], NY=n[1], NZ=n[2];
326 cufftResult_t cufft_error;
328 int f_inembed[1]={n_tuples_i};
329 int f_onembed[1]={n_tuples_o/2};
330 int idist=(n_tuples_i);
331 int odist=n_tuples_o/2;
334 int batch=osize_0[0]*osize_0[1];
338 cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
339 f_inembed, istride, idist,
340 f_onembed, ostride, odist,
342 if(cufft_error!= CUFFT_SUCCESS)
344 fprintf(stderr,
"CUFFT error: fplan_0 creation failed %d \n",cufft_error);
return NULL;
350 cufft_error=cufftPlanMany(&plan->iplan_0, 1, &n[2],
351 f_onembed, ostride, odist,
352 f_inembed, istride, idist,
354 if(cufft_error!= CUFFT_SUCCESS)
356 fprintf(stderr,
"CUFFT error: iplan_0 creation failed %d \n",cufft_error);
return NULL;
363 int f_inembed[1]={NY};
364 int f_onembed[1]={NY};
367 int istride=osize_1[2];
368 int ostride=osize_1[2];
369 int batch=osize_1[2];
373 cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
374 f_inembed, istride, idist,
375 f_onembed, ostride, odist,
377 if(cufft_error!= CUFFT_SUCCESS)
379 fprintf(stderr,
"CUFFT error: fplan_1 creation failed %d \n",cufft_error);
return NULL;
386 int f_inembed[1]={NX};
387 int f_onembed[1]={NX};
390 int istride=osize_2[1]*osize_2[2];
391 int ostride=osize_2[1]*osize_2[2];
392 int batch=osize_2[1]*osize_2[2];;
396 cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
397 f_inembed, istride, idist,
398 f_onembed, ostride, odist,
400 if(cufft_error!= CUFFT_SUCCESS)
402 fprintf(stderr,
"CUFFT error: fplan_2 creation failed %d \n",cufft_error);
return NULL;
410 static int method_static_2=0;
411 static int kway_static_2=0;
412 if(method_static_2==0){
414 plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d);
415 method_static_2=plan->T_plan_1->method;
416 kway_static_2=plan->T_plan_1->kway;
418 MPI_Bcast(&method_static_2,1, MPI_INT,0, c_comm );
419 MPI_Bcast(&kway_static_2,1, MPI_INT,0, c_comm );
422 checkCuda_accfft (cudaDeviceSynchronize());
423 MPI_Barrier(plan->c_comm);
424 plan->T_plan_1->method=method_static_2;
425 plan->T_plan_2->method=method_static_2;
426 plan->T_plan_2i->method=method_static_2;
427 plan->T_plan_1i->method=method_static_2;
428 plan->T_plan_1->kway=kway_static_2;
429 plan->T_plan_2->kway=kway_static_2;
430 plan->T_plan_2i->kway=kway_static_2;
431 plan->T_plan_1i->kway=kway_static_2;
442 void accfft_execute_gpu(accfft_plan_gpu* plan,
int direction,
double * data_d,
double * data_out_d,
double * timer){
447 data_out_d=plan->data_out;
449 int * coords=plan->coord;
450 int procid=plan->procid;
454 timings=
new double[5];
455 memset(timings,0,
sizeof(
double)*5);
461 cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
462 cudaEvent_t fft_startEvent, fft_stopEvent;
463 checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
464 checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
465 checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
466 checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
477 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
478 checkCuda_accfft(cufftExecD2Z(plan->fplan_0, (cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d));
479 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
480 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
481 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
482 fft_time+=dummy_time/1000;
484 MPI_Barrier(plan->c_comm);
485 plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2);
489 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
490 checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
491 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
492 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
493 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
494 fft_time+=dummy_time/1000;
500 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
501 checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_d,CUFFT_INVERSE));
502 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
503 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
504 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
505 fft_time+=dummy_time/1000;
507 plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1);
511 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
512 checkCuda_accfft(cufftExecZ2D(plan->iplan_0, (cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d));
513 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
514 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
515 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
516 fft_time+=dummy_time/1000;
523 int *osize_0 =plan->osize_0;
524 int *osize_1 =plan->osize_1;
526 int *osize_1i=plan->osize_1i;
534 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
535 checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d));
536 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
537 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
538 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
539 fft_time+=dummy_time/1000;
544 plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
548 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
549 for (
int i=0;i<osize_1[0];++i){
550 checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
552 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
553 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
554 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
555 fft_time+=dummy_time/1000;
556 MPI_Barrier(plan->c_comm);
558 plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
562 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
563 checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
564 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
565 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
566 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
567 fft_time+=dummy_time/1000;
569 else if (direction==1){
570 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
571 checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
572 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
573 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
574 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
575 fft_time+=dummy_time/1000;
577 MPI_Barrier(plan->c_comm);
580 plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]);
584 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
585 for (
int i=0;i<osize_1i[0];++i){
586 checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
588 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
589 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
590 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
591 fft_time+=dummy_time/1000;
592 MPI_Barrier(plan->c_comm);
596 plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]);
597 MPI_Barrier(plan->c_comm);
603 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
604 checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d));
605 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
606 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
607 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
608 fft_time+=dummy_time/1000;
616 MPI_Barrier(plan->c_comm);
632 accfft_execute_gpu(plan,-1,data,(
double*)data_out,timer);
648 accfft_execute_gpu(plan,1,(
double*)data,data_out,timer);
668 int osize_0[3]={0}, ostart_0[3]={0};
669 int osize_1[3]={0}, ostart_1[3]={0};
670 int osize_2[3]={0}, ostart_2[3]={0};
676 alloc_local=dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
677 alloc_max=std::max(alloc_max, alloc_local);
678 alloc_local=dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
679 alloc_max=std::max(alloc_max, alloc_local);
680 alloc_local=dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm);
681 alloc_max=std::max(alloc_max, alloc_local);
684 std::swap(osize_1[1],osize_1[2]);
685 std::swap(ostart_1[1],ostart_1[2]);
687 std::swap(ostart_2[1],ostart_2[2]);
688 std::swap(ostart_2[0],ostart_2[1]);
689 std::swap(osize_2[1],osize_2[2]);
690 std::swap(osize_2[0],osize_2[1]);
695 dfft_get_local_size_gpu(n[0],n[1],n[2],isize,istart,c_comm);
701 ostart[0]=ostart_2[0];
702 ostart[1]=ostart_2[1];
703 ostart[2]=ostart_2[2];
721 accfft_plan_gpu *plan=
new accfft_plan_gpu;
723 MPI_Comm_rank(c_comm, &procid);
725 MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
727 int *coord=plan->coord;
728 MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
729 MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
730 plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
731 int NX=n[0], NY=n[1], NZ=n[2];
732 cufftResult_t cufft_error;
735 plan->data_out_c=data_out_d;
736 if(data_out_d==data_d){
738 else{plan->inplace=
false;}
742 int NX=n[0],NY=n[1],NZ=n[2];
745 int isize[3],osize[3],istart[3],ostart[3];
747 plan->alloc_max=alloc_max;
749 plan->Mem_mgr=
new Mem_Mgr_gpu(NX,NY,(NZ)*2,c_comm);
750 plan->T_plan_1=
new T_Plan_gpu(NX,NY,(NZ)*2, plan->Mem_mgr,c_comm);
751 plan->T_plan_1i=
new T_Plan_gpu(NY,NX,NZ*2, plan->Mem_mgr,c_comm);
753 plan->T_plan_1->alloc_local=alloc_max;
754 plan->T_plan_1i->alloc_local=alloc_max;
756 ptrdiff_t local_n0=plan->T_plan_1->local_n0;
757 ptrdiff_t local_n1=plan->T_plan_1->local_n1;
758 int N0=NX, N1=NY, N2=NZ;
761 int n[3] = {NX, NY, NZ};
762 int f_inembed[2]={NY,(NZ)};
763 int f_onembed[2]={NY,NZ};
768 int batch=plan->T_plan_1->local_n0;
770 cufftResult_t cufft_error;
773 cufft_error=cufftPlanMany(&plan->fplan_0, 2, &n[1],
774 f_inembed, istride, idist,
775 f_onembed, ostride, odist,
777 if(cufft_error!= CUFFT_SUCCESS)
779 fprintf(stderr,
"CUFFT error: fplan creation failed %d \n",cufft_error);
return NULL;
783 int f_inembed2[1]={NX};
784 int f_onembed2[1]={NX};
787 int istride2=local_n1*(NZ);
788 int ostride2=local_n1*(NZ);
791 cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[0],
792 f_inembed2, istride2, idist2,
793 f_onembed2, ostride2, odist2,
794 CUFFT_Z2Z, local_n1*(NZ));
795 if(cufft_error!= CUFFT_SUCCESS){
796 fprintf(stderr,
"CUFFT error: fplan2 creation failed %d\n",cufft_error);
return NULL;
801 plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(
double*)data_out_d);
802 plan->T_plan_1i->method=plan->T_plan_1->method;
803 plan->T_plan_1i->kway=plan->T_plan_1->kway;
807 plan->T_plan_2i=NULL;
815 int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
816 int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
817 int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
818 int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
819 int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
822 int alloc_max=0,n_tuples=n[2]*2;
824 int isize[3],osize[3],istart[3],ostart[3];
826 plan->alloc_max=alloc_max;
828 dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
829 dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
830 dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm);
833 std::swap(osize_1[1],osize_1[2]);
834 std::swap(ostart_1[1],ostart_1[2]);
836 std::swap(ostart_2[1],ostart_2[2]);
837 std::swap(ostart_2[0],ostart_2[1]);
838 std::swap(osize_2[1],osize_2[2]);
839 std::swap(osize_2[0],osize_2[1]);
841 for(
int i=0;i<3;i++){
842 osize_1i[i]=osize_1[i];
843 osize_2i[i]=osize_2[i];
844 ostart_1i[i]=ostart_1[i];
845 ostart_2i[i]=ostart_2[i];
851 plan->Mem_mgr=
new Mem_Mgr_gpu(n[1],n[2],2,plan->row_comm,osize_0[0],alloc_max);
852 plan->T_plan_1=
new T_Plan_gpu(n[1],n[2],2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
853 plan->T_plan_2=
new T_Plan_gpu(n[0],n[1],2*osize_2[2], plan->Mem_mgr, plan->col_comm);
854 plan->T_plan_2i=
new T_Plan_gpu(n[1],n[0],2*osize_2i[2], plan->Mem_mgr, plan->col_comm);
855 plan->T_plan_1i=
new T_Plan_gpu(n[2],n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);
857 plan->T_plan_1->alloc_local=plan->alloc_max;
858 plan->T_plan_2->alloc_local=plan->alloc_max;
859 plan->T_plan_2i->alloc_local=plan->alloc_max;
860 plan->T_plan_1i->alloc_local=plan->alloc_max;
865 int f_inembed[1]={NZ};
866 int f_onembed[1]={NZ};
871 int batch=osize_0[0]*osize_0[1];
875 cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
876 f_inembed, istride, idist,
877 f_onembed, ostride, odist,
879 if(cufft_error!= CUFFT_SUCCESS)
881 fprintf(stderr,
"CUFFT error: fplan_0 creation failed %d \n",cufft_error);
return NULL;
888 int f_inembed[1]={NY};
889 int f_onembed[1]={NY};
892 int istride=osize_1[2];
893 int ostride=osize_1[2];
894 int batch=osize_1[2];
898 cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
899 f_inembed, istride, idist,
900 f_onembed, ostride, odist,
902 if(cufft_error!= CUFFT_SUCCESS)
904 fprintf(stderr,
"CUFFT error: fplan_1 creation failed %d \n",cufft_error);
return NULL;
911 int f_inembed[1]={NX};
912 int f_onembed[1]={NX};
915 int istride=osize_2[1]*osize_2[2];
916 int ostride=osize_2[1]*osize_2[2];
917 int batch=osize_2[1]*osize_2[2];;
921 cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
922 f_inembed, istride, idist,
923 f_onembed, ostride, odist,
925 if(cufft_error!= CUFFT_SUCCESS)
927 fprintf(stderr,
"CUFFT error: fplan_2 creation failed %d \n",cufft_error);
return NULL;
937 int coords[2],np[2],periods[2];
938 MPI_Cart_get(c_comm,2,np,periods,coords);
939 int transpose_method=0;
942 plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(
double*)data_out_d);
943 transpose_method=plan->T_plan_1->method;
944 kway_method=plan->T_plan_1->kway;
946 checkCuda_accfft (cudaDeviceSynchronize());
947 MPI_Barrier(plan->c_comm);
950 MPI_Bcast(&transpose_method,1, MPI_INT,0, c_comm);
951 MPI_Bcast(&kway_method,1, MPI_INT,0, c_comm);
953 plan->T_plan_1->method=transpose_method;
954 plan->T_plan_2->method= transpose_method;
955 plan->T_plan_2i->method=transpose_method;
956 plan->T_plan_1i->method=transpose_method;
958 plan->T_plan_1->kway=kway_method;
959 plan->T_plan_2->kway= kway_method;
960 plan->T_plan_2i->kway=kway_method;
961 plan->T_plan_1i->kway=kway_method;
978 void accfft_execute_c2c_gpu(accfft_plan_gpu* plan,
int direction,Complex * data_d, Complex * data_out_d,
double * timer){
983 data_out_d=plan->data_out_c;
984 int * coords=plan->coord;
985 int procid=plan->procid;
989 timings=
new double[5];
990 memset(timings,0,
sizeof(
double)*5);
997 cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
998 cudaEvent_t fft_startEvent, fft_stopEvent;
999 checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
1000 checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
1001 checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
1002 checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
1003 cufftResult_t cufft_error;
1013 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1014 checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1015 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1016 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1017 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1018 fft_time+=dummy_time/1000;
1022 MPI_Barrier(plan->c_comm);
1023 plan->T_plan_1->execute_gpu(plan->T_plan_1,(
double*)data_out_d,timings,2);
1027 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1028 checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1029 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1030 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1031 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1032 fft_time+=dummy_time/1000;
1033 MPI_Barrier(plan->c_comm);
1039 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1040 checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_d,CUFFT_INVERSE));
1041 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1042 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1043 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1044 fft_time+=dummy_time/1000;
1046 plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(
double*)data_d,timings,1);
1051 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1052 checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE));
1053 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1054 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1055 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1056 fft_time+=dummy_time/1000;
1063 int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
1064 int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
1065 int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
1066 int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
1067 int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;
1074 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1075 checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1076 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1077 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1078 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1079 fft_time+=dummy_time/1000;
1081 plan->T_plan_1->execute_gpu(plan->T_plan_1,(
double*)data_out_d,timings,2,osize_0[0],coords[0]);
1082 checkCuda_accfft (cudaDeviceSynchronize());
1083 MPI_Barrier(plan->c_comm);
1087 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1088 for (
int i=0;i<osize_1[0];++i){
1089 checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
1091 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1092 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1093 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1094 fft_time+=dummy_time/1000;
1095 MPI_Barrier(plan->c_comm);
1099 plan->T_plan_2->execute_gpu(plan->T_plan_2,(
double*)data_out_d,timings,2,1,coords[1]);
1100 checkCuda_accfft (cudaDeviceSynchronize());
1101 MPI_Barrier(plan->c_comm);
1105 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1106 checkCuda_accfft(cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
1107 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1108 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1109 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1110 fft_time+=dummy_time/1000;
1113 else if (direction==1){
1114 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1115 checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
1116 checkCuda_accfft (cudaDeviceSynchronize());
1117 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1118 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1119 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1120 fft_time+=dummy_time/1000;
1121 MPI_Barrier(plan->c_comm);
1124 plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(
double*)data_d,timings,1,1,coords[1]);
1125 checkCuda_accfft (cudaDeviceSynchronize());
1126 MPI_Barrier(plan->c_comm);
1130 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1131 for (
int i=0;i<osize_1i[0];++i){
1132 checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE));
1134 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1135 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1136 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1137 fft_time+=dummy_time/1000;
1138 MPI_Barrier(plan->c_comm);
1140 plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(
double*)data_d,timings,1,osize_1i[0],coords[0]);
1141 checkCuda_accfft (cudaDeviceSynchronize());
1142 MPI_Barrier(plan->c_comm);
1148 checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
1149 checkCuda_accfft (cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE));
1150 checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
1151 checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) );
1152 checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
1153 fft_time+=dummy_time/1000;
1158 timings[4]=fft_time;
1162 MPI_Barrier(plan->c_comm);
1182 if(plan->T_plan_1!=NULL)
delete(plan->T_plan_1);
1183 if(plan->T_plan_1i!=NULL)
delete(plan->T_plan_1i);
1184 if(plan->T_plan_2!=NULL)
delete(plan->T_plan_2);
1185 if(plan->T_plan_2i!=NULL)
delete(plan->T_plan_2i);
1186 if(plan->Mem_mgr!=NULL)
delete(plan->Mem_mgr);
1188 if(plan->fplan_0!=-1)cufftDestroy(plan->fplan_0);
1189 if(plan->fplan_1!=-1)cufftDestroy(plan->fplan_1);
1190 if(plan->fplan_2!=-1)cufftDestroy(plan->fplan_2);
1192 if(plan->iplan_0!=-1)cufftDestroy(plan->iplan_0);
1193 if(plan->iplan_1!=-1)cufftDestroy(plan->iplan_1);
1194 if(plan->iplan_2!=-1)cufftDestroy(plan->iplan_2);
1196 MPI_Comm_free(&plan->row_comm);
1197 MPI_Comm_free(&plan->col_comm);
void accfft_cleanup_gpu()
void accfft_execute_c2c_gpu(accfft_plan_gpu *plan, int direction, Complex *data_d, Complex *data_out_d, double *timer)
void accfft_execute_c2r_gpu(accfft_plan_gpu *plan, Complex *data, double *data_out, double *timer)
accfft_plan_gpu * accfft_plan_dft_3d_r2c_gpu(int *n, double *data_d, double *data_out_d, MPI_Comm c_comm, unsigned flags)
void accfft_execute_r2c_gpu(accfft_plan_gpu *plan, double *data, Complex *data_out, double *timer)
void accfft_destroy_plan_gpu(accfft_plan_gpu *plan)
int accfft_local_size_dft_c2c_gpu(int *n, int *isize, int *istart, int *osize, int *ostart, MPI_Comm c_comm)
accfft_plan_gpu * accfft_plan_dft_3d_c2c_gpu(int *n, Complex *data_d, Complex *data_out_d, MPI_Comm c_comm, unsigned flags)
int accfft_local_size_dft_r2c_gpu(int *n, int *isize, int *istart, int *osize, int *ostart, MPI_Comm c_comm, bool inplace)
void accfft_destroy_plan(accfft_plan_gpu *plan)