23 #include <transpose_cuda.h>
28 #include <cuda_runtime_api.h>
30 #define PCOUT if(procid==0) std::cout
43 static bool IsPowerOfTwo(ulong x)
45 return (x & (x - 1)) == 0;
// Constructor: computes the per-process slab decomposition of an N0 x N1
// (x n_tuples) array and allocates the host/device staging buffers shared by
// all transpose plans. If specified_alloc_local == 0 the buffer size is
// derived from the larger of the two slab layouts; otherwise the caller's
// byte count is used directly.
// NOTE(review): this listing is missing lines (original numbering jumps),
// so some closing braces / else-branches are not visible here.
49 Mem_Mgr_gpu::Mem_Mgr_gpu(
int N0,
int N1,
int tuples, MPI_Comm Comm,
int howmany,
int specified_alloc_local){
55 MPI_Comm_rank(Comm, &procid);
56 MPI_Comm_size(Comm,&nprocs);
// Derive alloc_local from the decomposition only when the caller did not
// pass an explicit size.
60 if(specified_alloc_local==0){
62 ptrdiff_t * local_n0_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
63 ptrdiff_t * local_n1_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
64 #pragma omp parallel for
65 for (
int proc=0;proc<nprocs;++proc){
// Each rank nominally owns ceil(N/nprocs) rows/columns ...
66 local_n0_proc[proc]=ceil(N[0]/(
double)nprocs);
67 local_n1_proc[proc]=ceil(N[1]/(
double)nprocs);
// ... and the trailing ranks get the remainder; the `*= (int)(x>0)` trick
// clamps a negative leftover to zero for ranks past the end of the data.
70 if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
71 if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}
76 local_n0=local_n0_proc[procid];
77 local_n1=local_n1_proc[procid];
// Buffer must hold the larger of the row-slab (local_n0 x N[1]) and
// column-slab (local_n1 x N[0]) layouts, in doubles.
84 alloc_local=local_n0*N[1]*n_tuples*
sizeof(double);
85 if(alloc_local<local_n1*N[0]*n_tuples*
sizeof(
double))
86 alloc_local=local_n1*N[0]*n_tuples*
sizeof(
double);
91 alloc_local=specified_alloc_local;
// Pinned host memory is only attempted for buffers up to ~1.05 GiB;
// larger requests fall through to posix_memalign below.
93 if( alloc_local<=1.05*std::pow(2,30) )
100 cudaError_t cuda_err1, cuda_err2;
101 double pinned_time=-MPI_Wtime();
102 buffer=NULL; buffer_2=NULL;
103 cuda_err1=cudaMallocHost((
void**)&buffer,alloc_local);
104 cuda_err2=cudaMallocHost((
void**)&buffer_2,alloc_local);
105 if(cuda_err1!=cudaSuccess || cuda_err2!=cudaSuccess){
106 std::cout<<
"!!!!!!!!!! Failed to cudaMallocHost in MemMgr"<<std::endl;
108 pinned_time+=MPI_Wtime();
// Fallback path: 64-byte-aligned pageable host memory when pinning failed
// or was skipped (branch structure partially elided in this listing).
112 posix_memalign((
void **)&buffer_2,64, alloc_local);
113 posix_memalign((
void **)&buffer,64, alloc_local);
// Device-side staging buffer.
// NOTE(review): cudaMalloc's return code is not checked here.
115 cudaMalloc((
void **)&buffer_d, alloc_local);
117 posix_memalign((
void **)&buffer,64, alloc_local);
118 posix_memalign((
void **)&buffer_2,64, alloc_local);
// Zero the host buffers (buffer_d is left uninitialized).
120 memset( buffer,0, alloc_local );
121 memset( buffer_2,0, alloc_local );
// Destructor: releases the pinned host buffers and the device buffer,
// reporting (but not aborting on) CUDA errors.
124 Mem_Mgr_gpu::~Mem_Mgr_gpu(){
127 cudaError_t cuda_err1=cudaSuccess, cuda_err2=cudaSuccess,cuda_err3=cudaSuccess;
129 if(buffer!=NULL) cuda_err1=cudaFreeHost(buffer);
// FIXME(review): guard tests `buffer` but frees `buffer_2` — almost
// certainly should test `buffer_2`. As written, buffer_2 leaks when buffer
// is NULL, and a NULL/unallocated buffer_2 may be passed to cudaFreeHost
// when buffer is non-NULL.
130 if(buffer!=NULL) cuda_err2=cudaFreeHost(buffer_2);
131 if(cuda_err1!=cudaSuccess || cuda_err2!=cudaSuccess){
132 std::cout<<
"!!!!!!!!!! Failed to cudaFreeHost in MemMgr; err1= "<<cuda_err1<<
" err2= "<<cuda_err2<<std::endl;
// Free the device staging buffer (no NULL guard here, unlike the host side).
139 cuda_err3=cudaFree(buffer_d);
140 if(cuda_err3!=cudaSuccess){
141 std::cout<<
"!!!!!!!!!! Failed to cudaFree in MemMgr; err3= "<<cuda_err3<<std::endl;
// Constructor: builds a transpose plan on top of a Mem_Mgr_gpu. Computes the
// per-rank slab sizes and start offsets, the all-to-all send/receive counts
// and offsets (plain, _f = scaled by howmany, _w = byte offsets), and commits
// MPI vector datatypes used for the howmany-interleaved exchange. Buffers are
// borrowed from Mem_mgr, not owned.
// NOTE(review): lines are elided in this listing (original numbering jumps),
// so several guards/braces are not visible.
148 T_Plan_gpu::T_Plan_gpu(
int N0,
int N1,
int tuples, Mem_Mgr_gpu * Mem_mgr, MPI_Comm Comm,
int howmany){
153 MPI_Comm_rank(Comm, &procid);
154 MPI_Comm_size(Comm,&nprocs);
156 local_n0_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
157 local_n1_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
158 local_0_start_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
159 local_1_start_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
// Same ceil/remainder decomposition as Mem_Mgr_gpu's constructor, plus a
// prefix sum for the start offsets.
163 local_0_start_proc[0]=0;local_1_start_proc[0]=0;
164 for (
int proc=0;proc<nprocs;++proc){
165 local_n0_proc[proc]=ceil(N[0]/(
double)nprocs);
166 local_n1_proc[proc]=ceil(N[1]/(
double)nprocs);
169 if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
170 if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}
// Prefix-sum of slab sizes; reads [proc-1], so presumably guarded by an
// elided `if(proc != 0)` — TODO confirm against the original file.
173 local_0_start_proc[proc]=local_0_start_proc[proc-1]+local_n0_proc[proc-1];
174 local_1_start_proc[proc]=local_1_start_proc[proc-1]+local_n1_proc[proc-1];
179 local_n0=local_n0_proc[procid];
180 local_n1=local_n1_proc[procid];
181 local_0_start=local_0_start_proc[procid];
182 local_1_start=local_1_start_proc[procid];
// Buffer size comes from the shared memory manager.
185 alloc_local=Mem_mgr->alloc_local;
// nprocs_0 / nprocs_1: number of ranks that actually own data in each
// distribution (increments are in elided lines).
187 nprocs_0=0; nprocs_1=0;
188 for (
int proc=0;proc<nprocs;++proc){
189 if(local_n0_proc[proc]!=0)
191 if(local_n1_proc[proc]!=0)
// Per-rank exchange metadata: counts, offsets, howmany-scaled (_f) and
// byte-offset (_w) variants.
198 scount_proc=(
int*) malloc(
sizeof(
int)*nprocs);
199 rcount_proc=(
int*) malloc(
sizeof(
int)*nprocs);
200 soffset_proc=(
int*) malloc(
sizeof(
int)*nprocs);
201 roffset_proc=(
int*) malloc(
sizeof(
int)*nprocs);
203 scount_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
204 rcount_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
205 soffset_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
206 roffset_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
208 scount_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
209 rcount_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
210 soffset_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
211 roffset_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
// Two fill strategies depending on which distribution is wider; offsets are
// running sums of the counts, with zero-count ranks reset to offset 0.
// Branch/brace structure is partially elided in this listing.
223 if(nprocs_1>nprocs_0)
224 for (
int proc=0;proc<nprocs;++proc){
226 scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;
228 if(scount_proc[proc]!=0)
229 rcount_proc[proc]=local_n1_proc[proc]*local_n0_proc[proc]*n_tuples;
231 rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;
233 soffset_proc[proc]=0;
234 roffset_proc[proc]=0;
236 soffset_proc[proc]=0;
237 roffset_proc[proc]=0;
240 soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
241 roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
248 if(scount_proc[proc]==0) soffset_proc[proc]=0;
252 roffset_proc[proc]=0;
// Track the last non-empty receive for the uneven tail of the exchange.
254 if(rcount_proc[proc]!=0)
255 last_recv_count=rcount_proc[proc];
256 if(local_n1_proc[proc]!=0)
257 last_local_n1=local_n1_proc[proc];
259 else if(nprocs_1<=nprocs_0)
260 for (
int proc=0;proc<nprocs;++proc){
262 scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;
263 rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;
267 soffset_proc[proc]=0;
268 roffset_proc[proc]=0;
270 soffset_proc[proc]=0;
271 roffset_proc[proc]=0;
274 soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
275 roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
281 roffset_proc[proc]=0;
282 soffset_proc[proc]=0;
285 if(scount_proc[proc]==0) soffset_proc[proc]=0;
289 soffset_proc[proc]=0;
291 if(rcount_proc[proc]!=0)
292 last_recv_count=rcount_proc[proc];
294 if(local_n1_proc[proc]!=0)
295 last_local_n1=local_n1_proc[proc];
// Even distribution means every rank holds exactly N/nprocs in both layouts,
// which enables the simpler all-to-all code paths.
299 is_evenly_distributed=0;
300 if((local_n0*nprocs_0-N[0])==0 && (local_n1*nprocs_1-N[1])==0 && nprocs_0==nprocs_1 && nprocs_0==nprocs){
301 is_evenly_distributed=1;
307 stype=
new MPI_Datatype[nprocs];
308 rtype=
new MPI_Datatype[nprocs];
// One vector datatype per peer: `howmany` blocks of scount/rcount doubles,
// strided by the full local slab so interleaved batches exchange in one op.
313 for (
int i=0;i<nprocs;i++){
314 MPI_Type_vector(howmany,scount_proc[i],local_n0*N[1]*n_tuples, MPI_DOUBLE, &stype[i]);
315 MPI_Type_vector(howmany,rcount_proc[i],local_n1*N[0]*n_tuples, MPI_DOUBLE, &rtype[i]);
317 MPI_Type_commit(&stype[i]);
318 MPI_Type_commit(&rtype[i]);
// _w offsets are in bytes: *8 == sizeof(double).
320 soffset_proc_w[i]=soffset_proc[i]*8;
321 roffset_proc_w[i]=roffset_proc[i]*8;
325 soffset_proc_f[i]=soffset_proc[i]*howmany;
326 roffset_proc_f[i]=roffset_proc[i]*howmany;
327 scount_proc_f[i]= scount_proc[i]*howmany;
328 rcount_proc_f[i]= rcount_proc[i]*howmany;
// Borrow (do not own) the staging buffers from the memory manager.
352 buffer=Mem_mgr->buffer;
353 buffer_2=Mem_mgr->buffer_2;
354 buffer_d=Mem_mgr->buffer_d;
// Destructor: releases the plan's own malloc'd metadata arrays. The data
// buffers (buffer/buffer_2/buffer_d) belong to Mem_Mgr_gpu and are not freed
// here.
// NOTE(review): frees for local_n0_proc/local_n1_proc, the count arrays, and
// the MPI datatype cleanup are not visible in this listing — presumably in
// the elided lines; verify against the original file before assuming a leak.
358 T_Plan_gpu::~T_Plan_gpu(){
361 free(local_0_start_proc);
362 free(local_1_start_proc);
370 free(soffset_proc_w);
371 free(roffset_proc_w);
375 free(soffset_proc_f);
376 free(roffset_proc_f);
388 void T_Plan_gpu::which_method_gpu(T_Plan_gpu* T_plan,
double* data_d){
391 double * time= (
double*) malloc(
sizeof(
double)*(4*(int)log2(nprocs)+4));
392 double * g_time= (
double*) malloc(
sizeof(
double)*(4*(int)log2(nprocs)+4));
393 for (
int i=0;i<4*(int)log2(nprocs)+4;i++)
396 transpose_cuda_v5(T_plan,(
double*)data_d,dummy);
397 time[0]=-MPI_Wtime();
398 transpose_cuda_v5(T_plan,(
double*)data_d,dummy);
399 time[0]+=MPI_Wtime();
401 transpose_cuda_v6(T_plan,(
double*)data_d,dummy);
402 time[1]=-MPI_Wtime();
403 transpose_cuda_v6(T_plan,(
double*)data_d,dummy);
404 time[1]+=MPI_Wtime();
406 if(IsPowerOfTwo(nprocs) && nprocs>511){
409 for (
int i=0;i<(int)log2(nprocs)-4;i++){
410 kway=nprocs/std::pow(2,i);
411 MPI_Barrier(T_plan->comm);
412 transpose_cuda_v7(T_plan,(
double*)data_d,dummy,kway);
413 time[2+i]=-MPI_Wtime();
414 transpose_cuda_v7(T_plan,(
double*)data_d,dummy,kway);
415 time[2+i]+=MPI_Wtime();
421 for (
int i=0;i<(int)log2(nprocs)-4;i++){
422 kway=nprocs/std::pow(2,i);
423 MPI_Barrier(T_plan->comm);
424 transpose_cuda_v7(T_plan,(
double*)data_d,dummy,kway);
425 time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
426 transpose_cuda_v7(T_plan,(
double*)data_d,dummy,kway);
427 time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
433 for (
int i=0;i<(int)log2(nprocs)-4;i++){
434 kway=nprocs/std::pow(2,i);
435 MPI_Barrier(T_plan->comm);
436 transpose_cuda_v7_2(T_plan,(
double*)data_d,dummy,kway);
437 time[2+2*(int)log2(nprocs)+i]=-MPI_Wtime();
438 transpose_cuda_v7_2(T_plan,(
double*)data_d,dummy,kway);
439 time[2+2*(int)log2(nprocs)+i]+=MPI_Wtime();
445 for (
int i=0;i<(int)log2(nprocs)-4;i++){
446 kway=nprocs/std::pow(2,i);
447 MPI_Barrier(T_plan->comm);
448 transpose_cuda_v7_2(T_plan,(
double*)data_d,dummy,kway);
449 time[2+3*(int)log2(nprocs)+i]=-MPI_Wtime();
450 transpose_cuda_v7_2(T_plan,(
double*)data_d,dummy,kway);
451 time[2+3*(int)log2(nprocs)+i]+=MPI_Wtime();
456 transpose_cuda_v5_2(T_plan,(
double*)data_d,dummy);
457 time[4*(int)log2(nprocs)+2]=-MPI_Wtime();
458 transpose_cuda_v5_2(T_plan,(
double*)data_d,dummy);
459 time[4*(int)log2(nprocs)+2]+=MPI_Wtime();
461 transpose_cuda_v5_3(T_plan,(
double*)data_d,dummy);
462 time[4*(int)log2(nprocs)+3]=-MPI_Wtime();
463 transpose_cuda_v5_3(T_plan,(
double*)data_d,dummy);
464 time[4*(int)log2(nprocs)+3]+=MPI_Wtime();
466 MPI_Allreduce(time,g_time,(4*(
int)log2(nprocs)+4),MPI_DOUBLE,MPI_MAX, T_plan->comm);
469 if(T_plan->procid==0){
470 for(
int i=0;i<4*(int)log2(nprocs)+4;++i)
471 std::cout<<
" time["<<i<<
"]= "<<g_time[i]<<
" , ";
475 double smallest=1000;
476 for (
int i=0;i<4*(int)log2(nprocs)+4;i++)
477 smallest=std::min(smallest,g_time[i]);
479 if(g_time[0]==smallest){
482 else if(g_time[1]==smallest){
485 else if(g_time[4*(
int)log2(nprocs)+2]==smallest){
488 else if(g_time[4*(
int)log2(nprocs)+3]==smallest){
492 for (
int i=0;i<(int)log2(nprocs);i++)
493 if(g_time[2+i]==smallest){
495 T_plan->kway=nprocs/std::pow(2,i);
496 T_plan->kway_async=
true;
499 for (
int i=0;i<(int)log2(nprocs);i++)
500 if(g_time[2+(
int)log2(nprocs)+i]==smallest){
502 T_plan->kway=nprocs/std::pow(2,i);
503 T_plan->kway_async=
false;
507 for (
int i=0;i<(int)log2(nprocs);i++)
508 if(g_time[2+2*(
int)log2(nprocs)+i]==smallest){
510 T_plan->kway=nprocs/std::pow(2,i);
511 T_plan->kway_async=
true;
515 for (
int i=0;i<(int)log2(nprocs);i++)
516 if(g_time[2+3*(
int)log2(nprocs)+i]==smallest){
518 T_plan->kway=nprocs/std::pow(2,i);
519 T_plan->kway_async=
false;
525 PCOUT<<
"smallest= "<<smallest<<std::endl;
526 PCOUT<<
"Using transpose v"<<method<<
" kway= "<<T_plan->kway<<
" kway_async="<<T_plan->kway_async<<std::endl;
530 MPI_Barrier(T_plan->comm);
534 void T_Plan_gpu::execute_gpu(T_Plan_gpu* T_plan,
double* data_d,
double *timings,
unsigned flags,
int howmany,
int tag){
539 fast_transpose_cuda_v1(T_plan,(
double*)data_d,timings,flags,howmany, tag);
541 fast_transpose_cuda_v1_2(T_plan,(
double*)data_d,timings,flags,howmany, tag);
543 fast_transpose_cuda_v1_3(T_plan,(
double*)data_d,timings,flags,howmany, tag);
545 fast_transpose_cuda_v2(T_plan,(
double*)data_d,timings,flags,howmany, tag);
547 fast_transpose_cuda_v3(T_plan,(
double*)data_d,timings,kway,flags,howmany, tag);
549 fast_transpose_cuda_v3_2(T_plan,(
double*)data_d,timings,kway,flags,howmany, tag);
554 fast_transpose_cuda_v1_h(T_plan,(
double*)data_d,timings,flags,howmany, tag);
556 fast_transpose_cuda_v1_2_h(T_plan,(
double*)data_d,timings,flags,howmany, tag);
558 fast_transpose_cuda_v1_3_h(T_plan,(
double*)data_d,timings,flags,howmany, tag);
560 fast_transpose_cuda_v2_h(T_plan,(
double*)data_d,timings,flags,howmany, tag);
561 if(method==3 || method==32)
562 fast_transpose_cuda_v3_h(T_plan,(
double*)data_d,timings,kway,flags,howmany, tag);
565 transpose_cuda_v5(T_plan,(
double*)data_d,timings,flags,howmany, tag);
567 transpose_cuda_v5_2(T_plan,(
double*)data_d,timings,flags,howmany, tag);
569 transpose_cuda_v5_3(T_plan,(
double*)data_d,timings,flags,howmany, tag);
571 transpose_cuda_v6(T_plan,(
double*)data_d,timings,flags,howmany, tag);
573 transpose_cuda_v7(T_plan,(
double*)data_d,timings,kway,flags,howmany, tag);
575 transpose_cuda_v7_2(T_plan,(
double*)data_d,timings,kway,flags,howmany, tag);
// Auto-tuner for the fast_* variants; structurally identical to
// which_method_gpu but benchmarks fast_transpose_cuda_v1/v2/v3/v3_2/v1_2/
// v1_3 (flags fixed to 2) and stores the winner in T_plan
// (method / kway / kway_async). Same timing-array layout and MPI_MAX
// reduction as which_method_gpu.
581 void T_Plan_gpu::which_fast_method_gpu(T_Plan_gpu* T_plan,
double* data_d){
584 double * time= (
double*) malloc(
sizeof(
double)*(4*(int)log2(nprocs)+4));
585 double * g_time= (
double*) malloc(
sizeof(
double)*(4*(int)log2(nprocs)+4));
586 for (
int i=0;i<4*(int)log2(nprocs)+4;i++)
// Warm-up, then timed run, per variant.
589 fast_transpose_cuda_v1(T_plan,(
double*)data_d,dummy,2);
590 time[0]=-MPI_Wtime();
591 fast_transpose_cuda_v1(T_plan,(
double*)data_d,dummy,2);
592 time[0]+=MPI_Wtime();
594 fast_transpose_cuda_v2(T_plan,(
double*)data_d,dummy,2);
595 time[1]=-MPI_Wtime();
596 fast_transpose_cuda_v2(T_plan,(
double*)data_d,dummy,2);
597 time[1]+=MPI_Wtime();
599 if(IsPowerOfTwo(nprocs) && nprocs>511){
602 for (
int i=0;i<(int)log2(nprocs)-4;i++){
603 kway=nprocs/std::pow(2,i);
604 MPI_Barrier(T_plan->comm);
605 fast_transpose_cuda_v3(T_plan,(
double*)data_d,dummy,kway,2);
606 time[2+i]=-MPI_Wtime();
607 fast_transpose_cuda_v3(T_plan,(
double*)data_d,dummy,kway,2);
608 time[2+i]+=MPI_Wtime();
614 for (
int i=0;i<(int)log2(nprocs)-4;i++){
615 kway=nprocs/std::pow(2,i);
616 MPI_Barrier(T_plan->comm);
617 fast_transpose_cuda_v3(T_plan,(
double*)data_d,dummy,kway,2);
618 time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
619 fast_transpose_cuda_v3(T_plan,(
double*)data_d,dummy,kway,2);
620 time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
626 for (
int i=0;i<(int)log2(nprocs)-4;i++){
627 kway=nprocs/std::pow(2,i);
628 MPI_Barrier(T_plan->comm);
629 fast_transpose_cuda_v3_2(T_plan,(
double*)data_d,dummy,kway,2);
630 time[2+2*(int)log2(nprocs)+i]=-MPI_Wtime();
631 fast_transpose_cuda_v3_2(T_plan,(
double*)data_d,dummy,kway,2);
632 time[2+2*(int)log2(nprocs)+i]+=MPI_Wtime();
638 for (
int i=0;i<(int)log2(nprocs)-4;i++){
639 kway=nprocs/std::pow(2,i);
640 MPI_Barrier(T_plan->comm);
641 fast_transpose_cuda_v3_2(T_plan,(
double*)data_d,dummy,kway,2);
642 time[2+3*(int)log2(nprocs)+i]=-MPI_Wtime();
643 fast_transpose_cuda_v3_2(T_plan,(
double*)data_d,dummy,kway,2);
644 time[2+3*(int)log2(nprocs)+i]+=MPI_Wtime();
649 fast_transpose_cuda_v1_2(T_plan,(
double*)data_d,dummy,2);
650 time[4*(int)log2(nprocs)+2]=-MPI_Wtime();
651 fast_transpose_cuda_v1_2(T_plan,(
double*)data_d,dummy,2);
652 time[4*(int)log2(nprocs)+2]+=MPI_Wtime();
654 fast_transpose_cuda_v1_3(T_plan,(
double*)data_d,dummy,2);
655 time[4*(int)log2(nprocs)+3]=-MPI_Wtime();
656 fast_transpose_cuda_v1_3(T_plan,(
double*)data_d,dummy,2);
657 time[4*(int)log2(nprocs)+3]+=MPI_Wtime();
// Max across ranks: the collective is as slow as its slowest rank.
659 MPI_Allreduce(time,g_time,(4*(
int)log2(nprocs)+4),MPI_DOUBLE,MPI_MAX, T_plan->comm);
661 if(T_plan->procid==0){
662 for(
int i=0;i<4*(int)log2(nprocs)+4;++i)
663 std::cout<<
" time["<<i<<
"]= "<<g_time[i]<<
" , ";
// Pick the winner (method assignments live in elided lines).
667 double smallest=1000;
668 for (
int i=0;i<4*(int)log2(nprocs)+4;i++)
669 smallest=std::min(smallest,g_time[i]);
671 if(g_time[0]==smallest){
674 else if(g_time[1]==smallest){
677 else if(g_time[4*(
int)log2(nprocs)+2]==smallest){
680 else if(g_time[4*(
int)log2(nprocs)+3]==smallest){
684 for (
int i=0;i<(int)log2(nprocs);i++)
685 if(g_time[2+i]==smallest){
687 T_plan->kway=nprocs/std::pow(2,i);
688 T_plan->kway_async=
true;
691 for (
int i=0;i<(int)log2(nprocs);i++)
692 if(g_time[2+(
int)log2(nprocs)+i]==smallest){
694 T_plan->kway=nprocs/std::pow(2,i);
695 T_plan->kway_async=
false;
699 for (
int i=0;i<(int)log2(nprocs);i++)
700 if(g_time[2+2*(
int)log2(nprocs)+i]==smallest){
702 T_plan->kway=nprocs/std::pow(2,i);
703 T_plan->kway_async=
true;
707 for (
int i=0;i<(int)log2(nprocs);i++)
708 if(g_time[2+3*(
int)log2(nprocs)+i]==smallest){
710 T_plan->kway=nprocs/std::pow(2,i);
711 T_plan->kway_async=
false;
717 PCOUT<<
"smallest= "<<smallest<<std::endl;
718 PCOUT<<
"Using transpose v"<<method<<
" kway= "<<T_plan->kway<<
" kway_async="<<T_plan->kway_async<<std::endl;
// NOTE(review): as in which_method_gpu, free(time)/free(g_time) are not
// visible here — confirm against the elided lines.
722 MPI_Barrier(T_plan->comm);
// Host-staged distributed transpose, variant v1_h.
// Pipeline: (1) pack/shuffle on the GPU into send_recv_d (memcpy_v1_h1),
// (2) stage to pinned host buffers and exchange with nonblocking
// MPI_Isend/MPI_Irecv, (3) copy back to device and unpack (memcpy_v1_h2),
// (4) optional final local transpose depending on Flags.
// timings[0]+=total, [1]+=shuffle, [2]+=comm, [3]+=reshuffle.
// Flags bit0/bit1 select the input/output layout; nprocs==1 is handled by
// purely local transposes with an early return path via transpose_cuda_v5.
731 void fast_transpose_cuda_v1_h(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
733 std::bitset<8> Flags(flags);
// Single-rank case with this flag combination delegates to v5.
734 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
735 MPI_Barrier(T_plan->comm);
739 transpose_cuda_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
740 MPI_Barrier(T_plan->comm);
743 timings[0]-=MPI_Wtime();
745 int nprocs_0, nprocs_1;
746 nprocs=T_plan->nprocs;
747 procid=T_plan->procid;
748 nprocs_0=T_plan->nprocs_0;
749 nprocs_1=T_plan->nprocs_1;
750 ptrdiff_t *N=T_plan->N;
751 ptrdiff_t local_n0=T_plan->local_n0;
752 ptrdiff_t local_n1=T_plan->local_n1;
753 ptrdiff_t n_tuples=T_plan->n_tuples;
// buffer/buffer_2 are the pinned host staging buffers; buffer_d is device.
755 double * data_cpu=T_plan->buffer;
756 double * send_recv_cpu = T_plan->buffer_2;
757 double * send_recv_d = T_plan->buffer_d;
// Distances between consecutive `howmany` batches in the input/output layouts.
759 int idist=N[1]*local_n0*n_tuples;
760 int odist=N[0]*local_n1*n_tuples;
762 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2 only).
765 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
767 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
768 for(
int h=0;h<howmany;h++)
769 for(
int id=0;
id<nprocs;++id){
771 for(
int i=0;i<local_n0;i++){
772 std::cout<<std::endl;
773 for(
int j=0;j<N[1];j++){
774 ptr=h*idist+(i*N[1]+j)*n_tuples;
775 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
779 MPI_Barrier(T_plan->comm);
786 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
787 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
788 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
789 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
790 shuffle_time-=MPI_Wtime();
// nprocs==1 fast paths: the whole transpose is local; note these update the
// timings and (per the elided lines) presumably return early — TODO confirm.
791 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
792 #pragma omp parallel for
793 for(
int h=0;h<howmany;h++)
794 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
796 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
797 #pragma omp parallel for
798 for(
int h=0;h<howmany;h++)
799 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
802 shuffle_time+=MPI_Wtime();
803 timings[0]+=MPI_Wtime();
804 timings[0]+=shuffle_time;
805 timings[1]+=shuffle_time;
808 MPI_Barrier(T_plan->comm);
// Phase 1: device-side pack of the send blocks into send_recv_d
// (presumably grouped per destination rank — see memcpy_v1_h1).
828 memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
829 cudaDeviceSynchronize();
832 shuffle_time+=MPI_Wtime();
834 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
836 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
837 for(
int id=0;
id<nprocs;++id){
838 for(
int h=0;h<howmany;h++)
840 for(
int i=0;i<N[1];i++){
841 std::cout<<std::endl;
842 for(
int j=0;j<local_n0;j++){
843 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
848 MPI_Barrier(T_plan->comm);
// Phase 2: point-to-point all-to-all via pinned host buffers.
857 int* scount_proc= T_plan->scount_proc;
858 int* rcount_proc= T_plan->rcount_proc;
859 int* soffset_proc= T_plan->soffset_proc;
860 int* roffset_proc= T_plan->roffset_proc;
862 MPI_Barrier(T_plan->comm);
865 comm_time-=MPI_Wtime();
867 int soffset=0,roffset=0;
869 MPI_Request * s_request=
new MPI_Request[nprocs];
870 MPI_Request * request=
new MPI_Request[nprocs];
871 #pragma omp parallel for
872 for (
int proc=0;proc<nprocs;++proc){
873 request[proc]=MPI_REQUEST_NULL;
874 s_request[proc]=MPI_REQUEST_NULL;
877 double *s_buf, *r_buf;
878 s_buf=data_cpu; r_buf=send_recv_cpu;
879 double *r_buf_d=send_recv_d;
// Post all receives first (self-exchange is handled by the memcpy below;
// the `proc != procid` guard appears to be in elided lines — confirm).
884 for (
int proc=0;proc<nprocs;++proc){
886 roffset=roffset_proc[proc];
887 MPI_Irecv(&r_buf[roffset*howmany],rcount_proc[proc]*howmany,MPI_DOUBLE, proc,
888 tag, T_plan->comm, &request[proc]);
// Stage the packed send data to the host, then post sends.
892 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
893 for (
int proc=0;proc<nprocs;++proc){
895 soffset=soffset_proc[proc];
896 MPI_Isend(&s_buf[soffset*howmany],scount_proc[proc]*howmany,MPI_DOUBLE,proc, tag,
897 T_plan->comm, &s_request[proc]);
// Local (self) exchange is a plain host memcpy.
901 soffset=soffset_proc[procid];
902 roffset=roffset_proc[procid];
903 memcpy(&r_buf[roffset*howmany],&s_buf[soffset*howmany],howmany*
sizeof(
double)*scount_proc[procid]);
905 for (
int proc=0;proc<nprocs;++proc){
906 MPI_Wait(&request[proc], &ierr);
907 MPI_Wait(&s_request[proc], &ierr);
// Move the received data back to the device in one shot.
911 cudaMemcpy(r_buf_d, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
912 cudaDeviceSynchronize();
913 comm_time+=MPI_Wtime();
917 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
919 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
920 for(
int id=0;
id<nprocs;++id){
922 for(
int h=0;h<howmany;h++)
923 for(
int i=0;i<local_n1;i++){
924 std::cout<<std::endl;
925 for(
int j=0;j<N[0];j++){
926 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
931 MPI_Barrier(T_plan->comm);
// Phase 3: device-side unpack into the output layout.
938 reshuffle_time-=MPI_Wtime();
955 memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_d);
956 cudaDeviceSynchronize();
958 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
960 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
961 for(
int id=0;
id<nprocs_1;++id){
963 for(
int h=0;h<howmany;h++)
964 for(
int i=0;i<N[0];i++){
965 std::cout<<std::endl;
966 for(
int j=0;j<local_n1;j++){
967 ptr=h*odist+(i*local_n1+j)*n_tuples;
968 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
972 MPI_Barrier(T_plan->comm);
// Phase 4: optional final local transpose (guard elided in this listing;
// presumably keyed off Flags — confirm against the original file).
978 #pragma omp parallel for
979 for(
int h=0;h<howmany;h++)
980 local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
982 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
984 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
985 for(
int id=0;
id<nprocs_1;++id){
987 for(
int h=0;h<howmany;h++)
988 for(
int i=0;i<N[0];i++){
989 std::cout<<std::endl;
990 for(
int j=0;j<local_n1;j++){
991 ptr=h*odist+(i*local_n1+j)*n_tuples;
992 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
996 MPI_Barrier(T_plan->comm);
1001 reshuffle_time+=MPI_Wtime();
1002 MPI_Barrier(T_plan->comm);
// NOTE(review): only s_request's delete[] is visible; `delete [] request`
// is presumably in an elided line — confirm, otherwise it leaks.
1004 delete [] s_request;
1008 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1009 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1010 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1011 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1013 timings[0]+=MPI_Wtime();
1014 timings[1]+=shuffle_time;
1015 timings[2]+=comm_time;
1016 timings[3]+=reshuffle_time;
// Host-staged distributed transpose, variant v1_2_h.
// Same pipeline as fast_transpose_cuda_v1_h, but overlaps communication with
// PCIe traffic: each send's device-to-host copy happens per-peer right before
// its MPI_Isend, and completed receives are pushed back to the device with
// cudaMemcpyAsync from an MPI_Test polling loop instead of one bulk copy.
// timings[0]+=total, [1]+=shuffle, [2]+=comm, [3]+=reshuffle.
1020 void fast_transpose_cuda_v1_2_h(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
1022 std::bitset<8> Flags(flags);
1023 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1024 MPI_Barrier(T_plan->comm);
1028 transpose_cuda_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
1029 MPI_Barrier(T_plan->comm);
1032 timings[0]-=MPI_Wtime();
1034 int nprocs_0, nprocs_1;
1035 nprocs=T_plan->nprocs;
1036 procid=T_plan->procid;
1037 nprocs_0=T_plan->nprocs_0;
1038 nprocs_1=T_plan->nprocs_1;
1039 ptrdiff_t *N=T_plan->N;
1040 ptrdiff_t local_n0=T_plan->local_n0;
1041 ptrdiff_t local_n1=T_plan->local_n1;
1042 ptrdiff_t n_tuples=T_plan->n_tuples;
1044 double * data_cpu=T_plan->buffer;
1045 double * send_recv_cpu = T_plan->buffer_2;
1046 double * send_recv_d = T_plan->buffer_d;
1048 int idist=N[1]*local_n0*n_tuples;
1049 int odist=N[0]*local_n1*n_tuples;
1051 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1054 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1056 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1057 for(
int h=0;h<howmany;h++)
1058 for(
int id=0;
id<nprocs;++id){
1060 for(
int i=0;i<local_n0;i++){
1061 std::cout<<std::endl;
1062 for(
int j=0;j<N[1];j++){
1063 ptr=h*idist+(i*N[1]+j)*n_tuples;
1064 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1068 MPI_Barrier(T_plan->comm);
1075 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1076 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1077 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1078 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1079 shuffle_time-=MPI_Wtime();
// nprocs==1 fast paths, as in v1_h.
1080 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1081 #pragma omp parallel for
1082 for(
int h=0;h<howmany;h++)
1083 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
1085 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1086 #pragma omp parallel for
1087 for(
int h=0;h<howmany;h++)
1088 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
1091 shuffle_time+=MPI_Wtime();
1092 timings[0]+=MPI_Wtime();
1093 timings[0]+=shuffle_time;
1094 timings[1]+=shuffle_time;
1097 MPI_Barrier(T_plan->comm);
// Phase 1: device-side pack.
1117 memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
1118 cudaDeviceSynchronize();
1121 shuffle_time+=MPI_Wtime();
1123 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1125 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1126 for(
int id=0;
id<nprocs;++id){
1127 for(
int h=0;h<howmany;h++)
1129 for(
int i=0;i<N[1];i++){
1130 std::cout<<std::endl;
1131 for(
int j=0;j<local_n0;j++){
1132 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1137 MPI_Barrier(T_plan->comm);
// Phase 2: overlapped exchange.
1146 int* scount_proc= T_plan->scount_proc;
1147 int* rcount_proc= T_plan->rcount_proc;
1148 int* soffset_proc= T_plan->soffset_proc;
1149 int* roffset_proc= T_plan->roffset_proc;
1151 MPI_Barrier(T_plan->comm);
1154 comm_time-=MPI_Wtime();
1156 int soffset=0,roffset=0;
1158 MPI_Request * s_request=
new MPI_Request[nprocs];
1159 MPI_Request * request=
new MPI_Request[nprocs];
// flag[]/color[] track which receives have completed / been copied back to
// the device, driving the MPI_Test loop below. (VLAs: size known at runtime.)
1160 int flag[nprocs],color[nprocs];
1161 memset(flag,0,
sizeof(
int)*nprocs);
1162 memset(color,0,
sizeof(
int)*nprocs);
1164 #pragma omp parallel for
1165 for (
int proc=0;proc<nprocs;++proc){
1166 request[proc]=MPI_REQUEST_NULL;
1167 s_request[proc]=MPI_REQUEST_NULL;
1170 double *s_buf, *r_buf;
1171 s_buf=data_cpu; r_buf=send_recv_cpu;
1172 double *r_buf_d=send_recv_d;
// Per-peer D2H stage + Isend, so copies for peer i overlap sends to peer i-1.
1178 for (
int proc=0;proc<nprocs;++proc){
1180 soffset=soffset_proc[proc];
1181 cudaMemcpy(&s_buf[soffset*howmany], &send_recv_d[soffset*howmany],howmany*
sizeof(
double)*scount_proc[proc], cudaMemcpyDeviceToHost);
1182 MPI_Isend(&s_buf[soffset*howmany],scount_proc[proc]*howmany, MPI_DOUBLE,proc, tag,
1183 T_plan->comm, &s_request[proc]);
// Self-exchange: straight device-to-host copy into the receive buffer.
1190 cudaMemcpy(&r_buf[roffset_proc[procid]*howmany], &send_recv_d[soffset_proc[procid]*howmany],howmany*
sizeof(
double)*scount_proc[procid], cudaMemcpyDeviceToHost);
1194 for (
int proc=0;proc<nprocs;++proc){
1196 roffset=roffset_proc[proc];
1197 MPI_Irecv(&r_buf[roffset*howmany],rcount_proc[proc]*howmany,MPI_DOUBLE, proc,
1198 tag, T_plan->comm, &request[proc]);
// Poll: as each receive completes, launch its H2D copy asynchronously.
// `counter` init/increment and the color bookkeeping are in elided lines;
// the loop presumably exits after all nprocs receives (plus self) are staged.
1201 while(counter!=nprocs+1){
1203 for (
int proc=0;proc<nprocs;++proc){
1204 MPI_Test(&request[proc], &flag[proc],&ierr);
1205 if(flag[proc]==1 && color[proc]==0){
1206 cudaMemcpyAsync(&r_buf_d[roffset_proc[proc]*howmany],&r_buf[roffset_proc[proc]*howmany],howmany*
sizeof(
double)*rcount_proc[proc],cudaMemcpyHostToDevice);
// Drain all outstanding async H2D copies before timing stops.
1215 cudaDeviceSynchronize();
1216 comm_time+=MPI_Wtime();
1223 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1225 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1226 for(
int id=0;
id<nprocs;++id){
1228 for(
int h=0;h<howmany;h++)
1229 for(
int i=0;i<local_n1;i++){
1230 std::cout<<std::endl;
1231 for(
int j=0;j<N[0];j++){
1232 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1237 MPI_Barrier(T_plan->comm);
// Phase 3: device-side unpack into the output layout.
1244 reshuffle_time-=MPI_Wtime();
1261 memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_d);
1262 cudaDeviceSynchronize();
1264 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1266 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1267 for(
int id=0;
id<nprocs_1;++id){
1269 for(
int h=0;h<howmany;h++)
1270 for(
int i=0;i<N[0];i++){
1271 std::cout<<std::endl;
1272 for(
int j=0;j<local_n1;j++){
1273 ptr=h*odist+(i*local_n1+j)*n_tuples;
1274 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1278 MPI_Barrier(T_plan->comm);
// Phase 4: optional final local transpose (guard elided; presumably Flags).
1284 #pragma omp parallel for
1285 for(
int h=0;h<howmany;h++)
1286 local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
1288 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1290 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1291 for(
int id=0;
id<nprocs_1;++id){
1293 for(
int h=0;h<howmany;h++)
1294 for(
int i=0;i<N[0];i++){
1295 std::cout<<std::endl;
1296 for(
int j=0;j<local_n1;j++){
1297 ptr=h*odist+(i*local_n1+j)*n_tuples;
1298 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1302 MPI_Barrier(T_plan->comm);
1307 reshuffle_time+=MPI_Wtime();
1308 MPI_Barrier(T_plan->comm);
// NOTE(review): only s_request's delete[] is visible; `delete [] request`
// is presumably in an elided line — confirm, otherwise it leaks.
1310 delete [] s_request;
1314 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1315 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1316 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1317 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1319 timings[0]+=MPI_Wtime();
1320 timings[1]+=shuffle_time;
1321 timings[2]+=comm_time;
1322 timings[3]+=reshuffle_time;
1325 void fast_transpose_cuda_v1_3_h(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
1327 std::bitset<8> Flags(flags);
1328 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1329 MPI_Barrier(T_plan->comm);
1333 transpose_cuda_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
1334 MPI_Barrier(T_plan->comm);
1337 timings[0]-=MPI_Wtime();
1339 int nprocs_0, nprocs_1;
1340 nprocs=T_plan->nprocs;
1341 procid=T_plan->procid;
1342 nprocs_0=T_plan->nprocs_0;
1343 nprocs_1=T_plan->nprocs_1;
1344 ptrdiff_t *N=T_plan->N;
1345 ptrdiff_t local_n0=T_plan->local_n0;
1346 ptrdiff_t local_n1=T_plan->local_n1;
1347 ptrdiff_t n_tuples=T_plan->n_tuples;
1349 double * data_cpu=T_plan->buffer;
1350 double * send_recv_cpu = T_plan->buffer_2;
1351 double * send_recv_d = T_plan->buffer_d;
1353 int idist=N[1]*local_n0*n_tuples;
1354 int odist=N[0]*local_n1*n_tuples;
1356 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1359 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1361 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1362 for(
int h=0;h<howmany;h++)
1363 for(
int id=0;
id<nprocs;++id){
1365 for(
int i=0;i<local_n0;i++){
1366 std::cout<<std::endl;
1367 for(
int j=0;j<N[1];j++){
1368 ptr=h*idist+(i*N[1]+j)*n_tuples;
1369 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1373 MPI_Barrier(T_plan->comm);
1380 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1381 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1382 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1383 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1384 shuffle_time-=MPI_Wtime();
1385 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1386 #pragma omp parallel for
1387 for(
int h=0;h<howmany;h++)
1388 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
1390 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1391 #pragma omp parallel for
1392 for(
int h=0;h<howmany;h++)
1393 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
1396 shuffle_time+=MPI_Wtime();
1397 timings[0]+=MPI_Wtime();
1398 timings[0]+=shuffle_time;
1399 timings[1]+=shuffle_time;
1402 MPI_Barrier(T_plan->comm);
1422 memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
1423 cudaDeviceSynchronize();
1426 shuffle_time+=MPI_Wtime();
1428 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1430 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1431 for(
int id=0;
id<nprocs;++id){
1432 for(
int h=0;h<howmany;h++)
1434 for(
int i=0;i<N[1];i++){
1435 std::cout<<std::endl;
1436 for(
int j=0;j<local_n0;j++){
1437 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1442 MPI_Barrier(T_plan->comm);
1451 int* scount_proc= T_plan->scount_proc;
1452 int* rcount_proc= T_plan->rcount_proc;
1453 int* soffset_proc= T_plan->soffset_proc;
1454 int* roffset_proc= T_plan->roffset_proc;
1456 MPI_Barrier(T_plan->comm);
1459 comm_time-=MPI_Wtime();
1461 int soffset=0,roffset=0;
1464 double *s_buf, *r_buf;
1465 s_buf=data_cpu; r_buf=send_recv_cpu;
1466 double *r_buf_d=send_recv_d;
1472 for (
int proc=0;proc<nprocs;++proc){
1474 soffset=soffset_proc[proc];
1475 roffset=roffset_proc[proc];
1476 cudaMemcpy(&s_buf[soffset*howmany], &send_recv_d[soffset*howmany],howmany*
sizeof(
double)*scount_proc[proc], cudaMemcpyDeviceToHost);
1477 MPI_Sendrecv(&s_buf[soffset*howmany],scount_proc[proc]*howmany, MPI_DOUBLE,
1479 &r_buf[roffset*howmany],howmany*rcount_proc[proc], MPI_DOUBLE,
1481 T_plan->comm,&ierr);
1488 cudaMemcpy(&r_buf[roffset_proc[procid]*howmany], &send_recv_d[soffset_proc[procid]*howmany],howmany*
sizeof(
double)*scount_proc[procid], cudaMemcpyDeviceToHost);
1492 cudaMemcpy(r_buf_d,r_buf,T_plan->alloc_local,cudaMemcpyHostToDevice);
1495 cudaDeviceSynchronize();
1496 comm_time+=MPI_Wtime();
1500 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1502 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1503 for(
int id=0;
id<nprocs;++id){
1505 for(
int h=0;h<howmany;h++)
1506 for(
int i=0;i<local_n1;i++){
1507 std::cout<<std::endl;
1508 for(
int j=0;j<N[0];j++){
1509 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1514 MPI_Barrier(T_plan->comm);
1521 reshuffle_time-=MPI_Wtime();
1538 memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_d);
1539 cudaDeviceSynchronize();
1541 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1543 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1544 for(
int id=0;
id<nprocs_1;++id){
1546 for(
int h=0;h<howmany;h++)
1547 for(
int i=0;i<N[0];i++){
1548 std::cout<<std::endl;
1549 for(
int j=0;j<local_n1;j++){
1550 ptr=h*odist+(i*local_n1+j)*n_tuples;
1551 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1555 MPI_Barrier(T_plan->comm);
1561 #pragma omp parallel for
1562 for(
int h=0;h<howmany;h++)
1563 local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
1565 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1567 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1568 for(
int id=0;
id<nprocs_1;++id){
1570 for(
int h=0;h<howmany;h++)
1571 for(
int i=0;i<N[0];i++){
1572 std::cout<<std::endl;
1573 for(
int j=0;j<local_n1;j++){
1574 ptr=h*odist+(i*local_n1+j)*n_tuples;
1575 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
1579 MPI_Barrier(T_plan->comm);
1584 reshuffle_time+=MPI_Wtime();
1585 MPI_Barrier(T_plan->comm);
1588 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1589 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1590 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1591 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1593 timings[0]+=MPI_Wtime();
1594 timings[1]+=shuffle_time;
1595 timings[2]+=comm_time;
1596 timings[3]+=reshuffle_time;
// Distributed 2D transpose of device-resident `data`, variant v1:
// GPU block-column shuffle -> pairwise MPI_Isend/MPI_Irecv staged through the
// plan's pinned host buffers -> final GPU transpose.
// NOTE(review): this chunk is extraction-garbled — statements are split across
// lines, stray original line numbers ("1600", ...) are fused into the text,
// and several lines (closing braces, early returns, declarations of
// `ptr`/`ierr`) appear elided. Code is left byte-identical; comments only.
1600 void fast_transpose_cuda_v1(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
1602 std::bitset<8> Flags(flags);
// Single-rank with only the output-transpose flag set: delegate to
// transpose_cuda_v5 (the branch's closing brace/return is elided from view).
1603 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1604 MPI_Barrier(T_plan->comm);
1608 transpose_cuda_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
1609 MPI_Barrier(T_plan->comm);
// Start total-time accounting; closed by timings[0]+=MPI_Wtime() at the end.
1612 timings[0]-=MPI_Wtime();
1614 nprocs=T_plan->nprocs;
1615 procid=T_plan->procid;
1616 int nprocs_1=T_plan->nprocs_1;
1617 ptrdiff_t *N=T_plan->N;
// Host staging buffers and device scratch buffer owned by the plan.
1619 double * data_cpu=T_plan->buffer;
1620 double * send_recv_cpu = T_plan->buffer_2;
1621 double * send_recv_d = T_plan->buffer_d;
1624 ptrdiff_t local_n0=T_plan->local_n0;
1625 ptrdiff_t local_n1=T_plan->local_n1;
1626 ptrdiff_t n_tuples=T_plan->n_tuples;
// Element strides of one `howmany` batch before/after the transpose.
1628 int idist=N[1]*local_n0*n_tuples;
1629 int odist=N[0]*local_n1*n_tuples;
// `0*MPI_Wtime()` is just an elaborate zero-initializer for the phase timers.
1631 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2); loop-closing braces are elided here.
1633 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1635 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1636 for(
int h=0;h<howmany;h++)
1637 for(
int id=0;
id<nprocs;++id){
1639 for(
int i=0;i<local_n0;i++){
1640 std::cout<<std::endl;
1641 for(
int j=0;j<N[1];j++){
1642 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
1646 MPI_Barrier(T_plan->comm);
1653 shuffle_time-=MPI_Wtime();
// Single-process fast paths: purely local GPU transposes, no communication.
1656 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1657 #pragma omp parallel for
1658 for(
int h=0;h<howmany;h++)
1659 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
1661 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1662 #pragma omp parallel for
1663 for(
int h=0;h<howmany;h++)
1664 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): timings[0] receives both MPI_Wtime() and shuffle_time here;
// presumably this nprocs==1 path returns early (return elided from this view).
1667 shuffle_time+=MPI_Wtime();
1668 timings[0]+=MPI_Wtime();
1669 timings[0]+=shuffle_time;
1670 timings[1]+=shuffle_time;
1673 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): per-destination block-column transpose on the GPU into
// send_recv_d; last_local_n1 is passed for the trailing block's width.
1679 local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
1681 shuffle_time+=MPI_Wtime();
// Debug dump after the shuffle. NOTE(review): `ptr` is read below but its
// declaration/update lines are elided from this chunk.
1683 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1685 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1686 for(
int h=0;h<howmany;h++)
1687 for(
int id=0;
id<nprocs;++id){
1689 for(
int i=0;i<N[1];i++){
1690 std::cout<<std::endl;
1691 for(
int j=0;j<local_n0;j++){
1692 std::cout<<
'\t'<<data_cpu[ptr];
1697 MPI_Barrier(T_plan->comm);
// Per-peer send/recv counts and offsets precomputed by the plan.
1705 int* scount_proc= T_plan->scount_proc;
1706 int* rcount_proc= T_plan->rcount_proc;
1707 int* soffset_proc= T_plan->soffset_proc;
1708 int* roffset_proc= T_plan->roffset_proc;
1710 MPI_Barrier(T_plan->comm);
1713 comm_time-=MPI_Wtime();
1715 int soffset=0,roffset=0;
1717 MPI_Request * s_request=
new MPI_Request[nprocs];
1718 MPI_Request * request=
new MPI_Request[nprocs];
// Pre-null every request so MPI_Wait on a never-posted slot is a no-op.
1719 #pragma omp parallel for
1720 for (
int proc=0;proc<nprocs;++proc){
1721 request[proc]=MPI_REQUEST_NULL;
1722 s_request[proc]=MPI_REQUEST_NULL;
1724 double *s_buf, *r_buf;
1725 s_buf=data_cpu; r_buf=send_recv_cpu;
// Phase 2 (comm): stage shuffled device data to the host buffer, then
// exchange with every peer via nonblocking point-to-point.
1730 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1731 for (
int proc=0;proc<nprocs;++proc){
1733 soffset=soffset_proc[proc];
1734 roffset=roffset_proc[proc];
1735 MPI_Isend(&s_buf[soffset],scount_proc[proc],MPI_DOUBLE,proc, tag,
1736 T_plan->comm, &s_request[proc]);
1737 MPI_Irecv(&r_buf[roffset],rcount_proc[proc],MPI_DOUBLE, proc,
1738 tag, T_plan->comm, &request[proc]);
// Self-exchange is done with a plain per-batch memcpy instead of MPI.
1742 soffset=soffset_proc[procid];
1743 roffset=roffset_proc[procid];
1744 for(
int h=0;h<howmany;h++)
1745 memcpy(&r_buf[h*odist+roffset],&s_buf[h*idist+soffset],
sizeof(
double)*scount_proc[procid]);
// Drain all receives and sends. NOTE(review): `ierr` is passed as the status
// argument; presumably declared as MPI_Status in an elided line.
1750 for (
int proc=0;proc<nprocs;++proc){
1751 MPI_Wait(&request[proc], &ierr);
1752 MPI_Wait(&s_request[proc], &ierr);
// Unstage received data back to the device; both `data` and the scratch get a
// copy here — an elided conditional presumably selects between these lines.
1757 cudaMemcpy(data, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
1759 cudaMemcpy(send_recv_d, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
1760 comm_time+=MPI_Wtime();
1763 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1765 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1766 for(
int h=0;h<howmany;h++)
1767 for(
int id=0;
id<nprocs;++id){
1769 for(
int i=0;i<local_n1;i++){
1770 std::cout<<std::endl;
1771 for(
int j=0;j<N[0];j++){
1772 std::cout<<
'\t'<<data_cpu[ptr];
1777 MPI_Barrier(T_plan->comm);
1783 reshuffle_time-=MPI_Wtime();
// Phase 3 (reshuffle): final out-of-place GPU transpose, scratch -> data.
1786 local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
1791 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1793 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1794 for(
int h=0;h<howmany;h++)
1795 for(
int id=0;
id<nprocs_1;++id){
1797 for(
int i=0;i<local_n1;i++){
1798 std::cout<<std::endl;
1799 for(
int j=0;j<N[0];j++){
1800 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
1804 MPI_Barrier(T_plan->comm);
1808 reshuffle_time+=MPI_Wtime();
// NOTE(review): only s_request is freed here; `delete [] request` appears to
// be elided from this chunk.
1810 delete [] s_request;
// Per-phase timing report (rank 0 only via PCOUT).
1814 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1815 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1816 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1817 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate phase timings; timings[0] closes the -=MPI_Wtime() from entry.
1819 timings[0]+=MPI_Wtime();
1820 timings[1]+=shuffle_time;
1821 timings[2]+=comm_time;
1822 timings[3]+=reshuffle_time;
// Variant v1_2 of the distributed GPU transpose: same three phases as v1, but
// the communication phase overlaps MPI receives with host->device uploads by
// polling MPI_Test and cudaMemcpyAsync-ing each chunk as soon as it arrives.
// NOTE(review): extraction-garbled chunk — statements split across lines, stray
// original line numbers fused into the text, and some lines elided (closing
// braces, declarations of `ptr`/`ierr`/`counter`/`r_buf_d`, loop-body updates
// of `color`/`counter`). Code left byte-identical; comments only.
1828 void fast_transpose_cuda_v1_2(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
1830 std::bitset<8> Flags(flags);
// Single-rank, output-transpose-only case delegates to transpose_cuda_v5
// (branch close/return elided from view).
1831 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1832 MPI_Barrier(T_plan->comm);
1836 transpose_cuda_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
1837 MPI_Barrier(T_plan->comm);
// Start total-time accounting; closed at the end of the function.
1840 timings[0]-=MPI_Wtime();
1842 nprocs=T_plan->nprocs;
1843 procid=T_plan->procid;
1844 int nprocs_1=T_plan->nprocs_1;
1845 ptrdiff_t *N=T_plan->N;
// Host staging buffers and device scratch owned by the plan.
1847 double * data_cpu=T_plan->buffer;
1848 double * send_recv_cpu = T_plan->buffer_2;
1849 double * send_recv_d = T_plan->buffer_d;
1852 ptrdiff_t local_n0=T_plan->local_n0;
1853 ptrdiff_t local_n1=T_plan->local_n1;
1854 ptrdiff_t n_tuples=T_plan->n_tuples;
1856 int idist=N[1]*local_n0*n_tuples;
1857 int odist=N[0]*local_n1*n_tuples;
// `0*MPI_Wtime()` is just a zero-initializer for the timers.
1859 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2); loop braces elided in this view.
1861 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1863 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1864 for(
int h=0;h<howmany;h++)
1865 for(
int id=0;
id<nprocs;++id){
1867 for(
int i=0;i<local_n0;i++){
1868 std::cout<<std::endl;
1869 for(
int j=0;j<N[1];j++){
1870 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
1874 MPI_Barrier(T_plan->comm);
1881 shuffle_time-=MPI_Wtime();
// Single-process fast paths: local GPU transposes only, no communication.
1884 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1885 #pragma omp parallel for
1886 for(
int h=0;h<howmany;h++)
1887 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
1889 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1890 #pragma omp parallel for
1891 for(
int h=0;h<howmany;h++)
1892 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): presumably the nprocs==1 path returns here (return elided).
1895 shuffle_time+=MPI_Wtime();
1896 timings[0]+=MPI_Wtime();
1897 timings[0]+=shuffle_time;
1898 timings[1]+=shuffle_time;
1901 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): GPU block-column transpose into send_recv_d.
1907 local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
1909 shuffle_time+=MPI_Wtime();
// Debug dump after the shuffle; `ptr` declared in an elided line.
1911 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1913 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1914 for(
int h=0;h<howmany;h++)
1915 for(
int id=0;
id<nprocs;++id){
1917 for(
int i=0;i<N[1];i++){
1918 std::cout<<std::endl;
1919 for(
int j=0;j<local_n0;j++){
1920 std::cout<<
'\t'<<data_cpu[ptr];
1925 MPI_Barrier(T_plan->comm);
// Per-peer counts/offsets precomputed by the plan.
1933 int* scount_proc= T_plan->scount_proc;
1934 int* rcount_proc= T_plan->rcount_proc;
1935 int* soffset_proc= T_plan->soffset_proc;
1936 int* roffset_proc= T_plan->roffset_proc;
1938 MPI_Barrier(T_plan->comm);
1941 comm_time-=MPI_Wtime();
1943 int soffset=0,roffset=0;
1945 MPI_Request * s_request=
new MPI_Request[nprocs];
1946 MPI_Request * request=
new MPI_Request[nprocs];
// flag[proc]: MPI_Test completion flag for peer's receive.
// color[proc]: marks receives whose chunk was already uploaded to the GPU,
// so each chunk is cudaMemcpyAsync-ed exactly once.
1947 int flag[nprocs],color[nprocs];
1948 memset(flag,0,
sizeof(
int)*nprocs);
1949 memset(color,0,
sizeof(
int)*nprocs);
// Pre-null all requests so waits/tests on unused slots are harmless.
1951 #pragma omp parallel for
1952 for (
int proc=0;proc<nprocs;++proc){
1953 request[proc]=MPI_REQUEST_NULL;
1954 s_request[proc]=MPI_REQUEST_NULL;
1956 double *s_buf, *r_buf;
1957 s_buf=data_cpu; r_buf=send_recv_cpu;
// Device-side receive buffer alias. NOTE(review): `r_buf_d` declaration elided.
1962 r_buf_d=send_recv_d;
// Post all receives first so sends can match immediately.
1969 for (
int proc=0;proc<nprocs;++proc){
1971 roffset=roffset_proc[proc];
1972 MPI_Irecv(&r_buf[roffset],rcount_proc[proc], MPI_DOUBLE, proc,
1973 tag, T_plan->comm, &request[proc]);
// Stage each peer's chunk device->host and send it nonblockingly.
1978 for (
int proc=0;proc<nprocs;++proc){
1980 soffset=soffset_proc[proc];
1981 cudaMemcpy(&s_buf[soffset], &send_recv_d[soffset],
sizeof(
double)*scount_proc[proc], cudaMemcpyDeviceToHost);
1982 MPI_Isend(&s_buf[soffset],scount_proc[proc], MPI_DOUBLE,proc, tag,
1983 T_plan->comm, &s_request[proc]);
// Self-exchange: copy own chunk straight from device scratch to host r_buf.
1990 cudaMemcpy(&r_buf[roffset_proc[procid]], &send_recv_d[soffset_proc[procid]],
sizeof(
double)*scount_proc[procid], cudaMemcpyDeviceToHost);
// Poll until all chunks are in; upload each completed chunk to the GPU
// asynchronously, overlapping upload with outstanding receives.
// NOTE(review): `counter` and the color/counter updates are in elided lines.
1994 while(counter!=nprocs+1){
1996 for (
int proc=0;proc<nprocs;++proc){
1997 MPI_Test(&request[proc], &flag[proc],&ierr);
1998 if(flag[proc]==1 && color[proc]==0){
1999 cudaMemcpyAsync(&r_buf_d[roffset_proc[proc]],&r_buf[roffset_proc[proc]],
sizeof(
double)*rcount_proc[proc],cudaMemcpyHostToDevice);
// Wait for all async uploads to land before using the device buffer.
2006 cudaDeviceSynchronize();
2007 comm_time+=MPI_Wtime();
2017 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2019 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2020 for(
int h=0;h<howmany;h++)
2021 for(
int id=0;
id<nprocs;++id){
2023 for(
int i=0;i<local_n1;i++){
2024 std::cout<<std::endl;
2025 for(
int j=0;j<N[0];j++){
2026 std::cout<<
'\t'<<data_cpu[ptr];
2031 MPI_Barrier(T_plan->comm);
2037 reshuffle_time-=MPI_Wtime();
// Phase 3 (reshuffle): final out-of-place GPU transpose, scratch -> data.
2040 local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2045 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2047 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2048 for(
int h=0;h<howmany;h++)
2049 for(
int id=0;
id<nprocs_1;++id){
2051 for(
int i=0;i<local_n1;i++){
2052 std::cout<<std::endl;
2053 for(
int j=0;j<N[0];j++){
2054 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2058 MPI_Barrier(T_plan->comm);
2062 reshuffle_time+=MPI_Wtime();
// NOTE(review): `delete [] request` appears elided from this chunk.
2064 delete [] s_request;
// Per-phase timing report (rank 0 only).
2068 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2069 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2070 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2071 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2073 timings[0]+=MPI_Wtime();
2074 timings[1]+=shuffle_time;
2075 timings[2]+=comm_time;
2076 timings[3]+=reshuffle_time;
// Variant v1_3 of the distributed GPU transpose: same structure as v1, but the
// communication phase uses blocking MPI_Sendrecv per peer, staging each chunk
// device->host just before sending it.
// NOTE(review): extraction-garbled chunk — lines split, stray line numbers
// fused in, and some lines elided (closing braces, declarations of
// `ptr`/`ierr`/`r_buf_d`, peer rank/tag arguments of MPI_Sendrecv).
// Code left byte-identical; comments only.
2081 void fast_transpose_cuda_v1_3(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
2083 std::bitset<8> Flags(flags);
// Single-rank, output-transpose-only case delegates to transpose_cuda_v5.
2084 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2085 MPI_Barrier(T_plan->comm);
2089 transpose_cuda_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
2090 MPI_Barrier(T_plan->comm);
// Start total-time accounting.
2093 timings[0]-=MPI_Wtime();
2095 nprocs=T_plan->nprocs;
2096 procid=T_plan->procid;
2097 int nprocs_1=T_plan->nprocs_1;
2098 ptrdiff_t *N=T_plan->N;
// Host staging buffers and device scratch owned by the plan.
2100 double * data_cpu=T_plan->buffer;
2101 double * send_recv_cpu = T_plan->buffer_2;
2102 double * send_recv_d = T_plan->buffer_d;
2105 ptrdiff_t local_n0=T_plan->local_n0;
2106 ptrdiff_t local_n1=T_plan->local_n1;
2107 ptrdiff_t n_tuples=T_plan->n_tuples;
2109 int idist=N[1]*local_n0*n_tuples;
2110 int odist=N[0]*local_n1*n_tuples;
// `0*MPI_Wtime()` is just a zero-initializer for the timers.
2112 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2); loop braces elided in this view.
2114 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2116 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2117 for(
int h=0;h<howmany;h++)
2118 for(
int id=0;
id<nprocs;++id){
2120 for(
int i=0;i<local_n0;i++){
2121 std::cout<<std::endl;
2122 for(
int j=0;j<N[1];j++){
2123 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2127 MPI_Barrier(T_plan->comm);
2134 shuffle_time-=MPI_Wtime();
// Single-process fast paths: local GPU transposes only.
2137 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2138 #pragma omp parallel for
2139 for(
int h=0;h<howmany;h++)
2140 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2142 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2143 #pragma omp parallel for
2144 for(
int h=0;h<howmany;h++)
2145 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): presumably the nprocs==1 path returns here (return elided).
2148 shuffle_time+=MPI_Wtime();
2149 timings[0]+=MPI_Wtime();
2150 timings[0]+=shuffle_time;
2151 timings[1]+=shuffle_time;
2154 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): GPU block-column transpose into send_recv_d.
2160 local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2162 shuffle_time+=MPI_Wtime();
// Debug dump after the shuffle; `ptr` declared in an elided line.
2164 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2166 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2167 for(
int h=0;h<howmany;h++)
2168 for(
int id=0;
id<nprocs;++id){
2170 for(
int i=0;i<N[1];i++){
2171 std::cout<<std::endl;
2172 for(
int j=0;j<local_n0;j++){
2173 std::cout<<
'\t'<<data_cpu[ptr];
2178 MPI_Barrier(T_plan->comm);
// Per-peer counts/offsets precomputed by the plan.
2186 int* scount_proc= T_plan->scount_proc;
2187 int* rcount_proc= T_plan->rcount_proc;
2188 int* soffset_proc= T_plan->soffset_proc;
2189 int* roffset_proc= T_plan->roffset_proc;
2191 MPI_Barrier(T_plan->comm);
2194 comm_time-=MPI_Wtime();
2196 int soffset=0,roffset=0;
2198 MPI_Request * s_request=
new MPI_Request[nprocs];
2199 MPI_Request * request=
new MPI_Request[nprocs];
// flag/color arrays allocated as in v1_2; in the lines visible here they are
// only zeroed, never read — the using code may be elided.
2200 int flag[nprocs],color[nprocs];
2201 memset(flag,0,
sizeof(
int)*nprocs);
2202 memset(color,0,
sizeof(
int)*nprocs);
2203 #pragma omp parallel for
2204 for (
int proc=0;proc<nprocs;++proc){
2205 request[proc]=MPI_REQUEST_NULL;
2206 s_request[proc]=MPI_REQUEST_NULL;
2208 double *s_buf, *r_buf;
2209 s_buf=data_cpu; r_buf=send_recv_cpu;
// Device-side receive buffer alias. NOTE(review): `r_buf_d` declaration elided.
2214 r_buf_d=send_recv_d;
// Phase 2 (comm): per-peer blocking exchange — stage the chunk to the host,
// then MPI_Sendrecv it. NOTE(review): the peer-rank/tag arguments of
// MPI_Sendrecv are in elided lines; `ierr` is used as the status argument.
2221 for (
int proc=0;proc<nprocs;++proc){
2223 soffset=soffset_proc[proc];
2224 roffset=roffset_proc[proc];
2225 cudaMemcpy(&s_buf[soffset], &send_recv_d[soffset],
sizeof(
double)*scount_proc[proc], cudaMemcpyDeviceToHost);
2226 MPI_Sendrecv(&s_buf[soffset],scount_proc[proc], MPI_DOUBLE,
2228 &r_buf[roffset],rcount_proc[proc], MPI_DOUBLE,
2230 T_plan->comm,&ierr);
// Self-exchange: copy own chunk device scratch -> host r_buf directly.
2237 cudaMemcpy(&r_buf[roffset_proc[procid]], &send_recv_d[soffset_proc[procid]],
sizeof(
double)*scount_proc[procid], cudaMemcpyDeviceToHost);
// Upload received data back to the device. Both a per-chunk async copy and a
// whole-buffer copy appear below; an elided conditional presumably selects
// one of the two paths (`proc` here refers to an enclosing elided loop).
2241 cudaMemcpyAsync(&r_buf_d[roffset_proc[proc]],&r_buf[roffset_proc[proc]],
sizeof(
double)*rcount_proc[proc],cudaMemcpyHostToDevice);
2244 cudaMemcpy(r_buf_d,r_buf,T_plan->alloc_local,cudaMemcpyHostToDevice);
2245 cudaDeviceSynchronize();
2246 comm_time+=MPI_Wtime();
2250 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2252 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2253 for(
int h=0;h<howmany;h++)
2254 for(
int id=0;
id<nprocs;++id){
2256 for(
int i=0;i<local_n1;i++){
2257 std::cout<<std::endl;
2258 for(
int j=0;j<N[0];j++){
2259 std::cout<<
'\t'<<data_cpu[ptr];
2264 MPI_Barrier(T_plan->comm);
2270 reshuffle_time-=MPI_Wtime();
// Phase 3 (reshuffle): final out-of-place GPU transpose, scratch -> data.
2273 local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2278 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2280 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2281 for(
int h=0;h<howmany;h++)
2282 for(
int id=0;
id<nprocs_1;++id){
2284 for(
int i=0;i<local_n1;i++){
2285 std::cout<<std::endl;
2286 for(
int j=0;j<N[0];j++){
2287 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2291 MPI_Barrier(T_plan->comm);
2295 reshuffle_time+=MPI_Wtime();
// NOTE(review): `delete [] request` appears elided from this chunk.
2297 delete [] s_request;
// Per-phase timing report (rank 0 only).
2301 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2302 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2303 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2304 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2306 timings[0]+=MPI_Wtime();
2307 timings[1]+=shuffle_time;
2308 timings[2]+=comm_time;
2309 timings[3]+=reshuffle_time;
// Variant v2 of the distributed GPU transpose: the communication phase uses a
// single collective — MPI_Alltoallv for uneven decompositions, MPI_Alltoall
// when the plan is evenly distributed — staged through the host buffers.
// NOTE(review): extraction-garbled chunk — lines split, stray original line
// numbers fused in, and some lines elided (closing braces, the communicator
// argument of the collectives, declarations of `ptr`). Code left
// byte-identical; comments only.
2318 void fast_transpose_cuda_v2(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
2322 std::bitset<8> Flags(flags);
// Single-rank, output-transpose-only case delegates to transpose_cuda_v6
// (note: v6 here, unlike the v5 delegation in the v1 variants).
2323 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2324 MPI_Barrier(T_plan->comm);
2328 transpose_cuda_v6(T_plan,(
double*)data,timings,flags,howmany,tag);
2329 MPI_Barrier(T_plan->comm);
// Start total-time accounting.
2332 timings[0]-=MPI_Wtime();
2334 nprocs=T_plan->nprocs;
2335 procid=T_plan->procid;
2336 int nprocs_1=T_plan->nprocs_1;
2337 ptrdiff_t *N=T_plan->N;
// Host staging buffers and device scratch owned by the plan.
2339 double * data_cpu=T_plan->buffer;
2340 double * send_recv_cpu = T_plan->buffer_2;
2341 double * send_recv_d = T_plan->buffer_d;
2344 ptrdiff_t local_n0=T_plan->local_n0;
2345 ptrdiff_t local_n1=T_plan->local_n1;
2346 ptrdiff_t n_tuples=T_plan->n_tuples;
2348 int idist=N[1]*local_n0*n_tuples;
2349 int odist=N[0]*local_n1*n_tuples;
// `0*MPI_Wtime()` is just a zero-initializer for the timers.
2351 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2); loop braces elided in this view.
2353 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2355 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2356 for(
int h=0;h<howmany;h++)
2357 for(
int id=0;
id<nprocs;++id){
2359 for(
int i=0;i<local_n0;i++){
2360 std::cout<<std::endl;
2361 for(
int j=0;j<N[1];j++){
2362 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2366 MPI_Barrier(T_plan->comm);
2373 shuffle_time-=MPI_Wtime();
// Single-process fast paths: local GPU transposes only.
2376 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2377 #pragma omp parallel for
2378 for(
int h=0;h<howmany;h++)
2379 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2381 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2382 #pragma omp parallel for
2383 for(
int h=0;h<howmany;h++)
2384 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): presumably the nprocs==1 path returns here (return elided).
2387 shuffle_time+=MPI_Wtime();
2388 timings[0]+=MPI_Wtime();
2389 timings[0]+=shuffle_time;
2390 timings[1]+=shuffle_time;
2393 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): GPU block-column transpose into send_recv_d.
2399 local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2401 shuffle_time+=MPI_Wtime();
// Debug dump after the shuffle; `ptr` declared in an elided line.
2403 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2405 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2406 for(
int h=0;h<howmany;h++)
2407 for(
int id=0;
id<nprocs;++id){
2409 for(
int i=0;i<N[1];i++){
2410 std::cout<<std::endl;
2411 for(
int j=0;j<local_n0;j++){
2412 std::cout<<
'\t'<<data_cpu[ptr];
2417 MPI_Barrier(T_plan->comm);
// Flattened (`_f`) counts/offsets tailored for the collective calls.
2425 int* scount_proc_f= T_plan->scount_proc_f;
2426 int* rcount_proc_f= T_plan->rcount_proc_f;
2427 int* soffset_proc_f= T_plan->soffset_proc_f;
2428 int* roffset_proc_f= T_plan->roffset_proc_f;
2430 MPI_Barrier(T_plan->comm);
2433 comm_time-=MPI_Wtime();
// Request arrays are allocated and nulled but no point-to-point calls are
// visible in this variant — the collective below does the exchange.
2435 MPI_Request * s_request=
new MPI_Request[nprocs];
2436 MPI_Request * request=
new MPI_Request[nprocs];
2437 #pragma omp parallel for
2438 for (
int proc=0;proc<nprocs;++proc){
2439 request[proc]=MPI_REQUEST_NULL;
2440 s_request[proc]=MPI_REQUEST_NULL;
2442 double *s_buf, *r_buf;
2443 s_buf=data_cpu; r_buf=send_recv_cpu;
// Phase 2 (comm): stage device scratch to host, then one collective exchange.
// NOTE(review): the trailing communicator argument of both collectives is in
// elided lines; presumably T_plan->comm.
2448 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2449 if(T_plan->is_evenly_distributed==0)
2450 MPI_Alltoallv(s_buf,scount_proc_f,
2451 soffset_proc_f, MPI_DOUBLE,r_buf,
2452 rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
2455 MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
2456 r_buf, rcount_proc_f[0], MPI_DOUBLE,
// Unstage: host receive buffer back to the device; both targets appear here,
// an elided conditional presumably selects between them.
2462 cudaMemcpy(data, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2464 cudaMemcpy(send_recv_d, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2465 comm_time+=MPI_Wtime();
2468 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2470 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2471 for(
int h=0;h<howmany;h++)
2472 for(
int id=0;
id<nprocs;++id){
2474 for(
int i=0;i<local_n1;i++){
2475 std::cout<<std::endl;
2476 for(
int j=0;j<N[0];j++){
2477 std::cout<<
'\t'<<data_cpu[ptr];
2482 MPI_Barrier(T_plan->comm);
2488 reshuffle_time-=MPI_Wtime();
// Phase 3 (reshuffle): final out-of-place GPU transpose, scratch -> data.
2491 local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2496 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2498 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2499 for(
int h=0;h<howmany;h++)
2500 for(
int id=0;
id<nprocs_1;++id){
2502 for(
int i=0;i<local_n1;i++){
2503 std::cout<<std::endl;
2504 for(
int j=0;j<N[0];j++){
2505 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2509 MPI_Barrier(T_plan->comm);
2513 reshuffle_time+=MPI_Wtime();
// NOTE(review): `delete [] request` appears elided from this chunk.
2515 delete [] s_request;
// Per-phase timing report (rank 0 only).
2519 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2520 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2521 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2522 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2524 timings[0]+=MPI_Wtime();
2525 timings[1]+=shuffle_time;
2526 timings[2]+=comm_time;
2527 timings[3]+=reshuffle_time;
// Variant v3 of the distributed GPU transpose: the communication phase uses
// the project's k-way dense all-to-all (par::Mpi_Alltoallv_dense), host-staged;
// the `kway` parameter selects the fan-out and kway_async picks the
// async/sync template instantiation.
// NOTE(review): extraction-garbled chunk — lines split, stray original line
// numbers fused in, and some lines elided (closing braces, `ptr` declaration,
// early returns). Code left byte-identical; comments only.
2531 void fast_transpose_cuda_v3(T_Plan_gpu* T_plan,
double * data,
double *timings,
int kway,
unsigned flags,
int howmany,
int tag ){
2535 std::bitset<8> Flags(flags);
// Single-rank, output-transpose-only case delegates to transpose_cuda_v5.
2536 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2537 MPI_Barrier(T_plan->comm);
2541 transpose_cuda_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
2542 MPI_Barrier(T_plan->comm);
// Start total-time accounting.
2545 timings[0]-=MPI_Wtime();
2547 nprocs=T_plan->nprocs;
2548 procid=T_plan->procid;
2549 int nprocs_1=T_plan->nprocs_1;
2550 ptrdiff_t *N=T_plan->N;
// Host staging buffers and device scratch owned by the plan.
2552 double * data_cpu=T_plan->buffer;
2553 double * send_recv_cpu = T_plan->buffer_2;
2554 double * send_recv_d = T_plan->buffer_d;
2557 ptrdiff_t local_n0=T_plan->local_n0;
2558 ptrdiff_t local_n1=T_plan->local_n1;
2559 ptrdiff_t n_tuples=T_plan->n_tuples;
2561 int idist=N[1]*local_n0*n_tuples;
2562 int odist=N[0]*local_n1*n_tuples;
// `0*MPI_Wtime()` is just a zero-initializer for the timers.
2564 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2); loop braces elided in this view.
2566 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2568 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2569 for(
int h=0;h<howmany;h++)
2570 for(
int id=0;
id<nprocs;++id){
2572 for(
int i=0;i<local_n0;i++){
2573 std::cout<<std::endl;
2574 for(
int j=0;j<N[1];j++){
2575 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2579 MPI_Barrier(T_plan->comm);
2586 shuffle_time-=MPI_Wtime();
// Single-process fast paths: local GPU transposes only.
2589 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2590 #pragma omp parallel for
2591 for(
int h=0;h<howmany;h++)
2592 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2594 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2595 #pragma omp parallel for
2596 for(
int h=0;h<howmany;h++)
2597 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): presumably the nprocs==1 path returns here (return elided).
2600 shuffle_time+=MPI_Wtime();
2601 timings[0]+=MPI_Wtime();
2602 timings[0]+=shuffle_time;
2603 timings[1]+=shuffle_time;
2606 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): GPU block-column transpose into send_recv_d.
2612 local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2614 shuffle_time+=MPI_Wtime();
// Debug dump after the shuffle; `ptr` declared in an elided line.
2616 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2618 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2619 for(
int h=0;h<howmany;h++)
2620 for(
int id=0;
id<nprocs;++id){
2622 for(
int i=0;i<N[1];i++){
2623 std::cout<<std::endl;
2624 for(
int j=0;j<local_n0;j++){
2625 std::cout<<
'\t'<<data_cpu[ptr];
2630 MPI_Barrier(T_plan->comm);
// Flattened (`_f`) counts/offsets for the dense all-to-all.
2638 int* scount_proc_f= T_plan->scount_proc_f;
2639 int* rcount_proc_f= T_plan->rcount_proc_f;
2640 int* soffset_proc_f= T_plan->soffset_proc_f;
2641 int* roffset_proc_f= T_plan->roffset_proc_f;
2643 MPI_Barrier(T_plan->comm);
2646 comm_time-=MPI_Wtime();
// Request arrays allocated and nulled; the dense all-to-all below does the
// actual exchange.
2648 MPI_Request * s_request=
new MPI_Request[nprocs];
2649 MPI_Request * request=
new MPI_Request[nprocs];
2650 #pragma omp parallel for
2651 for (
int proc=0;proc<nprocs;++proc){
2652 request[proc]=MPI_REQUEST_NULL;
2653 s_request[proc]=MPI_REQUEST_NULL;
2655 double *s_buf, *r_buf;
2656 s_buf=data_cpu; r_buf=send_recv_cpu;
// Phase 2 (comm): stage device scratch to host, then k-way dense all-to-all;
// the boolean template parameter follows T_plan->kway_async.
2661 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2662 if(T_plan->kway_async)
2663 par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
2664 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
2666 par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
2667 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
// Unstage: host receive buffer back to the device; both targets appear here,
// an elided conditional presumably selects between them.
2671 cudaMemcpy(data, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2673 cudaMemcpy(send_recv_d, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2674 comm_time+=MPI_Wtime();
2677 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2679 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2680 for(
int h=0;h<howmany;h++)
2681 for(
int id=0;
id<nprocs;++id){
2683 for(
int i=0;i<local_n1;i++){
2684 std::cout<<std::endl;
2685 for(
int j=0;j<N[0];j++){
2686 std::cout<<
'\t'<<data_cpu[ptr];
2691 MPI_Barrier(T_plan->comm);
2697 reshuffle_time-=MPI_Wtime();
// Phase 3 (reshuffle): final out-of-place GPU transpose, scratch -> data.
2700 local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2705 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2707 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2708 for(
int h=0;h<howmany;h++)
2709 for(
int id=0;
id<nprocs_1;++id){
2711 for(
int i=0;i<local_n1;i++){
2712 std::cout<<std::endl;
2713 for(
int j=0;j<N[0];j++){
2714 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2718 MPI_Barrier(T_plan->comm);
2722 reshuffle_time+=MPI_Wtime();
// NOTE(review): `delete [] request` appears elided from this chunk.
2724 delete [] s_request;
// Per-phase timing report (rank 0 only).
2728 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2729 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2730 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2731 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2733 timings[0]+=MPI_Wtime();
2734 timings[1]+=shuffle_time;
2735 timings[2]+=comm_time;
2736 timings[3]+=reshuffle_time;
// Variant v3_2 of the distributed GPU transpose: like v3 but the k-way dense
// all-to-all operates directly on device buffers (par::Mpi_Alltoallv_dense_gpu),
// skipping the host staging of the exchange itself.
// NOTE(review): extraction-garbled chunk — lines split, stray original line
// numbers fused in, and some lines elided (closing braces, early returns,
// declarations of `ptr`/`r_buf_d`, the conditions guarding the two delegations
// at the top). Code left byte-identical; comments only.
2741 void fast_transpose_cuda_v3_2(T_Plan_gpu* T_plan,
double * data,
double *timings,
int kway,
unsigned flags,
int howmany,
int tag ){
2745 std::bitset<8> Flags(flags);
// Delegations: transpose_cuda_v7_2 for the single-rank flag case, and
// fast_transpose_cuda_v3 under a second condition whose guard is elided here.
2746 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2747 MPI_Barrier(T_plan->comm);
2751 transpose_cuda_v7_2(T_plan,(
double*)data,timings,kway,flags,howmany, tag);
2752 MPI_Barrier(T_plan->comm);
2756 fast_transpose_cuda_v3(T_plan,(
double*)data,timings,kway,flags,howmany, tag);
2757 MPI_Barrier(T_plan->comm);
// Start total-time accounting.
2760 timings[0]-=MPI_Wtime();
2762 nprocs=T_plan->nprocs;
2763 procid=T_plan->procid;
2764 int nprocs_1=T_plan->nprocs_1;
2765 ptrdiff_t *N=T_plan->N;
// Host debug buffer and device scratch (no send_recv_cpu: exchange is
// GPU-direct in this variant).
2767 double * data_cpu=T_plan->buffer;
2769 double * send_recv_d = T_plan->buffer_d;
2772 ptrdiff_t local_n0=T_plan->local_n0;
2773 ptrdiff_t local_n1=T_plan->local_n1;
2774 ptrdiff_t n_tuples=T_plan->n_tuples;
2776 int idist=N[1]*local_n0*n_tuples;
2777 int odist=N[0]*local_n1*n_tuples;
// `0*MPI_Wtime()` is just a zero-initializer for the timers.
2779 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2); loop braces elided in this view.
2781 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2783 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2784 for(
int h=0;h<howmany;h++)
2785 for(
int id=0;
id<nprocs;++id){
2787 for(
int i=0;i<local_n0;i++){
2788 std::cout<<std::endl;
2789 for(
int j=0;j<N[1];j++){
2790 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2794 MPI_Barrier(T_plan->comm);
2801 shuffle_time-=MPI_Wtime();
// Single-process fast paths: local GPU transposes only.
2804 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2805 #pragma omp parallel for
2806 for(
int h=0;h<howmany;h++)
2807 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2809 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2810 #pragma omp parallel for
2811 for(
int h=0;h<howmany;h++)
2812 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): presumably the nprocs==1 path returns here (return elided).
2815 shuffle_time+=MPI_Wtime();
2816 timings[0]+=MPI_Wtime();
2817 timings[0]+=shuffle_time;
2818 timings[1]+=shuffle_time;
2821 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): GPU block-column transpose into send_recv_d.
2827 local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2829 shuffle_time+=MPI_Wtime();
// Debug dump after the shuffle; `ptr` declared in an elided line.
2831 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2833 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2834 for(
int h=0;h<howmany;h++)
2835 for(
int id=0;
id<nprocs;++id){
2837 for(
int i=0;i<N[1];i++){
2838 std::cout<<std::endl;
2839 for(
int j=0;j<local_n0;j++){
2840 std::cout<<
'\t'<<data_cpu[ptr];
2845 MPI_Barrier(T_plan->comm);
// Flattened (`_f`) counts/offsets for the dense all-to-all.
2853 int* scount_proc_f= T_plan->scount_proc_f;
2854 int* rcount_proc_f= T_plan->rcount_proc_f;
2855 int* soffset_proc_f= T_plan->soffset_proc_f;
2856 int* roffset_proc_f= T_plan->roffset_proc_f;
2858 MPI_Barrier(T_plan->comm);
2861 comm_time-=MPI_Wtime();
// Request arrays allocated and nulled; the GPU-direct dense all-to-all below
// does the actual exchange.
2863 MPI_Request * s_request=
new MPI_Request[nprocs];
2864 MPI_Request * request=
new MPI_Request[nprocs];
2865 #pragma omp parallel for
2866 for (
int proc=0;proc<nprocs;++proc){
2867 request[proc]=MPI_REQUEST_NULL;
2868 s_request[proc]=MPI_REQUEST_NULL;
// Device receive buffer aliases the scratch buffer (exchange is in-place on
// the device side). NOTE(review): `r_buf_d` declaration elided from view.
2875 r_buf_d=send_recv_d;
// Phase 2 (comm): k-way dense all-to-all directly on device pointers;
// template flag follows T_plan->kway_async.
2881 if(T_plan->kway_async)
2882 par::Mpi_Alltoallv_dense_gpu<double,true>(send_recv_d , scount_proc_f, soffset_proc_f,
2883 r_buf_d, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
2885 par::Mpi_Alltoallv_dense_gpu<double,false>(send_recv_d , scount_proc_f, soffset_proc_f,
2886 r_buf_d, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
2890 comm_time+=MPI_Wtime();
2893 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2895 cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2896 for(
int h=0;h<howmany;h++)
2897 for(
int id=0;
id<nprocs;++id){
2899 for(
int i=0;i<local_n1;i++){
2900 std::cout<<std::endl;
2901 for(
int j=0;j<N[0];j++){
2902 std::cout<<
'\t'<<data_cpu[ptr];
2907 MPI_Barrier(T_plan->comm);
2913 reshuffle_time-=MPI_Wtime();
// Phase 3 (reshuffle): final out-of-place GPU transpose, scratch -> data.
2916 local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2921 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2923 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2924 for(
int h=0;h<howmany;h++)
2925 for(
int id=0;
id<nprocs_1;++id){
2927 for(
int i=0;i<local_n1;i++){
2928 std::cout<<std::endl;
2929 for(
int j=0;j<N[0];j++){
2930 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2934 MPI_Barrier(T_plan->comm);
2938 reshuffle_time+=MPI_Wtime();
// NOTE(review): `delete [] request` appears elided from this chunk.
2940 delete [] s_request;
// Per-phase timing report (rank 0 only).
2944 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2945 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2946 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2947 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2949 timings[0]+=MPI_Wtime();
2950 timings[1]+=shuffle_time;
2951 timings[2]+=comm_time;
2952 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// fast_transpose_cuda_v2_h
// Distributed transpose of GPU-resident data ("howmany" batched slabs),
// staging the all-to-all exchange through pinned host buffers and using
// MPI_Alltoallv / MPI_Alltoall. Phases: (1) shuffle — pack rows destined for
// each rank into send_recv_d on the device; (2) comm — D2H copy then MPI
// all-to-all on the host buffers; (3) reshuffle — H2D scatter of the received
// slabs back into `data`. timings[0..3] accumulate total/shuffle/comm/
// reshuffle wall time; timers are zero-initialized via the 0*MPI_Wtime() idiom.
//
// NOTE(review): this text is a lossy extraction of the original source — the
// leading integers are original line numbers, and several physical lines are
// missing (closing braces of the VERBOSE debug loops, `return` statements in
// the nprocs==1 fast paths, the declaration/initialization of `ptr`, and the
// communicator argument of the MPI calls). Comments below describe only the
// visible logic; confirm everything against the pristine source before
// editing.
// ---------------------------------------------------------------------------
2960 void fast_transpose_cuda_v2_h(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
2962 std::bitset<8> Flags(flags);
// Special case Flags==(0,1) on a single rank: delegate to transpose_cuda_v6.
2963 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2964 MPI_Barrier(T_plan->comm);
2968 transpose_cuda_v6(T_plan,(
double*)data,timings,flags,howmany,tag);
2969 MPI_Barrier(T_plan->comm);
// Start total timer (closed by the matching timings[0]+=MPI_Wtime() below).
2972 timings[0]-=MPI_Wtime();
// Pull the geometry out of the plan: global N[0]xN[1] grid of n_tuples-sized
// elements, split as local_n0 rows (input) / local_n1 columns (output).
2974 int nprocs_0, nprocs_1;
2975 nprocs=T_plan->nprocs;
2976 procid=T_plan->procid;
2977 nprocs_0=T_plan->nprocs_0;
2978 nprocs_1=T_plan->nprocs_1;
2979 ptrdiff_t *N=T_plan->N;
2980 ptrdiff_t local_n0=T_plan->local_n0;
2981 ptrdiff_t local_n1=T_plan->local_n1;
2982 ptrdiff_t n_tuples=T_plan->n_tuples;
// Host staging buffers (pinned, see Mem_Mgr_gpu) and the device scratch.
2984 double * data_cpu=T_plan->buffer;
2985 double * send_recv_cpu = T_plan->buffer_2;
2986 double * send_recv_d = T_plan->buffer_d;
// Per-batch strides: idist for input layout, odist for transposed layout.
2988 int idist=N[1]*local_n0*n_tuples;
2989 int odist=N[0]*local_n1*n_tuples;
2991 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (active only when VERBOSE>=2; the guard braces for
// the cudaMemcpy+loops below were lost in extraction, as was `ptr`'s decl).
2994 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2996 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2997 for(
int h=0;h<howmany;h++)
2998 for(
int id=0;
id<nprocs;++id){
3000 for(
int i=0;i<local_n0;i++){
3001 std::cout<<std::endl;
3002 for(
int j=0;j<N[1];j++){
3003 ptr=h*idist+(i*N[1]+j)*n_tuples;
3004 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3008 MPI_Barrier(T_plan->comm);
// Per-rank slab geometry used for packing/unpacking.
3015 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
3016 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
3017 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
3018 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
3019 shuffle_time-=MPI_Wtime();
// Single-rank fast paths: transpose entirely locally and (presumably, the
// `return` lines are missing here) exit early after updating the timers.
3020 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
3021 #pragma omp parallel for
3022 for(
int h=0;h<howmany;h++)
3023 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
3025 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
3026 #pragma omp parallel for
3027 for(
int h=0;h<howmany;h++)
3028 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
3031 shuffle_time+=MPI_Wtime();
3032 timings[0]+=MPI_Wtime();
3033 timings[0]+=shuffle_time;
3034 timings[1]+=shuffle_time;
3037 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): gather, for each destination rank, the column block
// [local_1_start_proc[proc], +local_n1_proc[proc]) of every row into the
// contiguous device send buffer. NOTE(review): both this explicit D2D loop
// and the memcpy_v1_h1 helper call appear; in the original one of them is
// likely compiled out (#ifdef) — confirm.
3045 for (
int proc=0;proc<nprocs_1;++proc)
3046 for(
int h=0;h<howmany;++h){
3047 for(
int i=0;i<local_n0;++i){
3054 cudaMemcpy(&send_recv_d[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples] ,
sizeof(
double)*n_tuples*local_n1_proc[proc] , cudaMemcpyDeviceToDevice);
3055 ptr+=n_tuples*local_n1_proc[proc];
3058 memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
3059 cudaDeviceSynchronize();
3061 shuffle_time+=MPI_Wtime();
// Debug dump of the packed buffer (VERBOSE>=2; guard braces truncated).
3063 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
3065 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3066 for(
int id=0;
id<nprocs;++id){
3067 for(
int h=0;h<howmany;h++)
3069 for(
int i=0;i<N[1];i++){
3070 std::cout<<std::endl;
3071 for(
int j=0;j<local_n0;j++){
3072 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3077 MPI_Barrier(T_plan->comm);
// Phase 2 (comm): "_f" counts/offsets are the forward-direction exchange
// sizes precomputed in the plan.
3086 int* scount_proc_f= T_plan->scount_proc_f;
3087 int* rcount_proc_f= T_plan->rcount_proc_f;
3088 int* soffset_proc_f= T_plan->soffset_proc_f;
3089 int* roffset_proc_f= T_plan->roffset_proc_f;
3091 MPI_Barrier(T_plan->comm);
3094 comm_time-=MPI_Wtime();
// Stage through host: D2H copy of the packed buffer, then all-to-all.
// Alltoallv for uneven distributions, plain Alltoall otherwise.
// NOTE(review): the communicator argument lines of both calls are missing
// from this extraction (presumably T_plan->comm).
3097 double *s_buf, *r_buf;
3098 s_buf=data_cpu; r_buf=send_recv_cpu;
3102 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3103 if(T_plan->is_evenly_distributed==0)
3104 MPI_Alltoallv(s_buf,scount_proc_f,
3105 soffset_proc_f, MPI_DOUBLE,r_buf,
3106 rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
3109 MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
3110 r_buf, rcount_proc_f[0], MPI_DOUBLE,
3116 comm_time+=MPI_Wtime();
3120 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
3122 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3123 for(
int id=0;
id<nprocs;++id){
3125 for(
int h=0;h<howmany;h++)
3126 for(
int i=0;i<local_n1;i++){
3127 std::cout<<std::endl;
3128 for(
int j=0;j<N[0];j++){
3129 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3134 MPI_Barrier(T_plan->comm);
// Phase 3 (reshuffle): scatter each source rank's received rows from the
// host recv buffer back to the device output layout. As in phase 1, the
// explicit loop and memcpy_v1_h2 look mutually exclusive in the original.
3141 reshuffle_time-=MPI_Wtime();
3144 for (
int proc=0;proc<nprocs_0;++proc)
3145 for(
int h=0;h<howmany;++h){
3146 for(
int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
3148 cudaMemcpy( &data[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*
sizeof(
double)*n_tuples,cudaMemcpyHostToDevice);
3150 ptr+=n_tuples*local_n1;
3158 memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_cpu);
3159 cudaDeviceSynchronize();
3161 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
3163 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3164 for(
int id=0;
id<nprocs_1;++id){
3166 for(
int h=0;h<howmany;h++)
3167 for(
int i=0;i<N[0];i++){
3168 std::cout<<std::endl;
3169 for(
int j=0;j<local_n1;j++){
3170 ptr=h*odist+(i*local_n1+j)*n_tuples;
3171 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3175 MPI_Barrier(T_plan->comm);
// Optional final in-place local transpose of each output slab.
// NOTE(review): the original likely guards this with a Flags test — the
// guard line is missing from the extraction; confirm before relying on it.
3181 #pragma omp parallel for
3182 for(
int h=0;h<howmany;h++)
3183 local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
3185 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
3187 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3188 for(
int id=0;
id<nprocs_1;++id){
3190 for(
int h=0;h<howmany;h++)
3191 for(
int i=0;i<N[0];i++){
3192 std::cout<<std::endl;
3193 for(
int j=0;j<local_n1;j++){
3194 ptr=h*odist+(i*local_n1+j)*n_tuples;
3195 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3199 MPI_Barrier(T_plan->comm);
3204 reshuffle_time+=MPI_Wtime();
3205 MPI_Barrier(T_plan->comm);
// Timing report (PCOUT prints only on rank 0; presumably under a VERBOSE
// guard lost in extraction) and accumulation into the caller's timings[].
3209 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
3210 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
3211 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
3212 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
3214 timings[0]+=MPI_Wtime();
3215 timings[1]+=shuffle_time;
3216 timings[2]+=comm_time;
3217 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// fast_transpose_cuda_v3_h
// Same host-staged distributed transpose as fast_transpose_cuda_v2_h, but the
// all-to-all exchange uses the k-way hypercube routine
// par::Mpi_Alltoallv_dense<double, async> (template flag chosen at runtime
// from T_plan->kway_async) with the caller-supplied `kway` radix, instead of
// MPI_Alltoallv/MPI_Alltoall.
//
// NOTE(review): lossy extraction — leading integers are original line
// numbers; closing braces of debug loops, `return` statements, `ptr`'s
// declaration, and `delete [] request` (only s_request's delete is visible)
// appear to be missing. Verify against the pristine source.
// ---------------------------------------------------------------------------
3221 void fast_transpose_cuda_v3_h(T_Plan_gpu* T_plan,
double * data,
double *timings,
int kway,
unsigned flags,
int howmany ,
int tag){
3223 std::bitset<8> Flags(flags);
// Flags==(0,1) with one rank is handled by transpose_cuda_v6.
3224 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
3225 MPI_Barrier(T_plan->comm);
3229 transpose_cuda_v6(T_plan,(
double*)data,timings,flags,howmany,tag);
3230 MPI_Barrier(T_plan->comm);
3233 timings[0]-=MPI_Wtime();
// Plan geometry and staging buffers (pinned host buffers + device scratch).
3235 int nprocs_0, nprocs_1;
3236 nprocs=T_plan->nprocs;
3237 procid=T_plan->procid;
3238 nprocs_0=T_plan->nprocs_0;
3239 nprocs_1=T_plan->nprocs_1;
3240 ptrdiff_t *N=T_plan->N;
3241 ptrdiff_t local_n0=T_plan->local_n0;
3242 ptrdiff_t local_n1=T_plan->local_n1;
3243 ptrdiff_t n_tuples=T_plan->n_tuples;
3245 double * data_cpu=T_plan->buffer;
3246 double * send_recv_cpu = T_plan->buffer_2;
3247 double * send_recv_d = T_plan->buffer_d;
// Per-batch input/output strides.
3249 int idist=N[1]*local_n0*n_tuples;
3250 int odist=N[0]*local_n1*n_tuples;
3252 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2; guard braces lost in extraction).
3255 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
3257 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3258 for(
int h=0;h<howmany;h++)
3259 for(
int id=0;
id<nprocs;++id){
3261 for(
int i=0;i<local_n0;i++){
3262 std::cout<<std::endl;
3263 for(
int j=0;j<N[1];j++){
3264 ptr=h*idist+(i*N[1]+j)*n_tuples;
3265 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3269 MPI_Barrier(T_plan->comm);
3276 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
3277 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
3278 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
3279 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
3280 shuffle_time-=MPI_Wtime();
// Single-rank fast paths (original presumably returns early here).
3281 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
3282 #pragma omp parallel for
3283 for(
int h=0;h<howmany;h++)
3284 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
3286 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
3287 #pragma omp parallel for
3288 for(
int h=0;h<howmany;h++)
3289 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
3292 shuffle_time+=MPI_Wtime();
3293 timings[0]+=MPI_Wtime();
3294 timings[0]+=shuffle_time;
3295 timings[1]+=shuffle_time;
3298 MPI_Barrier(T_plan->comm);
// Shuffle: pack each destination rank's column block into send_recv_d
// (explicit D2D loop and memcpy_v1_h1 helper both visible — one is likely
// #ifdef'd out in the original).
3306 for (
int proc=0;proc<nprocs_1;++proc)
3307 for(
int h=0;h<howmany;++h){
3308 for(
int i=0;i<local_n0;++i){
3315 cudaMemcpy(&send_recv_d[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples] ,
sizeof(
double)*n_tuples*local_n1_proc[proc] , cudaMemcpyDeviceToDevice);
3316 ptr+=n_tuples*local_n1_proc[proc];
3319 memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
3320 cudaDeviceSynchronize();
3322 shuffle_time+=MPI_Wtime();
3324 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
3326 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3327 for(
int id=0;
id<nprocs;++id){
3328 for(
int h=0;h<howmany;h++)
3330 for(
int i=0;i<N[1];i++){
3331 std::cout<<std::endl;
3332 for(
int j=0;j<local_n0;j++){
3333 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3338 MPI_Barrier(T_plan->comm);
// Forward-direction exchange counts/offsets from the plan.
3347 int* scount_proc_f= T_plan->scount_proc_f;
3348 int* rcount_proc_f= T_plan->rcount_proc_f;
3349 int* soffset_proc_f= T_plan->soffset_proc_f;
3350 int* roffset_proc_f= T_plan->roffset_proc_f;
3352 MPI_Barrier(T_plan->comm);
3355 comm_time-=MPI_Wtime();
// Request arrays initialized to MPI_REQUEST_NULL. NOTE(review): only
// s_request is visibly deleted below — `delete [] request` appears to be on
// a line lost in extraction.
3357 MPI_Request * s_request=
new MPI_Request[nprocs];
3358 MPI_Request * request=
new MPI_Request[nprocs];
3359 #pragma omp parallel for
3360 for (
int proc=0;proc<nprocs;++proc){
3361 request[proc]=MPI_REQUEST_NULL;
3362 s_request[proc]=MPI_REQUEST_NULL;
// Stage the packed device buffer to the host, then run the k-way dense
// all-to-all (async or sync variant per T_plan->kway_async).
3365 double *s_buf, *r_buf;
3366 s_buf=data_cpu; r_buf=send_recv_cpu;
3370 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3371 if(T_plan->kway_async)
3372 par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
3373 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
3375 par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
3376 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
3380 comm_time+=MPI_Wtime();
3384 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
3386 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3387 for(
int id=0;
id<nprocs;++id){
3389 for(
int h=0;h<howmany;h++)
3390 for(
int i=0;i<local_n1;i++){
3391 std::cout<<std::endl;
3392 for(
int j=0;j<N[0];j++){
3393 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3398 MPI_Barrier(T_plan->comm);
// Reshuffle: H2D scatter of each source rank's rows into the output layout.
3405 reshuffle_time-=MPI_Wtime();
3408 for (
int proc=0;proc<nprocs_0;++proc)
3409 for(
int h=0;h<howmany;++h){
3410 for(
int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
3412 cudaMemcpy( &data[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*
sizeof(
double)*n_tuples,cudaMemcpyHostToDevice);
3414 ptr+=n_tuples*local_n1;
3422 memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_cpu);
3423 cudaDeviceSynchronize();
3425 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
3427 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3428 for(
int id=0;
id<nprocs_1;++id){
3430 for(
int h=0;h<howmany;h++)
3431 for(
int i=0;i<N[0];i++){
3432 std::cout<<std::endl;
3433 for(
int j=0;j<local_n1;j++){
3434 ptr=h*odist+(i*local_n1+j)*n_tuples;
3435 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3439 MPI_Barrier(T_plan->comm);
// Optional final local transpose of each output slab (guard line presumably
// lost in extraction — confirm against the original).
3445 #pragma omp parallel for
3446 for(
int h=0;h<howmany;h++)
3447 local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
3449 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
3451 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3452 for(
int id=0;
id<nprocs_1;++id){
3454 for(
int h=0;h<howmany;h++)
3455 for(
int i=0;i<N[0];i++){
3456 std::cout<<std::endl;
3457 for(
int j=0;j<local_n1;j++){
3458 ptr=h*odist+(i*local_n1+j)*n_tuples;
3459 std::cout<<
'\t'<<data_cpu[ptr]<<
","<<data_cpu[ptr+1];
3463 MPI_Barrier(T_plan->comm);
3468 reshuffle_time+=MPI_Wtime();
3469 MPI_Barrier(T_plan->comm);
3471 delete [] s_request;
// Rank-0 timing report and accumulation into timings[0..3]
// (total/shuffle/comm/reshuffle).
3475 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
3476 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
3477 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
3478 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
3480 timings[0]+=MPI_Wtime();
3481 timings[1]+=shuffle_time;
3482 timings[2]+=comm_time;
3483 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// transpose_cuda_v5
// Distributed transpose variant: full local transpose on the device first,
// then a host-staged exchange built from nonblocking MPI_Irecv/MPI_Isend
// with the plan's derived datatypes (stype/rtype), a plain memcpy for the
// self-exchange, MPI_Wait on all requests, and finally a device-side unpack
// via local_transpose_cuda.
//
// NOTE(review): lossy extraction — leading integers are original line
// numbers; `ierr` (an MPI_Status in the original, presumably) and several
// guard/brace lines are missing. Verify against the pristine source.
// ---------------------------------------------------------------------------
3487 void transpose_cuda_v5(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
3489 std::bitset<8> Flags(flags);
// Flags==(0,1) on one rank: nothing to exchange (early-return path; the
// return line itself appears to be missing from the extraction).
3490 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
3491 MPI_Barrier(T_plan->comm);
3494 timings[0]-=MPI_Wtime();
// Plan geometry, host staging buffers, and device scratch.
3496 int nprocs_0, nprocs_1;
3497 nprocs=T_plan->nprocs;
3498 procid=T_plan->procid;
3499 nprocs_0=T_plan->nprocs_0;
3500 nprocs_1=T_plan->nprocs_1;
3501 ptrdiff_t *N=T_plan->N;
3502 double* data_cpu=T_plan->buffer;
3503 double * send_recv_cpu = T_plan->buffer_2;
3504 double * send_recv = T_plan->buffer_d;
3505 ptrdiff_t local_n0=T_plan->local_n0;
3506 ptrdiff_t local_n1=T_plan->local_n1;
3507 ptrdiff_t n_tuples=T_plan->n_tuples;
3508 int idist=N[1]*local_n0*n_tuples;
3509 int odist=N[0]*local_n1*n_tuples;
3511 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2; loop-closing braces lost).
3513 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
3515 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3516 for(
int h=0;h<howmany;h++)
3517 for(
int id=0;
id<nprocs;++id){
3519 for(
int i=0;i<local_n0;i++){
3520 std::cout<<std::endl;
3521 for(
int j=0;j<N[1];j++){
3522 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
3526 MPI_Barrier(T_plan->comm);
// Shuffle: in-place local transpose of every batch slab on the device.
3532 shuffle_time-=MPI_Wtime();
3535 for(
int h=0;h<howmany;h++)
3536 local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
3538 cudaDeviceSynchronize();
3540 shuffle_time+=MPI_Wtime();
3541 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
3543 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3544 for(
int h=0;h<howmany;h++)
3545 for(
int id=0;
id<nprocs;++id){
3547 for(
int i=0;i<N[1];i++){
3548 std::cout<<std::endl;
3549 for(
int j=0;j<local_n0;j++){
3550 std::cout<<
'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
3554 MPI_Barrier(T_plan->comm);
// Single-rank (1,1) path: a second local transpose completes the job
// (original presumably returns early after the timer updates).
3558 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
3559 for(
int h=0;h<howmany;h++)
3560 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
3563 timings[0]+=MPI_Wtime();
3564 timings[1]+=shuffle_time;
// Comm: byte counts/offsets per peer from the plan.
3575 int* scount_proc= T_plan->scount_proc;
3577 int* soffset_proc= T_plan->soffset_proc;
3578 int* roffset_proc= T_plan->roffset_proc;
3580 comm_time-=MPI_Wtime();
3582 int soffset=0,roffset=0;
// VLA request arrays, all initialized to MPI_REQUEST_NULL.
3584 MPI_Request request[nprocs], s_request[nprocs];
3585 #pragma omp parallel for
3586 for (
int proc=0;proc<nprocs;++proc){
3587 request[proc]=MPI_REQUEST_NULL;
3588 s_request[proc]=MPI_REQUEST_NULL;
// Derived datatypes describe each peer's strided send/recv layout.
3591 MPI_Datatype *stype=T_plan->stype;
3592 MPI_Datatype *rtype=T_plan->rtype;
3593 MPI_Barrier(T_plan->comm);
// Post all receives first, then stage data to the host and post sends.
3596 for (
int proc=0;proc<nprocs;++proc){
3598 roffset=roffset_proc[proc];
3599 MPI_Irecv(&send_recv_cpu[roffset],1, rtype[proc], proc,
3600 tag, T_plan->comm, &request[proc]);
3604 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3605 for (
int proc=0;proc<nprocs;++proc){
3607 soffset=soffset_proc[proc];
3608 MPI_Isend(&data_cpu[soffset],1, stype[proc],proc, tag,
3609 T_plan->comm, &s_request[proc]);
// Self-exchange handled with a plain host memcpy per batch (no MPI needed).
3613 soffset=soffset_proc[procid];
3614 roffset=roffset_proc[procid];
3615 for(
int h=0;h<howmany;h++)
3616 memcpy(&send_recv_cpu[h*odist+roffset],&data_cpu[h*idist+soffset],
sizeof(
double)*scount_proc[procid]);
// Drain all requests. NOTE(review): `ierr` used as the MPI_Status* has no
// visible declaration here — lost in extraction.
3617 for (
int proc=0;proc<nprocs;++proc){
3618 MPI_Wait(&request[proc], &ierr);
3619 MPI_Wait(&s_request[proc], &ierr);
// Move the assembled recv buffer back to the device.
3622 cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
3623 cudaDeviceSynchronize();
3624 comm_time+=MPI_Wtime();
3626 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
3628 for(
int h=0;h<howmany;h++)
3629 for(
int id=0;
id<nprocs;++id){
3631 for(
int i=0;i<local_n1;i++){
3632 std::cout<<std::endl;
3633 for(
int j=0;j<N[0];j++){
3634 std::cout<<
'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];
3638 MPI_Barrier(T_plan->comm);
// Reshuffle: interleave per-rank blocks on the device. If every rank sent
// equal-sized blocks, a single batched local_transpose_cuda does it; the
// uneven last rank is handled with the two-step (nprocs_0-1 then 2-block)
// variant; the nprocs==1 case (guard line missing) is a straight D2D copy.
3648 reshuffle_time-=MPI_Wtime();
3655 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
3657 last_ntuples=T_plan->last_recv_count/((int)local_n1);
3659 for(
int h=0;h<howmany;h++){
3661 cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice);
3662 else if(last_ntuples!=first_ntuples){
3663 local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
3664 local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
3666 else if(last_ntuples==first_ntuples){
3668 local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
3671 cudaDeviceSynchronize();
3675 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
3677 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3678 for(
int h=0;h<howmany;h++)
3679 for(
int id=0;
id<nprocs_1;++id){
3681 for(
int i=0;i<local_n1;i++){
3682 std::cout<<std::endl;
3683 for(
int j=0;j<N[0];j++){
3684 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
3688 MPI_Barrier(T_plan->comm);
// Optional final local transpose (guard line presumably lost in extraction).
3692 for(
int h=0;h<howmany;h++)
3693 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
3695 reshuffle_time+=MPI_Wtime();
3696 MPI_Barrier(T_plan->comm);
// Rank-0 timing report and accumulation into timings[0..3].
3698 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
3699 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
3700 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
3701 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
3703 timings[0]+=MPI_Wtime();
3704 timings[1]+=shuffle_time;
3705 timings[2]+=comm_time;
3706 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// transpose_cuda_v5_2
// Like transpose_cuda_v5, but overlaps communication with H2D copies: after
// posting Irecv/Isend with derived datatypes, it polls each receive with
// MPI_Test and, as soon as a peer's block arrives, kicks off a
// cudaMemcpyAsync of that block to the device while other messages are still
// in flight. The self-exchange stays device-to-device.
//
// NOTE(review): lossy extraction — `counter`'s initialization/increment, the
// `while` body's closing braces, `ierr`'s declaration, and various guard
// lines are missing. Verify against the pristine source.
// ---------------------------------------------------------------------------
3710 void transpose_cuda_v5_2(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
3712 std::bitset<8> Flags(flags);
3713 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
3714 MPI_Barrier(T_plan->comm);
3717 timings[0]-=MPI_Wtime();
// Plan geometry, host staging buffers, and device scratch.
3719 int nprocs_0, nprocs_1;
3720 nprocs=T_plan->nprocs;
3721 procid=T_plan->procid;
3722 nprocs_0=T_plan->nprocs_0;
3723 nprocs_1=T_plan->nprocs_1;
3724 ptrdiff_t *N=T_plan->N;
3725 double* data_cpu=T_plan->buffer;
3726 double * send_recv_cpu = T_plan->buffer_2;
3727 double * send_recv = T_plan->buffer_d;
3728 ptrdiff_t local_n0=T_plan->local_n0;
3729 ptrdiff_t local_n1=T_plan->local_n1;
3730 ptrdiff_t n_tuples=T_plan->n_tuples;
3731 int idist=N[1]*local_n0*n_tuples;
3732 int odist=N[0]*local_n1*n_tuples;
3734 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2; guard braces lost in extraction).
3736 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
3738 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3739 for(
int h=0;h<howmany;h++)
3740 for(
int id=0;
id<nprocs;++id){
3742 for(
int i=0;i<local_n0;i++){
3743 std::cout<<std::endl;
3744 for(
int j=0;j<N[1];j++){
3745 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
3749 MPI_Barrier(T_plan->comm);
// Shuffle: device-side local transpose of each batch slab.
3755 shuffle_time-=MPI_Wtime();
3758 for(
int h=0;h<howmany;h++)
3759 local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
3761 cudaDeviceSynchronize();
3763 shuffle_time+=MPI_Wtime();
3764 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
3766 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3767 for(
int h=0;h<howmany;h++)
3768 for(
int id=0;
id<nprocs;++id){
3770 for(
int i=0;i<N[1];i++){
3771 std::cout<<std::endl;
3772 for(
int j=0;j<local_n0;j++){
3773 std::cout<<
'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
3777 MPI_Barrier(T_plan->comm);
// Single-rank (1,1) path (original presumably returns early afterwards).
3780 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
3781 for(
int h=0;h<howmany;h++)
3782 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
3785 timings[0]+=MPI_Wtime();
3786 timings[1]+=shuffle_time;
// Comm: per-peer counts/offsets, request arrays, and per-peer bookkeeping
// (flag = MPI_Test completion, color = "already copied to device").
3796 int* scount_proc= T_plan->scount_proc;
3797 int* rcount_proc= T_plan->rcount_proc;
3798 int* soffset_proc= T_plan->soffset_proc;
3799 int* roffset_proc= T_plan->roffset_proc;
3801 comm_time-=MPI_Wtime();
3803 int soffset=0,roffset=0;
3805 MPI_Request request[nprocs], s_request[nprocs];
3806 int flag[nprocs],color[nprocs];
3807 memset(flag,0,
sizeof(
int)*nprocs);
3808 memset(color,0,
sizeof(
int)*nprocs);
3809 #pragma omp parallel for
3810 for (
int proc=0;proc<nprocs;++proc){
3811 request[proc]=MPI_REQUEST_NULL;
3812 s_request[proc]=MPI_REQUEST_NULL;
3816 MPI_Datatype *stype=T_plan->stype;
3817 MPI_Datatype *rtype=T_plan->rtype;
// Post all receives first (self-rank is presumably skipped on a line lost in
// extraction).
3819 for (
int proc=0;proc<nprocs;++proc){
3821 roffset=roffset_proc[proc];
3822 MPI_Irecv(&send_recv_cpu[roffset],1, rtype[proc], proc,
3823 tag, T_plan->comm, &request[proc]);
// For each peer: stage only that peer's block D2H, then post the send.
3828 for (
int proc=0;proc<nprocs;++proc){
3830 soffset=soffset_proc[proc];
3831 for(
int h=0;h<howmany;h++)
3832 cudaMemcpy(&data_cpu[h*idist+soffset], &data[h*idist+soffset],
sizeof(
double)*scount_proc[proc], cudaMemcpyDeviceToHost);
3833 MPI_Isend(&data_cpu[soffset],1, stype[proc],proc, tag,
3834 T_plan->comm, &s_request[proc]);
// Self-exchange: straight device-to-device copy, no MPI.
3838 for(
int h=0;h<howmany;h++)
3839 cudaMemcpy(&send_recv[h*odist+roffset_proc[procid]], &data[h*idist+soffset_proc[procid]],
sizeof(
double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
// Overlap loop: poll receives; as each completes, asynchronously push that
// peer's block to the device. NOTE(review): `counter`'s declaration/
// initialization/increment and the color[] update are on lines lost in the
// extraction, as is `ierr`'s declaration.
3842 while(counter!=nprocs){
3844 for (
int proc=0;proc<nprocs;++proc){
3845 MPI_Test(&request[proc], &flag[proc],&ierr);
3846 if(flag[proc]==1 && color[proc]==0 && proc!=procid){
3847 for(
int h=0;h<howmany;h++)
3848 cudaMemcpyAsync(&send_recv[h*odist+roffset_proc[proc]],&send_recv_cpu[h*odist+roffset_proc[proc]],
sizeof(
double)*rcount_proc[proc],cudaMemcpyHostToDevice);
// Wait for all outstanding async H2D copies before using send_recv.
3855 cudaDeviceSynchronize();
3857 comm_time+=MPI_Wtime();
3859 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
3861 cudaMemcpy(send_recv_cpu, send_recv, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3862 for(
int h=0;h<howmany;h++)
3863 for(
int id=0;
id<nprocs;++id){
3865 for(
int i=0;i<local_n1;i++){
3866 std::cout<<std::endl;
3867 for(
int j=0;j<N[0];j++){
3868 std::cout<<
'\t'<<send_recv_cpu[odist*h+(i*N[0]+j)*n_tuples];
3872 MPI_Barrier(T_plan->comm);
// Reshuffle: same unpack scheme as transpose_cuda_v5 — batched transpose
// when all blocks are equal, two-step variant for an uneven last rank,
// plain D2D copy in the (guard line missing) nprocs==1 case.
3882 reshuffle_time-=MPI_Wtime();
3889 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
3891 last_ntuples=T_plan->last_recv_count/((int)local_n1);
3893 for(
int h=0;h<howmany;h++){
3895 cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice);
3896 else if(last_ntuples!=first_ntuples){
3897 local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
3898 local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
3900 else if(last_ntuples==first_ntuples){
3902 local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
3905 cudaDeviceSynchronize();
3908 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
3910 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3911 for(
int h=0;h<howmany;h++)
3912 for(
int id=0;
id<nprocs_1;++id){
3914 for(
int i=0;i<local_n1;i++){
3915 std::cout<<std::endl;
3916 for(
int j=0;j<N[0];j++){
3917 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
3921 MPI_Barrier(T_plan->comm);
// Optional final local transpose (guard line presumably lost in extraction).
3925 for(
int h=0;h<howmany;h++)
3926 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
3928 reshuffle_time+=MPI_Wtime();
3929 MPI_Barrier(T_plan->comm);
// Rank-0 timing report and accumulation into timings[0..3].
3931 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
3932 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
3933 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
3934 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
3936 timings[0]+=MPI_Wtime();
3937 timings[1]+=shuffle_time;
3938 timings[2]+=comm_time;
3939 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// transpose_cuda_v5_3
// Distributed transpose variant using one blocking MPI_Sendrecv per peer
// (derived datatypes stype/rtype) instead of nonblocking requests: for each
// peer the rank stages that peer's block D2H, exchanges it, and immediately
// queues an async H2D copy of the received block; the self-exchange is an
// async device-to-device copy. Unpacking matches transpose_cuda_v5.
//
// NOTE(review): lossy extraction — leading integers are original line
// numbers; `ierr` (MPI_Status in the original, presumably), peer-skip
// guards, the Sendrecv tag arguments, and brace-closing lines are missing.
// Verify against the pristine source.
// ---------------------------------------------------------------------------
3944 void transpose_cuda_v5_3(T_Plan_gpu* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany ,
int tag){
3946 std::bitset<8> Flags(flags);
3947 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
3948 MPI_Barrier(T_plan->comm);
3951 timings[0]-=MPI_Wtime();
// Plan geometry, host staging buffers, and device scratch.
3953 int nprocs_0, nprocs_1;
3954 nprocs=T_plan->nprocs;
3955 procid=T_plan->procid;
3956 nprocs_0=T_plan->nprocs_0;
3957 nprocs_1=T_plan->nprocs_1;
3958 ptrdiff_t *N=T_plan->N;
3959 double* data_cpu=T_plan->buffer;
3960 double * send_recv_cpu = T_plan->buffer_2;
3961 double * send_recv = T_plan->buffer_d;
3962 ptrdiff_t local_n0=T_plan->local_n0;
3963 ptrdiff_t local_n1=T_plan->local_n1;
3964 ptrdiff_t n_tuples=T_plan->n_tuples;
3965 int idist=N[1]*local_n0*n_tuples;
3966 int odist=N[0]*local_n1*n_tuples;
3968 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (VERBOSE>=2; guard braces lost in extraction).
3970 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
3972 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3973 for(
int h=0;h<howmany;h++)
3974 for(
int id=0;
id<nprocs;++id){
3976 for(
int i=0;i<local_n0;i++){
3977 std::cout<<std::endl;
3978 for(
int j=0;j<N[1];j++){
3979 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
3983 MPI_Barrier(T_plan->comm);
// Shuffle: device-side local transpose of each batch slab.
3989 shuffle_time-=MPI_Wtime();
3992 for(
int h=0;h<howmany;h++)
3993 local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
3995 cudaDeviceSynchronize();
3997 shuffle_time+=MPI_Wtime();
3998 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
4000 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4001 for(
int h=0;h<howmany;h++)
4002 for(
int id=0;
id<nprocs;++id){
4004 for(
int i=0;i<N[1];i++){
4005 std::cout<<std::endl;
4006 for(
int j=0;j<local_n0;j++){
4007 std::cout<<
'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
4011 MPI_Barrier(T_plan->comm);
// Single-rank (1,1) path (original presumably returns early afterwards).
4014 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
4015 for(
int h=0;h<howmany;h++)
4016 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
4019 timings[0]+=MPI_Wtime();
4020 timings[1]+=shuffle_time;
// Comm: per-peer counts/offsets and derived datatypes from the plan.
4030 int* scount_proc= T_plan->scount_proc;
4031 int* rcount_proc= T_plan->rcount_proc;
4032 int* soffset_proc= T_plan->soffset_proc;
4033 int* roffset_proc= T_plan->roffset_proc;
4035 comm_time-=MPI_Wtime();
4037 int soffset=0,roffset=0;
4040 MPI_Datatype *stype=T_plan->stype;
4041 MPI_Datatype *rtype=T_plan->rtype;
// Per-peer blocking exchange: stage D2H, MPI_Sendrecv, async H2D of the
// received block. NOTE(review): the tag arguments of MPI_Sendrecv and the
// `ierr` status declaration are on lines lost in the extraction.
4044 for (
int proc=0;proc<nprocs;++proc){
4046 soffset=soffset_proc[proc];
4047 roffset=roffset_proc[proc];
4048 for(
int h=0;h<howmany;h++)
4049 cudaMemcpy(&data_cpu[h*idist+soffset], &data[h*idist+soffset],
sizeof(
double)*scount_proc[proc], cudaMemcpyDeviceToHost);
4050 MPI_Sendrecv(&data_cpu[soffset],1, stype[proc],
4052 &send_recv_cpu[roffset],1, rtype[proc],
4054 T_plan->comm,&ierr);
4055 for(
int h=0;h<howmany;h++)
4056 cudaMemcpyAsync(&send_recv[h*odist+roffset_proc[proc]],&send_recv_cpu[h*odist+roffset_proc[proc]],
sizeof(
double)*rcount_proc[proc],cudaMemcpyHostToDevice);
// Self-exchange: async device-to-device copy, no MPI involved.
4060 for(
int h=0;h<howmany;h++)
4061 cudaMemcpyAsync(&send_recv[h*odist+roffset_proc[procid]], &data[h*idist+soffset_proc[procid]],
sizeof(
double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
// Drain all queued async copies before unpacking.
4066 cudaDeviceSynchronize();
4068 comm_time+=MPI_Wtime();
4070 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
4072 cudaMemcpy(send_recv_cpu, send_recv, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4073 for(
int h=0;h<howmany;h++)
4074 for(
int id=0;
id<nprocs;++id){
4076 for(
int i=0;i<local_n1;i++){
4077 std::cout<<std::endl;
4078 for(
int j=0;j<N[0];j++){
4079 std::cout<<
'\t'<<send_recv_cpu[odist*h+(i*N[0]+j)*n_tuples];
4083 MPI_Barrier(T_plan->comm);
// Reshuffle: same unpack scheme as transpose_cuda_v5 — batched transpose
// when all per-rank blocks are equal, two-step variant for an uneven last
// rank, plain D2D copy in the (guard line missing) nprocs==1 case.
4093 reshuffle_time-=MPI_Wtime();
4100 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
4102 last_ntuples=T_plan->last_recv_count/((int)local_n1);
4104 for(
int h=0;h<howmany;h++){
4106 cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice);
4107 else if(last_ntuples!=first_ntuples){
4108 local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
4109 local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
4111 else if(last_ntuples==first_ntuples){
4113 local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
4116 cudaDeviceSynchronize();
4119 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
4121 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4122 for(
int h=0;h<howmany;h++)
4123 for(
int id=0;
id<nprocs_1;++id){
4125 for(
int i=0;i<local_n1;i++){
4126 std::cout<<std::endl;
4127 for(
int j=0;j<N[0];j++){
4128 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
4132 MPI_Barrier(T_plan->comm);
// Optional final local transpose (guard line presumably lost in extraction).
4136 for(
int h=0;h<howmany;h++)
4137 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
4139 reshuffle_time+=MPI_Wtime();
4140 MPI_Barrier(T_plan->comm);
// Rank-0 timing report and accumulation into timings[0..3].
4142 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
4143 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
4144 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
4145 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
4147 timings[0]+=MPI_Wtime();
4148 timings[1]+=shuffle_time;
4149 timings[2]+=comm_time;
4150 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// transpose_cuda_v6
//
// NOTE(review): this region is a garbled extraction. The original file's line
// numbers (e.g. "4156") are fused into the statements, and several structural
// pieces were lost: the `if(...)` header that must precede the MPI_Alltoallw
// call at "4254" (a dangling `else if(T_plan->is_evenly_distributed==0)`
// follows it, so the lost guard is presumably `is_evenly_distributed==2`),
// the guard before the cudaMemcpy at "4307" inside the reshuffle loop (also
// followed by a dangling `else if`), the `if(procid==id)` guards / closing
// braces of the debug-print loops, and the early `return`s of the nprocs==1
// branches. Comments below describe only the visible logic; the missing
// guards must be restored from the original source file — do not compile
// this text as-is.
//
// Visible contract: globally transposes `howmany` batches of a distributed
// 2-D array of n_tuples-wide elements living in GPU memory `data`.
// Per-rank input layout: local_n0 x N[1] x n_tuples; output layout:
// local_n1 x N[0] x n_tuples. The exchange is staged through the plan's host
// buffers (buffer / buffer_2 — pinned via cudaMallocHost when small enough,
// see Mem_Mgr_gpu in this file) using one collective:
// MPI_Alltoallw / MPI_Alltoallv / MPI_Alltoall depending on how evenly the
// rows are distributed. Timings accumulate into
// timings[0..3] = {total, shuffle(local transpose), alltoall, reshuffle}.
4156 void transpose_cuda_v6(T_Plan_gpu* T_plan,
double * data,      // device pointer: in/out array being transposed
double *timings,    // accumulated: [0]=total [1]=shuffle [2]=comm [3]=reshuffle
unsigned flags,     // option bits; bits 0 and 1 are tested below
int howmany,        // number of independent batches (strided by idist/odist)
int tag ){          // unused in the visible body
4158 std::bitset<8> Flags(flags);
// Degenerate case: single process, only the "second" transpose requested.
// NOTE(review): the body/early-return of this branch appears truncated.
4159 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
4160 MPI_Barrier(T_plan->comm);
// Start total-time measurement (closed by timings[0]+=MPI_Wtime() at the end).
4163 timings[0]-=MPI_Wtime();
// Cache plan fields in locals. `nprocs`/`procid` are file-scope globals
// (PCOUT at the top of the file prints only on procid==0).
4165 int nprocs_0, nprocs_1;
4166 nprocs=T_plan->nprocs;
4167 procid=T_plan->procid;
4168 nprocs_0=T_plan->nprocs_0;
4169 nprocs_1=T_plan->nprocs_1;
4170 ptrdiff_t *N=T_plan->N;
4171 double* data_cpu=T_plan->buffer;           // host staging buffer for D2H copies
4172 double * send_recv_cpu = T_plan->buffer_2; // host receive buffer for the alltoall
4173 double * send_recv = T_plan->buffer_d;     // device scratch buffer
4174 ptrdiff_t local_n0=T_plan->local_n0;
4175 ptrdiff_t local_n1=T_plan->local_n1;
4176 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides in doubles: idist = input batch, odist = output batch.
4177 int idist=N[1]*local_n0*n_tuples;
4178 int odist=N[0]*local_n1*n_tuples;
// NOTE(review): "0*MPI_Wtime()" is an obfuscated 0.0; `total_time` is never
// used in the visible body.
4180 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the INPUT. NOTE(review): only the PCOUT is visibly guarded by
// VERBOSE>=2; the copy/print loops below presumably sit inside the same guard
// plus an `if(procid==id)` — braces lost in extraction. "id<1+0*nprocs"
// restricts this particular dump to id==0 only.
4182 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
4184 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4185 for(
int h=0;h<howmany;h++)
4186 for(
int id=0;
id<1+0*nprocs;++id){
4188 for(
int i=0;i<local_n0;i++){
4189 std::cout<<std::endl;
4190 for(
int j=0;j<N[1];j++){
4191 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
4195 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): per-batch local transpose on the GPU,
// local_n0 x N[1] -> N[1] x local_n0, done in place per batch.
4201 shuffle_time-=MPI_Wtime();
4204 for(
int h=0;h<howmany;h++)
4205 local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
// Kernel launches above are async; synchronize so shuffle_time is meaningful.
4207 cudaDeviceSynchronize();
4209 shuffle_time+=MPI_Wtime();
// Debug dump after the local transpose (same guarding caveat as above).
4210 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
4212 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4213 for(
int h=0;h<howmany;h++)
4214 for(
int id=0;
id<nprocs;++id){
4216 for(
int i=0;i<N[1];i++){
4217 std::cout<<std::endl;
4218 for(
int j=0;j<local_n0;j++){
4219 std::cout<<
'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
4223 MPI_Barrier(T_plan->comm);
// Single-process shortcut: no communication needed; optionally do the second
// local transpose, record timings and (presumably) return early.
// NOTE(review): the `return` / closing braces are lost in extraction.
4226 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
4227 for(
int h=0;h<howmany;h++)
4228 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
4231 timings[0]+=MPI_Wtime();
4232 timings[1]+=shuffle_time;
// Phase 2 (communication): counts/offsets and derived datatypes precomputed
// in the plan.
4242 int* scount_proc= T_plan->scount_proc;
4243 int* rcount_proc= T_plan->rcount_proc;
4244 int* soffset_proc= T_plan->soffset_proc;
4245 int* roffset_proc= T_plan->roffset_proc;
4247 MPI_Datatype *stype=T_plan->stype;
4248 MPI_Datatype *rtype=T_plan->rtype;
4249 MPI_Barrier(T_plan->comm);
4250 comm_time-=MPI_Wtime();
// Stage the device data to the host, exchange, then copy back to the device
// scratch buffer. NOTE(review): the guard before MPI_Alltoallw (presumably
// `if(T_plan->is_evenly_distributed==2)`) and the `else` + trailing
// `T_plan->comm` arguments of the Alltoallv/Alltoall calls are lost.
4252 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4254 MPI_Alltoallw(data_cpu,T_plan->scount_proc_w,
4255 T_plan->soffset_proc_w, stype,
4256 send_recv_cpu,T_plan->rcount_proc_w, T_plan->roffset_proc_w,
4257 rtype, T_plan->comm);
4259 else if(T_plan->is_evenly_distributed==0)
4260 MPI_Alltoallv(data_cpu,scount_proc,
4261 soffset_proc, MPI_DOUBLE,send_recv_cpu,
4262 rcount_proc,roffset_proc, MPI_DOUBLE,
4265 MPI_Alltoall(data_cpu, scount_proc[0], MPI_DOUBLE,
4266 send_recv_cpu, rcount_proc[0], MPI_DOUBLE,
4269 cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
4270 comm_time+=MPI_Wtime();
// Debug dump of the received blocks (same guarding caveat as above).
4272 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
4274 for(
int h=0;h<howmany;h++)
4275 for(
int id=0;
id<nprocs;++id){
4277 for(
int i=0;i<local_n1;i++){
4278 std::cout<<std::endl;
4279 for(
int j=0;j<N[0];j++){
4280 std::cout<<
'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];
4284 MPI_Barrier(T_plan->comm);
// Phase 3 (reshuffle): interleave the per-sender blocks into row order.
// first_ntuples = tuples per full block (rank 0's local_n0); last_ntuples =
// tuples in the final, possibly shorter block (from last_recv_count).
4294 reshuffle_time-=MPI_Wtime();
4301 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
4303 last_ntuples=T_plan->last_recv_count/((int)local_n1);
4305 for(
int h=0;h<howmany;h++){
// NOTE(review): the guard before this cudaMemcpy (the trivial-copy case) is
// lost — a dangling `else if` follows it.
4307 cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice);
4308 else if(last_ntuples!=first_ntuples){
// Uneven last block: transpose the nprocs_0-1 full blocks in place, then a
// 2-row transpose that appends the short block into the output.
4309 local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
4310 local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
4312 else if(last_ntuples==first_ntuples){
// Even blocks: a single out-of-place block transpose into `data`.
4314 local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
4318 cudaDeviceSynchronize();
// Debug dump of the reshuffled result (same guarding caveat as above).
4321 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
4323 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4324 for(
int h=0;h<howmany;h++)
4325 for(
int id=0;
id<nprocs_1;++id){
4327 for(
int i=0;i<local_n1;i++){
4328 std::cout<<std::endl;
4329 for(
int j=0;j<N[0];j++){
4330 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
4334 MPI_Barrier(T_plan->comm);
// Final per-batch local transpose, local_n1 x N[0] layout.
// NOTE(review): presumably guarded by a Flags test lost in extraction.
4338 for(
int h=0;h<howmany;h++)
4339 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
4341 reshuffle_time+=MPI_Wtime();
4343 MPI_Barrier(T_plan->comm);
// Timing report (rank 0 only, via PCOUT); presumably guarded by VERBOSE.
4345 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
4346 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
4347 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
4348 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate (not overwrite) timings so callers can average over repetitions.
4350 timings[0]+=MPI_Wtime();
4351 timings[1]+=shuffle_time;
4352 timings[2]+=comm_time;
4353 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// transpose_cuda_v7
//
// NOTE(review): garbled extraction — original line numbers are fused into the
// statements and structural pieces are missing (the `else` before the
// <double,false> exchange at "4463", `if(procid==id)` guards / braces of the
// debug loops, early `return`s, the guard before the reshuffle cudaMemcpy at
// "4505"). Comments describe the visible logic only; restore the lost guards
// from the original source before compiling.
//
// Visible contract: same global transpose as the other variants — `howmany`
// batches, per-rank input local_n0 x N[1] x n_tuples in GPU memory `data`,
// output local_n1 x N[0] x n_tuples — but the all-to-all is done with the
// k-way hypercube-style exchange par::Mpi_Alltoallv_dense (host-staged:
// D2H copy, exchange on host buffers, H2D copy). `kway` selects the radix;
// T_plan->kway_async picks the async (true) or sync (false) template variant.
// Timings accumulate into timings[0..3] = {total, shuffle, comm, reshuffle}.
4356 void transpose_cuda_v7(T_Plan_gpu* T_plan,
double * data,      // device pointer: in/out array being transposed
double *timings,    // accumulated: [0]=total [1]=shuffle [2]=comm [3]=reshuffle
int kway,           // radix of the k-way all-to-all exchange
unsigned flags,     // option bits; bits 0 and 1 are tested below
int howmany,        // number of independent batches (strided by idist/odist)
int tag ){          // unused in the visible body
4358 std::bitset<8> Flags(flags);
// Degenerate case: single process, only the "second" transpose requested.
// NOTE(review): the body/early-return of this branch appears truncated.
4359 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
4360 MPI_Barrier(T_plan->comm);
4363 timings[0]-=MPI_Wtime();
// Cache plan fields in locals (`nprocs`/`procid` are file-scope globals).
4365 int nprocs_0, nprocs_1;
4366 nprocs=T_plan->nprocs;
4367 procid=T_plan->procid;
4368 nprocs_0=T_plan->nprocs_0;
4369 nprocs_1=T_plan->nprocs_1;
4370 ptrdiff_t *N=T_plan->N;
4371 double* data_cpu=T_plan->buffer;           // host staging buffer (send side)
4372 double * send_recv_cpu = T_plan->buffer_2; // host receive buffer
4373 double * send_recv = T_plan->buffer_d;     // device scratch buffer
4374 ptrdiff_t local_n0=T_plan->local_n0;
4375 ptrdiff_t local_n1=T_plan->local_n1;
4376 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides in doubles: idist = input batch, odist = output batch.
4377 int idist=N[1]*local_n0*n_tuples;
4378 int odist=N[0]*local_n1*n_tuples;
// NOTE(review): "0*MPI_Wtime()" is an obfuscated 0.0; total_time is unused.
4380 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the INPUT (guarding caveat: only PCOUT is visibly under
// VERBOSE>=2; the loops presumably share the guard plus `if(procid==id)`).
4382 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
4384 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4385 for(
int h=0;h<howmany;h++)
4386 for(
int id=0;
id<nprocs;++id){
4388 for(
int i=0;i<local_n0;i++){
4389 std::cout<<std::endl;
4390 for(
int j=0;j<N[1];j++){
4391 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
4395 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): per-batch local transpose on the GPU.
4401 shuffle_time-=MPI_Wtime();
4404 for(
int h=0;h<howmany;h++)
4405 local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
// Synchronize so the shuffle timing covers the async kernel launches.
4407 cudaDeviceSynchronize();
4409 shuffle_time+=MPI_Wtime();
// Debug dump after the local transpose (same guarding caveat).
4410 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
4412 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4413 for(
int h=0;h<howmany;h++)
4414 for(
int id=0;
id<nprocs;++id){
4416 for(
int i=0;i<N[1];i++){
4417 std::cout<<std::endl;
4418 for(
int j=0;j<local_n0;j++){
4419 std::cout<<
'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
4423 MPI_Barrier(T_plan->comm);
// Single-process shortcut (no communication); early `return` presumably lost.
4426 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
4427 for(
int h=0;h<howmany;h++)
4428 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
4431 timings[0]+=MPI_Wtime();
4432 timings[1]+=shuffle_time;
// Phase 2 (communication): counts/offsets precomputed in the plan.
4442 int* scount_proc= T_plan->scount_proc;
4443 int* rcount_proc= T_plan->rcount_proc;
4444 int* soffset_proc= T_plan->soffset_proc;
4445 int* roffset_proc= T_plan->roffset_proc;
4447 comm_time-=MPI_Wtime();
// Host-staged k-way exchange: device -> host, exchange, host -> device.
// NOTE(review): the `else` between the async and sync variants is lost.
4449 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4459 if(T_plan->kway_async)
4460 par::Mpi_Alltoallv_dense<double,true>(data_cpu , scount_proc, soffset_proc,
4461 send_recv_cpu, rcount_proc, roffset_proc, T_plan->comm,kway);
4463 par::Mpi_Alltoallv_dense<double,false>(data_cpu , scount_proc, soffset_proc,
4464 send_recv_cpu, rcount_proc, roffset_proc, T_plan->comm,kway);
4467 cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
4468 comm_time+=MPI_Wtime();
// Debug dump of the received blocks (same guarding caveat).
4470 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
4472 for(
int h=0;h<howmany;h++)
4473 for(
int id=0;
id<nprocs;++id){
4475 for(
int i=0;i<local_n1;i++){
4476 std::cout<<std::endl;
4477 for(
int j=0;j<N[0];j++){
4478 std::cout<<
'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];
4482 MPI_Barrier(T_plan->comm);
// Phase 3 (reshuffle): interleave per-sender blocks. first_ntuples = tuples
// per full block (rank 0's local_n0); last_ntuples = tuples in the final,
// possibly shorter block.
4492 reshuffle_time-=MPI_Wtime();
4499 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
4501 last_ntuples=T_plan->last_recv_count/((int)local_n1);
4503 for(
int h=0;h<howmany;h++){
// NOTE(review): the guard before this trivial-copy cudaMemcpy is lost — a
// dangling `else if` follows it.
4505 cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice);
4506 else if(last_ntuples!=first_ntuples){
// Uneven last block: in-place transpose of the full blocks, then append the
// short block via a 2-row out-of-place transpose.
4507 local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
4508 local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
4510 else if(last_ntuples==first_ntuples){
// Even blocks: single out-of-place block transpose into `data`.
4512 local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
4515 cudaDeviceSynchronize();
// Final per-batch local transpose (NOTE(review): presumably Flags-guarded in
// the original; the guard is lost here).
4518 for(
int h=0;h<howmany;h++)
4519 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
4521 reshuffle_time+=MPI_Wtime();
// Debug dump of the final result (same guarding caveat).
4523 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
4525 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4526 for(
int h=0;h<howmany;h++)
4527 for(
int id=0;
id<nprocs_1;++id){
4529 for(
int i=0;i<local_n1;i++){
4530 std::cout<<std::endl;
4531 for(
int j=0;j<N[0];j++){
4532 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
4536 MPI_Barrier(T_plan->comm);
4539 MPI_Barrier(T_plan->comm);
// Timing report on rank 0 (PCOUT); presumably VERBOSE-guarded originally.
4541 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
4542 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
4543 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
4544 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate timings so callers can average over repeated calls.
4546 timings[0]+=MPI_Wtime();
4547 timings[1]+=shuffle_time;
4548 timings[2]+=comm_time;
4549 timings[3]+=reshuffle_time;
// ---------------------------------------------------------------------------
// transpose_cuda_v7_2
//
// NOTE(review): garbled extraction — original line numbers fused into the
// statements; missing pieces include the `else` before the <double,false>
// exchange at "4659", the guard before the reshuffle cudaMemcpy at "4701",
// `if(procid==id)` guards / braces of the debug loops, early `return`s, and
// the function's closing brace (past the visible chunk). Restore from the
// original source before compiling.
//
// Visible contract: same global transpose as transpose_cuda_v7, but the k-way
// exchange operates directly on DEVICE pointers via
// par::Mpi_Alltoallv_dense_gpu(data, ..., send_recv, ...): no host staging
// copies surround the communication phase. Per-rank input layout
// local_n0 x N[1] x n_tuples in `data`; output local_n1 x N[0] x n_tuples.
// Timings accumulate into timings[0..3] = {total, shuffle, comm, reshuffle}.
4552 void transpose_cuda_v7_2(T_Plan_gpu* T_plan,
double * data,      // device pointer: in/out array being transposed
double *timings,    // accumulated: [0]=total [1]=shuffle [2]=comm [3]=reshuffle
int kway,           // radix of the k-way all-to-all exchange
unsigned flags,     // option bits; bits 0 and 1 are tested below
int howmany,        // number of independent batches (strided by idist/odist)
int tag ){          // unused in the visible body
4554 std::bitset<8> Flags(flags);
// Degenerate case: single process, only the "second" transpose requested.
// NOTE(review): the body/early-return of this branch appears truncated.
4555 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
4556 MPI_Barrier(T_plan->comm);
4559 timings[0]-=MPI_Wtime();
// Cache plan fields in locals (`nprocs`/`procid` are file-scope globals).
4561 int nprocs_0, nprocs_1;
4562 nprocs=T_plan->nprocs;
4563 procid=T_plan->procid;
4564 nprocs_0=T_plan->nprocs_0;
4565 nprocs_1=T_plan->nprocs_1;
4566 ptrdiff_t *N=T_plan->N;
4567 double* data_cpu=T_plan->buffer;           // host buffer, used only for debug dumps here
4568 double * send_recv_cpu = T_plan->buffer_2; // host buffer; see NOTE at the dump below
4569 double * send_recv = T_plan->buffer_d;     // device scratch buffer (receive side)
4570 ptrdiff_t local_n0=T_plan->local_n0;
4571 ptrdiff_t local_n1=T_plan->local_n1;
4572 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides in doubles: idist = input batch, odist = output batch.
4573 int idist=N[1]*local_n0*n_tuples;
4574 int odist=N[0]*local_n1*n_tuples;
// NOTE(review): "0*MPI_Wtime()" is an obfuscated 0.0; total_time is unused.
4576 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the INPUT (guarding caveat: only PCOUT is visibly under
// VERBOSE>=2; the loops presumably share the guard plus `if(procid==id)`).
4578 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
4580 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4581 for(
int h=0;h<howmany;h++)
4582 for(
int id=0;
id<nprocs;++id){
4584 for(
int i=0;i<local_n0;i++){
4585 std::cout<<std::endl;
4586 for(
int j=0;j<N[1];j++){
4587 std::cout<<
'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
4591 MPI_Barrier(T_plan->comm);
// Phase 1 (shuffle): per-batch local transpose on the GPU.
4597 shuffle_time-=MPI_Wtime();
4600 for(
int h=0;h<howmany;h++)
4601 local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
// Synchronize so the shuffle timing covers the async kernel launches.
4603 cudaDeviceSynchronize();
4605 shuffle_time+=MPI_Wtime();
// Debug dump after the local transpose (same guarding caveat).
4606 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
4608 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4609 for(
int h=0;h<howmany;h++)
4610 for(
int id=0;
id<nprocs;++id){
4612 for(
int i=0;i<N[1];i++){
4613 std::cout<<std::endl;
4614 for(
int j=0;j<local_n0;j++){
4615 std::cout<<
'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
4619 MPI_Barrier(T_plan->comm);
// Single-process shortcut (no communication); early `return` presumably lost.
4622 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
4623 for(
int h=0;h<howmany;h++)
4624 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
4627 timings[0]+=MPI_Wtime();
4628 timings[1]+=shuffle_time;
// Phase 2 (communication): counts/offsets precomputed in the plan.
4638 int* scount_proc= T_plan->scount_proc;
4639 int* rcount_proc= T_plan->rcount_proc;
4640 int* soffset_proc= T_plan->soffset_proc;
4641 int* roffset_proc= T_plan->roffset_proc;
4643 comm_time-=MPI_Wtime();
// GPU-direct k-way exchange on device pointers — no D2H/H2D staging copies
// here, unlike transpose_cuda_v7. NOTE(review): the `else` between the async
// and sync variants is lost in extraction.
4655 if(T_plan->kway_async)
4656 par::Mpi_Alltoallv_dense_gpu<double,true>(data , scount_proc, soffset_proc,
4657 send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
4659 par::Mpi_Alltoallv_dense_gpu<double,false>(data , scount_proc, soffset_proc,
4660 send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
4664 comm_time+=MPI_Wtime();
// Debug dump of the received blocks.
// NOTE(review): this prints send_recv_cpu, but nothing in this GPU-direct
// path ever writes send_recv_cpu — a device->host copy of `send_recv` seems
// to be missing here (or was lost in extraction); as visible, the dump would
// print stale host-buffer contents.
4666 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
4668 for(
int h=0;h<howmany;h++)
4669 for(
int id=0;
id<nprocs;++id){
4671 for(
int i=0;i<local_n1;i++){
4672 std::cout<<std::endl;
4673 for(
int j=0;j<N[0];j++){
4674 std::cout<<
'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];
4678 MPI_Barrier(T_plan->comm);
// Phase 3 (reshuffle): interleave per-sender blocks. first_ntuples = tuples
// per full block (rank 0's local_n0); last_ntuples = tuples in the final,
// possibly shorter block.
4688 reshuffle_time-=MPI_Wtime();
4695 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
4697 last_ntuples=T_plan->last_recv_count/((int)local_n1);
4699 for(
int h=0;h<howmany;h++){
// NOTE(review): the guard before this trivial-copy cudaMemcpy is lost — a
// dangling `else if` follows it.
4701 cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice);
4702 else if(last_ntuples!=first_ntuples){
// Uneven last block: in-place transpose of the full blocks, then append the
// short block via a 2-row out-of-place transpose.
4703 local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
4704 local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
4706 else if(last_ntuples==first_ntuples){
// Even blocks: single out-of-place block transpose into `data`.
4708 local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
4711 cudaDeviceSynchronize();
// Final per-batch local transpose (NOTE(review): presumably Flags-guarded in
// the original; the guard is lost here).
4714 for(
int h=0;h<howmany;h++)
4715 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
4717 reshuffle_time+=MPI_Wtime();
// Debug dump of the final result (same guarding caveat).
4719 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
4721 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4722 for(
int h=0;h<howmany;h++)
4723 for(
int id=0;
id<nprocs_1;++id){
4725 for(
int i=0;i<local_n1;i++){
4726 std::cout<<std::endl;
4727 for(
int j=0;j<N[0];j++){
4728 std::cout<<
'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
4732 MPI_Barrier(T_plan->comm);
4735 MPI_Barrier(T_plan->comm);
// Timing report on rank 0 (PCOUT); presumably VERBOSE-guarded originally.
4737 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
4738 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
4739 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
4740 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate timings so callers can average over repeated calls.
4742 timings[0]+=MPI_Wtime();
4743 timings[1]+=shuffle_time;
4744 timings[2]+=comm_time;
4745 timings[3]+=reshuffle_time;