AccFFT
transpose_gpu.cpp
1 /*
2  * Copyright (c) 2014-2015, Amir Gholami, George Biros
3  * All rights reserved.
4  * This file is part of AccFFT library.
5  *
6  * AccFFT is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * AccFFT is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with AccFFT. If not, see <http://www.gnu.org/licenses/>.
18  *
19 */
20 
21 #include <mpi.h>
22 #include <iostream>
23 #include <transpose_cuda.h>
24 
25 #include <stdio.h>
26 #include <bitset>
27 #include <cuda.h>
28 #include <cuda_runtime_api.h>
29 #include <assert.h>
30 #define PCOUT if(procid==0) std::cout
31 #include "parUtils.h"
32 #include <math.h>
33 #include <cmath>
34 #include <stdlib.h>
35 #include <functional>
36 #include <cmath>
37 #define VERBOSE 0
38 
39 
// Integer base-2 logarithm: floor(log2(a)).
// The previous implementation returned (int)(log(a)/log(2)); floating-point
// rounding can make that expression evaluate to e.g. 28.999... for a == 2^29,
// which truncates to 28. This value sizes the timing arrays in the
// which_method tuners, so an off-by-one here under/over-sizes allocations.
// The bit-shift loop below is exact. Non-positive inputs (undefined for a
// logarithm) return 0 instead of invoking UB via log() of a non-positive.
int log2(int a){
  if(a<1) return 0;
  int r=0;
  while(a>>=1) ++r;
  return r;
}
// True iff x is a power of two (1, 2, 4, ...).
// BUG FIX: the bare bit trick (x & (x-1)) == 0 also returns true for x == 0,
// which is not a power of two; guard the zero case explicitly. Callers in
// this file pass nprocs (>= 1), so this change is behavior-compatible there.
static bool IsPowerOfTwo(ulong x)
{
  return x != 0 && (x & (x - 1)) == 0;
}
47 
48 #ifdef ENABLE_GPU
49 Mem_Mgr_gpu::Mem_Mgr_gpu(int N0, int N1,int tuples, MPI_Comm Comm, int howmany, int specified_alloc_local){
50 
51  N[0]=N0;
52  N[1]=N1;
53  n_tuples=tuples;
54  int procid, nprocs;
55  MPI_Comm_rank(Comm, &procid);
56  MPI_Comm_size(Comm,&nprocs);
57 
58 
59  // Determine local_n0/n1 of each processor
60  if(specified_alloc_local==0){
61  {
62  ptrdiff_t * local_n0_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
63  ptrdiff_t * local_n1_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
64 #pragma omp parallel for
65  for (int proc=0;proc<nprocs;++proc){
66  local_n0_proc[proc]=ceil(N[0]/(double)nprocs);
67  local_n1_proc[proc]=ceil(N[1]/(double)nprocs);
68 
69 
70  if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
71  if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}
72 
73 
74  }
75 
76  local_n0=local_n0_proc[procid];
77  local_n1=local_n1_proc[procid];
78  free(local_n0_proc);
79  free(local_n1_proc);
80  }
81 
82 
83  // Determine alloc local based on maximum size of input and output distribution
84  alloc_local=local_n0*N[1]*n_tuples*sizeof(double);
85  if(alloc_local<local_n1*N[0]*n_tuples*sizeof(double))
86  alloc_local=local_n1*N[0]*n_tuples*sizeof(double);
87 
88  alloc_local*=howmany;
89  }
90  else{
91  alloc_local=specified_alloc_local;
92  }
93  if( alloc_local<=1.05*std::pow(2,30) )
94  PINNED=1;
95  else
96  PINNED=0;
97 #ifdef ENABLE_GPU
98  // PCOUT<<"ENABLE GPU"<<std::endl;
99  if(PINNED==1){
100  cudaError_t cuda_err1, cuda_err2;
101  double pinned_time=-MPI_Wtime();
102  buffer=NULL; buffer_2=NULL;
103  cuda_err1=cudaMallocHost((void**)&buffer,alloc_local);
104  cuda_err2=cudaMallocHost((void**)&buffer_2,alloc_local);
105  if(cuda_err1!=cudaSuccess || cuda_err2!=cudaSuccess){
106  std::cout<<"!!!!!!!!!! Failed to cudaMallocHost in MemMgr"<<std::endl;
107  }
108  pinned_time+=MPI_Wtime();
109  // PCOUT<<"PINNED Alloc time= "<<pinned_time<<std::endl;
110  }
111  else{
112  posix_memalign((void **)&buffer_2,64, alloc_local);
113  posix_memalign((void **)&buffer,64, alloc_local);
114  }
115  cudaMalloc((void **)&buffer_d, alloc_local);
116 #else
117  posix_memalign((void **)&buffer,64, alloc_local);
118  posix_memalign((void **)&buffer_2,64, alloc_local);
119 #endif
120  memset( buffer,0, alloc_local );
121  memset( buffer_2,0, alloc_local );
122 
123 }
124 Mem_Mgr_gpu::~Mem_Mgr_gpu(){
125 
126 #ifdef ENABLE_GPU
127  cudaError_t cuda_err1=cudaSuccess, cuda_err2=cudaSuccess,cuda_err3=cudaSuccess;
128  if(PINNED==1){
129  if(buffer!=NULL) cuda_err1=cudaFreeHost(buffer);
130  if(buffer!=NULL) cuda_err2=cudaFreeHost(buffer_2);
131  if(cuda_err1!=cudaSuccess || cuda_err2!=cudaSuccess){
132  std::cout<<"!!!!!!!!!! Failed to cudaFreeHost in MemMgr; err1= "<<cuda_err1<<" err2= "<<cuda_err2<<std::endl;
133  }
134  }
135  else{
136  free(buffer);
137  free(buffer_2);
138  }
139  cuda_err3=cudaFree(buffer_d);
140  if(cuda_err3!=cudaSuccess){
141  std::cout<<"!!!!!!!!!! Failed to cudaFree in MemMgr; err3= "<<cuda_err3<<std::endl;
142  }
143 #else
144  free(buffer);
145  free(buffer_2);
146 #endif
147 }
// Builds a transpose plan for an N0 x N1 (x tuples doubles) array over Comm:
// per-rank slab sizes and start offsets for both distributions, the
// alltoallv send/recv counts and offsets (plus the *_w byte-offset and *_f
// howmany-scaled variants), and committed MPI derived datatypes used by the
// windowed/batched transpose paths. Buffers are borrowed from Mem_mgr (not
// owned by the plan).
T_Plan_gpu::T_Plan_gpu(int N0, int N1,int tuples, Mem_Mgr_gpu * Mem_mgr, MPI_Comm Comm, int howmany){

  N[0]=N0;
  N[1]=N1;
  n_tuples=tuples;
  MPI_Comm_rank(Comm, &procid);
  MPI_Comm_size(Comm,&nprocs);

  local_n0_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
  local_n1_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
  local_0_start_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
  local_1_start_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);

  // Determine local_n0/n1 of each processor: block distribution of
  // ceil(N/nprocs) per rank, with the tail rank(s) taking the remainder
  // (clamped to zero if the rank index runs past the data).
  local_0_start_proc[0]=0;local_1_start_proc[0]=0;
  for (int proc=0;proc<nprocs;++proc){
    local_n0_proc[proc]=ceil(N[0]/(double)nprocs);
    local_n1_proc[proc]=ceil(N[1]/(double)nprocs);

    if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
    if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}

    // Prefix-sum of sizes gives each rank's global start index.
    if(proc!=0){
      local_0_start_proc[proc]=local_0_start_proc[proc-1]+local_n0_proc[proc-1];
      local_1_start_proc[proc]=local_1_start_proc[proc-1]+local_n1_proc[proc-1];
    }

  }

  local_n0=local_n0_proc[procid];
  local_n1=local_n1_proc[procid];
  local_0_start=local_0_start_proc[procid];
  local_1_start=local_1_start_proc[procid];


  alloc_local=Mem_mgr->alloc_local;
  // Determine effective processors taking part in each transpose phase
  // (ranks with a zero-sized slab are excluded).
  nprocs_0=0; nprocs_1=0;
  for (int proc=0;proc<nprocs;++proc){
    if(local_n0_proc[proc]!=0)
      nprocs_0+=1;
    if(local_n1_proc[proc]!=0)
      nprocs_1+=1;
  }



  // Set send recv counts for communication part.
  // Base arrays are in units of doubles; *_w are byte offsets with count 1
  // (used with the derived datatypes below); *_f are scaled by howmany
  // (used by the flattened/batched path).
  scount_proc=(int*) malloc(sizeof(int)*nprocs);
  rcount_proc=(int*) malloc(sizeof(int)*nprocs);
  soffset_proc=(int*) malloc(sizeof(int)*nprocs);
  roffset_proc=(int*) malloc(sizeof(int)*nprocs);

  scount_proc_f=(int*) malloc(sizeof(int)*nprocs);
  rcount_proc_f=(int*) malloc(sizeof(int)*nprocs);
  soffset_proc_f=(int*) malloc(sizeof(int)*nprocs);
  roffset_proc_f=(int*) malloc(sizeof(int)*nprocs);

  scount_proc_w=(int*) malloc(sizeof(int)*nprocs);
  rcount_proc_w=(int*) malloc(sizeof(int)*nprocs);
  soffset_proc_w=(int*) malloc(sizeof(int)*nprocs);
  roffset_proc_w=(int*) malloc(sizeof(int)*nprocs);

  //scount_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //rcount_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //soffset_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //roffset_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //memset(scount_proc_v8,0,sizeof(int)*nprocs);
  //memset(rcount_proc_v8,0,sizeof(int)*nprocs);
  //memset(soffset_proc_v8,0,sizeof(int)*nprocs);
  //memset(roffset_proc_v8,0,sizeof(int)*nprocs);

  last_recv_count=0; // Will store the n_tuples of the last received data. In general ~=n_tuples
  // Two symmetric cases depending on which phase has more active ranks;
  // counts/offsets are zeroed for ranks beyond the active set of each phase.
  if(nprocs_1>nprocs_0)
    for (int proc=0;proc<nprocs;++proc){

      scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;

      if(scount_proc[proc]!=0)
        rcount_proc[proc]=local_n1_proc[proc]*local_n0_proc[proc]*n_tuples;//scount_proc[proc];
      else
        rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;//local_n0_proc[proc]*n_tuples; //

      soffset_proc[proc]=0;
      roffset_proc[proc]=0;
      if(proc==0){
        soffset_proc[proc]=0;
        roffset_proc[proc]=0;
      }
      else{
        // Offsets are the running sums of the preceding counts.
        soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
        roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
      }

      if(proc>=nprocs_1){ // in case you have requested too many processes
        rcount_proc[proc]=0;
        scount_proc[proc]=0;
      }
      if(scount_proc[proc]==0) soffset_proc[proc]=0;//local_n0*N[1]*n_tuples-1;

      if(proc>=nprocs_0){
        rcount_proc[proc]=0;
        roffset_proc[proc]=0;
      }
      if(rcount_proc[proc]!=0)
        last_recv_count=rcount_proc[proc];
      if(local_n1_proc[proc]!=0)
        last_local_n1=local_n1_proc[proc];
    }
  else if(nprocs_1<=nprocs_0)
    for (int proc=0;proc<nprocs;++proc){

      scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;
      rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;//scount_proc[proc];



      soffset_proc[proc]=0;
      roffset_proc[proc]=0;
      if(proc==0){
        soffset_proc[proc]=0;
        roffset_proc[proc]=0;
      }
      else{
        soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
        roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
      }

      if(proc>=nprocs_0){ // in case you have requested too many processes
        rcount_proc[proc]=0;
        scount_proc[proc]=0;
        roffset_proc[proc]=0;
        soffset_proc[proc]=0;
      }

      if(scount_proc[proc]==0) soffset_proc[proc]=0;//local_n0*N[1]*n_tuples-1;

      if(proc>=nprocs_1){
        scount_proc[proc]=0;
        soffset_proc[proc]=0;
      }
      if(rcount_proc[proc]!=0)
        last_recv_count=rcount_proc[proc];

      if(local_n1_proc[proc]!=0)
        last_local_n1=local_n1_proc[proc];
    }


  is_evenly_distributed=0; // use alltoallv
  // Perfectly even distribution on all ranks allows plain alltoall.
  if((local_n0*nprocs_0-N[0])==0 && (local_n1*nprocs_1-N[1])==0 && nprocs_0==nprocs_1 && nprocs_0==nprocs){
    is_evenly_distributed=1; // use alltoall
  }

  method=5; //Default Transpose method
  kway=8; // for transpose_v7
  kway_async=true;
  stype=new MPI_Datatype[nprocs];
  rtype=new MPI_Datatype[nprocs];
  //stype_v8=new MPI_Datatype[nprocs];
  //rtype_v8_=new MPI_Datatype[nprocs];
  //rtype_v8=new MPI_Datatype[nprocs];

  for (int i=0;i<nprocs;i++){
    // One strided vector per peer: howmany blocks of scount/rcount doubles,
    // strided by one full local slab, so batched transposes move all
    // howmany slabs with a single datatype.
    MPI_Type_vector(howmany,scount_proc[i],local_n0*N[1]*n_tuples, MPI_DOUBLE, &stype[i]);
    MPI_Type_vector(howmany,rcount_proc[i],local_n1*N[0]*n_tuples, MPI_DOUBLE, &rtype[i]);

    MPI_Type_commit(&stype[i]);
    MPI_Type_commit(&rtype[i]);

    soffset_proc_w[i]=soffset_proc[i]*8; //SNAFU in bytes
    roffset_proc_w[i]=roffset_proc[i]*8;
    scount_proc_w[i]=1;
    rcount_proc_w[i]=1;

    soffset_proc_f[i]=soffset_proc[i]*howmany; //SNAFU in bytes
    roffset_proc_f[i]=roffset_proc[i]*howmany;
    scount_proc_f[i]= scount_proc[i]*howmany;
    rcount_proc_f[i]= rcount_proc[i]*howmany;


    /*
    if(local_n0!=0){
    soffset_proc_v8[i]=soffset_proc[i]/local_n0;
    scount_proc_v8[i]=scount_proc[i]/local_n0;
    }
    if(local_n1!=0){
    roffset_proc_v8[i]=roffset_proc[i]/local_n1;
    rcount_proc_v8[i]=rcount_proc[i]/local_n1;
    }

    MPI_Type_vector(local_n0,scount_proc_v8[i],N[1]*n_tuples, MPI_DOUBLE, &stype_v8[i]);
    MPI_Type_vector(local_n1,1*n_tuples,N[0]*n_tuples, MPI_DOUBLE, &rtype_v8_[i]);
    MPI_Type_hvector(rcount_proc_v8[i],1,n_tuples*sizeof(double),rtype_v8_[i], &rtype_v8[i]);

    MPI_Type_commit(&stype_v8[i]);
    MPI_Type_commit(&rtype_v8[i]);
    */

  }

  comm=Comm; // MPI Communicator
  // Buffers are owned by Mem_mgr; the plan only borrows the pointers.
  buffer=Mem_mgr->buffer;
  buffer_2=Mem_mgr->buffer_2;
  buffer_d=Mem_mgr->buffer_d;
  //data_cpu=Mem_mgr->data_cpu;

}
// Releases all per-rank bookkeeping arrays and the datatype arrays allocated
// by the constructor. The data buffers (buffer/buffer_2/buffer_d) are owned
// by the Mem_Mgr_gpu and are intentionally not freed here.
// NOTE(review): the stype/rtype elements are MPI_Type_commit'ed in the
// constructor but never MPI_Type_free'd before the arrays are deleted —
// presumably a (bounded) resource leak; confirm plan/MPI_Finalize ordering
// before adding MPI_Type_free calls here.
T_Plan_gpu::~T_Plan_gpu(){
  free(local_n0_proc);
  free(local_n1_proc);
  free(local_0_start_proc);
  free(local_1_start_proc);
  free(scount_proc);
  free(rcount_proc);
  free(soffset_proc);
  free(roffset_proc);

  free(scount_proc_w);
  free(rcount_proc_w);
  free(soffset_proc_w);
  free(roffset_proc_w);

  free(scount_proc_f);
  free(rcount_proc_f);
  free(soffset_proc_f);
  free(roffset_proc_f);

  //free(scount_proc_v8);
  //free(rcount_proc_v8);
  //free(soffset_proc_v8);
  //free(roffset_proc_v8);
  delete [] stype;
  delete [] rtype;
  //delete [] stype_v8;
  //delete [] rtype_v8;
  //delete [] rtype_v8_;
}
// Auto-tuner for the general (non-"fast") transpose paths: times each
// candidate implementation on the live data/plan (one warmup call, one timed
// call), takes the max over ranks, and writes the winning method id — plus
// kway/kway_async for the v7 family — into T_plan. data_d is device memory;
// dummy[] absorbs each call's per-phase timing output.
void T_Plan_gpu::which_method_gpu(T_Plan_gpu* T_plan,double* data_d){

  double dummy[4]={0};
  // Timing-slot layout (lg = (int)log2(nprocs)):
  //   [0] v5, [1] v6,
  //   [2 .. 2+lg)        v7   async kway sweep (only lg-4 entries timed),
  //   [2+lg .. 2+2lg)    v7   sync  kway sweep,
  //   [2+2lg .. 2+3lg)   v7_2 async kway sweep,
  //   [2+3lg .. 2+4lg)   v7_2 sync  kway sweep,
  //   [4lg+2] v5_2, [4lg+3] v5_3.
  // Untimed slots keep the 1000-second sentinel so they never win the argmin.
  double * time= (double*) malloc(sizeof(double)*(4*(int)log2(nprocs)+4));
  double * g_time= (double*) malloc(sizeof(double)*(4*(int)log2(nprocs)+4));
  for (int i=0;i<4*(int)log2(nprocs)+4;i++)
    time[i]=1000;

  transpose_cuda_v5(T_plan,(double*)data_d,dummy); // Warmup
  time[0]=-MPI_Wtime();
  transpose_cuda_v5(T_plan,(double*)data_d,dummy);
  time[0]+=MPI_Wtime();

  transpose_cuda_v6(T_plan,(double*)data_d,dummy); // Warmup
  time[1]=-MPI_Wtime();
  transpose_cuda_v6(T_plan,(double*)data_d,dummy);
  time[1]+=MPI_Wtime();

  // The v7 k-way variants are only tried on large power-of-two process
  // counts; kway is swept over nprocs, nprocs/2, ..., down to 16 ranks.
  if(IsPowerOfTwo(nprocs) && nprocs>511){
    kway_async=true;
#ifndef TORUS_TOPOL
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      transpose_cuda_v7(T_plan,(double*)data_d,dummy,kway); // Warmup
      time[2+i]=-MPI_Wtime();
      transpose_cuda_v7(T_plan,(double*)data_d,dummy,kway);
      time[2+i]+=MPI_Wtime();
    }
#endif

    kway_async=false;
#ifdef TORUS_TOPOL
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      transpose_cuda_v7(T_plan,(double*)data_d,dummy,kway); // Warmup
      time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
      transpose_cuda_v7(T_plan,(double*)data_d,dummy,kway);
      time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
    }
#endif

#ifndef TORUS_TOPOL
    kway_async=true;
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      transpose_cuda_v7_2(T_plan,(double*)data_d,dummy,kway); // Warmup
      time[2+2*(int)log2(nprocs)+i]=-MPI_Wtime();
      transpose_cuda_v7_2(T_plan,(double*)data_d,dummy,kway);
      time[2+2*(int)log2(nprocs)+i]+=MPI_Wtime();
    }
#endif

#ifdef TORUS_TOPOL
    kway_async=false;
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      transpose_cuda_v7_2(T_plan,(double*)data_d,dummy,kway); // Warmup
      time[2+3*(int)log2(nprocs)+i]=-MPI_Wtime();
      transpose_cuda_v7_2(T_plan,(double*)data_d,dummy,kway);
      time[2+3*(int)log2(nprocs)+i]+=MPI_Wtime();
    }
#endif
  }

  transpose_cuda_v5_2(T_plan,(double*)data_d,dummy); // Warmup
  time[4*(int)log2(nprocs)+2]=-MPI_Wtime();
  transpose_cuda_v5_2(T_plan,(double*)data_d,dummy);
  time[4*(int)log2(nprocs)+2]+=MPI_Wtime();

  transpose_cuda_v5_3(T_plan,(double*)data_d,dummy); // Warmup
  time[4*(int)log2(nprocs)+3]=-MPI_Wtime();
  transpose_cuda_v5_3(T_plan,(double*)data_d,dummy);
  time[4*(int)log2(nprocs)+3]+=MPI_Wtime();

  // Use the slowest rank's time for each candidate so the choice is
  // consistent across the whole communicator.
  MPI_Allreduce(time,g_time,(4*(int)log2(nprocs)+4),MPI_DOUBLE,MPI_MAX, T_plan->comm);

  if(VERBOSE>=1)
    if(T_plan->procid==0){
      for(int i=0;i<4*(int)log2(nprocs)+4;++i)
        std::cout<<" time["<<i<<"]= "<<g_time[i]<<" , ";
      std::cout<<'\n';
    }

  double smallest=1000;
  for (int i=0;i<4*(int)log2(nprocs)+4;i++)
    smallest=std::min(smallest,g_time[i]);

  // Map the winning slot back to a method id (see slot layout above).
  // The sweep loops scan all lg slots per family; untimed tail slots still
  // hold the sentinel and cannot match the minimum.
  if(g_time[0]==smallest){
    T_plan->method=5;
  }
  else if(g_time[1]==smallest){
    T_plan->method=6;
  }
  else if(g_time[4*(int)log2(nprocs)+2]==smallest){
    T_plan->method=52;
  }
  else if(g_time[4*(int)log2(nprocs)+3]==smallest){
    T_plan->method=53;
  }
  else{
    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+i]==smallest){
        T_plan->method=7;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=true;
        break;
      }
    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+(int)log2(nprocs)+i]==smallest){
        T_plan->method=7;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=false;
        break;
      }

    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+2*(int)log2(nprocs)+i]==smallest){
        T_plan->method=7;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=true;
        break;
      }

    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+3*(int)log2(nprocs)+i]==smallest){
        T_plan->method=7;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=false;
        break;
      }
  }

  if(VERBOSE>=1){
    PCOUT<<"smallest= "<<smallest<<std::endl;
    // NOTE(review): this prints this->method, but the selection above wrote
    // T_plan->method — presumably this==T_plan at every call site; confirm.
    PCOUT<<"Using transpose v"<<method<<" kway= "<<T_plan->kway<<" kway_async="<<T_plan->kway_async<<std::endl;
  }
  free(time);
  free(g_time);
  MPI_Barrier(T_plan->comm);

  return;
}
534 void T_Plan_gpu::execute_gpu(T_Plan_gpu* T_plan,double* data_d,double *timings, unsigned flags, int howmany, int tag){
535 
536 
537  if(howmany==1){
538  if(method==1)
539  fast_transpose_cuda_v1(T_plan,(double*)data_d,timings,flags,howmany, tag);
540  if(method==12)
541  fast_transpose_cuda_v1_2(T_plan,(double*)data_d,timings,flags,howmany, tag);
542  if(method==13)
543  fast_transpose_cuda_v1_3(T_plan,(double*)data_d,timings,flags,howmany, tag);
544  if(method==2)
545  fast_transpose_cuda_v2(T_plan,(double*)data_d,timings,flags,howmany, tag);
546  if(method==3)
547  fast_transpose_cuda_v3(T_plan,(double*)data_d,timings,kway,flags,howmany, tag);
548  if(method==32)
549  fast_transpose_cuda_v3_2(T_plan,(double*)data_d,timings,kway,flags,howmany, tag);
550  }
551  else
552  {
553  if(method==1)
554  fast_transpose_cuda_v1_h(T_plan,(double*)data_d,timings,flags,howmany, tag);
555  if(method==12)
556  fast_transpose_cuda_v1_2_h(T_plan,(double*)data_d,timings,flags,howmany, tag);
557  if(method==13)
558  fast_transpose_cuda_v1_3_h(T_plan,(double*)data_d,timings,flags,howmany, tag);
559  if(method==2)
560  fast_transpose_cuda_v2_h(T_plan,(double*)data_d,timings,flags,howmany, tag);
561  if(method==3 || method==32)
562  fast_transpose_cuda_v3_h(T_plan,(double*)data_d,timings,kway,flags,howmany, tag);
563  }
564  if(method==5)
565  transpose_cuda_v5(T_plan,(double*)data_d,timings,flags,howmany, tag);
566  if(method==52)
567  transpose_cuda_v5_2(T_plan,(double*)data_d,timings,flags,howmany, tag);
568  if(method==53)
569  transpose_cuda_v5_3(T_plan,(double*)data_d,timings,flags,howmany, tag);
570  if(method==6)
571  transpose_cuda_v6(T_plan,(double*)data_d,timings,flags,howmany, tag);
572  if(method==7)
573  transpose_cuda_v7(T_plan,(double*)data_d,timings,kway,flags,howmany, tag);
574  if(method==72)
575  transpose_cuda_v7_2(T_plan,(double*)data_d,timings,kway,flags,howmany, tag);
576 
577 
578  return;
579 }
580 
// Auto-tuner for the "fast" transpose paths (v1/v2/v3 families): mirrors
// which_method_gpu, but times the fast_transpose_cuda_* variants (all called
// with flags=2) and maps the winner to method ids 1/2/12/13/3/32. Results —
// method, and kway/kway_async for the v3 family — are written into T_plan.
void T_Plan_gpu::which_fast_method_gpu(T_Plan_gpu* T_plan,double* data_d){

  double dummy[4]={0};
  // Timing-slot layout (lg = (int)log2(nprocs)):
  //   [0] v1, [1] v2,
  //   [2 .. 2+lg)        v3   async kway sweep (only lg-4 entries timed),
  //   [2+lg .. 2+2lg)    v3   sync  kway sweep,
  //   [2+2lg .. 2+3lg)   v3_2 async kway sweep,
  //   [2+3lg .. 2+4lg)   v3_2 sync  kway sweep,
  //   [4lg+2] v1_2, [4lg+3] v1_3.
  // Untimed slots keep the 1000-second sentinel so they never win the argmin.
  double * time= (double*) malloc(sizeof(double)*(4*(int)log2(nprocs)+4));
  double * g_time= (double*) malloc(sizeof(double)*(4*(int)log2(nprocs)+4));
  for (int i=0;i<4*(int)log2(nprocs)+4;i++)
    time[i]=1000;

  fast_transpose_cuda_v1(T_plan,(double*)data_d,dummy,2); // Warmup
  time[0]=-MPI_Wtime();
  fast_transpose_cuda_v1(T_plan,(double*)data_d,dummy,2);
  time[0]+=MPI_Wtime();

  fast_transpose_cuda_v2(T_plan,(double*)data_d,dummy,2); // Warmup
  time[1]=-MPI_Wtime();
  fast_transpose_cuda_v2(T_plan,(double*)data_d,dummy,2);
  time[1]+=MPI_Wtime();

  // The v3 k-way variants are only tried on large power-of-two process
  // counts; kway is swept over nprocs, nprocs/2, ..., down to 16 ranks.
  if(IsPowerOfTwo(nprocs) && nprocs>511){
    kway_async=true;
#ifndef TORUS_TOPOL
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      fast_transpose_cuda_v3(T_plan,(double*)data_d,dummy,kway,2); // Warmup
      time[2+i]=-MPI_Wtime();
      fast_transpose_cuda_v3(T_plan,(double*)data_d,dummy,kway,2);
      time[2+i]+=MPI_Wtime();
    }
#endif

    kway_async=false;
#ifdef TORUS_TOPOL
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      fast_transpose_cuda_v3(T_plan,(double*)data_d,dummy,kway,2); // Warmup
      time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
      fast_transpose_cuda_v3(T_plan,(double*)data_d,dummy,kway,2);
      time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
    }
#endif

#ifndef TORUS_TOPOL
    kway_async=true;
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      fast_transpose_cuda_v3_2(T_plan,(double*)data_d,dummy,kway,2); // Warmup
      time[2+2*(int)log2(nprocs)+i]=-MPI_Wtime();
      fast_transpose_cuda_v3_2(T_plan,(double*)data_d,dummy,kway,2);
      time[2+2*(int)log2(nprocs)+i]+=MPI_Wtime();
    }
#endif

#ifdef TORUS_TOPOL
    kway_async=false;
    for (int i=0;i<(int)log2(nprocs)-4;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      fast_transpose_cuda_v3_2(T_plan,(double*)data_d,dummy,kway,2); // Warmup
      time[2+3*(int)log2(nprocs)+i]=-MPI_Wtime();
      fast_transpose_cuda_v3_2(T_plan,(double*)data_d,dummy,kway,2);
      time[2+3*(int)log2(nprocs)+i]+=MPI_Wtime();
    }
#endif
  }

  fast_transpose_cuda_v1_2(T_plan,(double*)data_d,dummy,2); // Warmup
  time[4*(int)log2(nprocs)+2]=-MPI_Wtime();
  fast_transpose_cuda_v1_2(T_plan,(double*)data_d,dummy,2);
  time[4*(int)log2(nprocs)+2]+=MPI_Wtime();

  fast_transpose_cuda_v1_3(T_plan,(double*)data_d,dummy,2); // Warmup
  time[4*(int)log2(nprocs)+3]=-MPI_Wtime();
  fast_transpose_cuda_v1_3(T_plan,(double*)data_d,dummy,2);
  time[4*(int)log2(nprocs)+3]+=MPI_Wtime();

  // Use the slowest rank's time per candidate so all ranks pick the same
  // method.
  MPI_Allreduce(time,g_time,(4*(int)log2(nprocs)+4),MPI_DOUBLE,MPI_MAX, T_plan->comm);
  if(VERBOSE>=1)
    if(T_plan->procid==0){
      for(int i=0;i<4*(int)log2(nprocs)+4;++i)
        std::cout<<" time["<<i<<"]= "<<g_time[i]<<" , ";
      std::cout<<'\n';
    }

  double smallest=1000;
  for (int i=0;i<4*(int)log2(nprocs)+4;i++)
    smallest=std::min(smallest,g_time[i]);

  // Map the winning slot back to a method id (see slot layout above).
  if(g_time[0]==smallest){
    T_plan->method=1;
  }
  else if(g_time[1]==smallest){
    T_plan->method=2;
  }
  else if(g_time[4*(int)log2(nprocs)+2]==smallest){
    T_plan->method=12;
  }
  else if(g_time[4*(int)log2(nprocs)+3]==smallest){
    T_plan->method=13;
  }
  else{
    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+i]==smallest){
        T_plan->method=3;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=true;
        break;
      }
    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+(int)log2(nprocs)+i]==smallest){
        T_plan->method=3;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=false;
        break;
      }

    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+2*(int)log2(nprocs)+i]==smallest){
        T_plan->method=32;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=true;
        break;
      }

    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+3*(int)log2(nprocs)+i]==smallest){
        T_plan->method=32;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=false;
        break;
      }
  }

  if(VERBOSE>=1){
    PCOUT<<"smallest= "<<smallest<<std::endl;
    // NOTE(review): prints this->method while the selection wrote
    // T_plan->method — presumably this==T_plan at every call site; confirm.
    PCOUT<<"Using transpose v"<<method<<" kway= "<<T_plan->kway<<" kway_async="<<T_plan->kway_async<<std::endl;
  }
  free(time);
  free(g_time);
  MPI_Barrier(T_plan->comm);

  return;
}
726 #endif
727 
728 
729 
730 
731 void fast_transpose_cuda_v1_h(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
732 
733  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
734  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If nprocs==1 and Flags==Transposed_Out return
735  MPI_Barrier(T_plan->comm);
736  return;
737  }
738  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
739  transpose_cuda_v5(T_plan,(double*)data,timings,flags,howmany,tag);
740  MPI_Barrier(T_plan->comm);
741  return;
742  }
743  timings[0]-=MPI_Wtime();
744  int nprocs, procid;
745  int nprocs_0, nprocs_1;
746  nprocs=T_plan->nprocs;
747  procid=T_plan->procid;
748  nprocs_0=T_plan->nprocs_0;
749  nprocs_1=T_plan->nprocs_1;
750  ptrdiff_t *N=T_plan->N;
751  ptrdiff_t local_n0=T_plan->local_n0;
752  ptrdiff_t local_n1=T_plan->local_n1;
753  ptrdiff_t n_tuples=T_plan->n_tuples;
754 
755  double * data_cpu=T_plan->buffer; //pinned
756  double * send_recv_cpu = T_plan->buffer_2;//pinned
757  double * send_recv_d = T_plan->buffer_d;
758 
759  int idist=N[1]*local_n0*n_tuples;
760  int odist=N[0]*local_n1*n_tuples;
761 
762  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
763 
764  int ptr=0;
765  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
766  if(VERBOSE>=2){
767  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
768  for(int h=0;h<howmany;h++)
769  for(int id=0;id<nprocs;++id){
770  if(procid==id)
771  for(int i=0;i<local_n0;i++){
772  std::cout<<std::endl;
773  for(int j=0;j<N[1];j++){
774  ptr=h*idist+(i*N[1]+j)*n_tuples;
775  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
776  }
777  }
778  std::cout<<'\n';
779  MPI_Barrier(T_plan->comm);
780  }
781  }
782  //PCOUT<<" ============================================= "<<std::endl;
783  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
784  //PCOUT<<" ============================================= "<<std::endl;
785  ptr=0;
786  ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
787  ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
788  ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
789  ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
790  shuffle_time-=MPI_Wtime();
791  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
792 #pragma omp parallel for
793  for(int h=0;h<howmany;h++)
794  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
795  }
796  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
797 #pragma omp parallel for
798  for(int h=0;h<howmany;h++)
799  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
800  }
801  if(nprocs==1){ // Transpose is done!
802  shuffle_time+=MPI_Wtime();
803  timings[0]+=MPI_Wtime();
804  timings[0]+=shuffle_time;
805  timings[1]+=shuffle_time;
806  timings[2]+=0;
807  timings[3]+=0;
808  MPI_Barrier(T_plan->comm);
809  return;
810  }
811 
812  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
813  ptr=0;
814 
815  //for (int proc=0;proc<nprocs_1;++proc)
816  // for(int h=0;h<howmany;++h){
817  // for(int i=0;i<local_n0;++i){
818  // //for(int j=local_1_start_proc[proc];j<local_1_start_proc[proc]+local_n1_proc[proc];++j){
819  // // memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+j)*n_tuples],sizeof(double)*n_tuples);
820  // // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
821  // // ptr+=n_tuples;
822  // //}
823  // //memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],sizeof(double)*n_tuples*local_n1_proc[proc]);
824  // cudaMemcpy(&send_recv_d[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples] , sizeof(double)*n_tuples*local_n1_proc[proc] , cudaMemcpyDeviceToDevice);
825  // ptr+=n_tuples*local_n1_proc[proc];
826  // }
827  // }
828  memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
829  cudaDeviceSynchronize();
830 
831 
832  shuffle_time+=MPI_Wtime();
833  ptr=0;
834  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
835  if(VERBOSE>=2){
836  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
837  for(int id=0;id<nprocs;++id){
838  for(int h=0;h<howmany;h++)
839  if(procid==id)
840  for(int i=0;i<N[1];i++){
841  std::cout<<std::endl;
842  for(int j=0;j<local_n0;j++){
843  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
844  ptr+=n_tuples;
845  }
846  }
847  std::cout<<'\n';
848  MPI_Barrier(T_plan->comm);
849  }
850  }
851 
852 
853  //PCOUT<<" ============================================= "<<std::endl;
854  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
855  //PCOUT<<" ============================================= "<<std::endl;
856 
857  int* scount_proc= T_plan->scount_proc;
858  int* rcount_proc= T_plan->rcount_proc;
859  int* soffset_proc= T_plan->soffset_proc;
860  int* roffset_proc= T_plan->roffset_proc;
861 
862  MPI_Barrier(T_plan->comm);
863 
864  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
865  comm_time-=MPI_Wtime();
866 
867  int soffset=0,roffset=0;
868  MPI_Status ierr;
869  MPI_Request * s_request= new MPI_Request[nprocs];
870  MPI_Request * request= new MPI_Request[nprocs];
871 #pragma omp parallel for
872  for (int proc=0;proc<nprocs;++proc){
873  request[proc]=MPI_REQUEST_NULL;
874  s_request[proc]=MPI_REQUEST_NULL;
875  }
876 
877  double *s_buf, *r_buf;
878  s_buf=data_cpu; r_buf=send_recv_cpu;
879  double *r_buf_d=send_recv_d;
880  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
881 
882 
883  // Post Receives
884  for (int proc=0;proc<nprocs;++proc){
885  if(proc!=procid){
886  roffset=roffset_proc[proc];
887  MPI_Irecv(&r_buf[roffset*howmany],rcount_proc[proc]*howmany,MPI_DOUBLE, proc,
888  tag, T_plan->comm, &request[proc]);
889  }
890  }
891  // SEND
892  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
893  for (int proc=0;proc<nprocs;++proc){
894  if(proc!=procid){
895  soffset=soffset_proc[proc];
896  MPI_Isend(&s_buf[soffset*howmany],scount_proc[proc]*howmany,MPI_DOUBLE,proc, tag,
897  T_plan->comm, &s_request[proc]);
898  }
899  }
900 
901  soffset=soffset_proc[procid];//aoffset_proc[proc];//proc*count_proc[proc];
902  roffset=roffset_proc[procid];
903  memcpy(&r_buf[roffset*howmany],&s_buf[soffset*howmany],howmany*sizeof(double)*scount_proc[procid]);
904 
905  for (int proc=0;proc<nprocs;++proc){
906  MPI_Wait(&request[proc], &ierr);
907  MPI_Wait(&s_request[proc], &ierr);
908  }
909 
910  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
911  cudaMemcpy(r_buf_d, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
912  cudaDeviceSynchronize();
913  comm_time+=MPI_Wtime();
914 
915 
916  ptr=0;
917  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
918  if(VERBOSE>=2){
919  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
920  for(int id=0;id<nprocs;++id){
921  if(procid==id)
922  for(int h=0;h<howmany;h++)
923  for(int i=0;i<local_n1;i++){
924  std::cout<<std::endl;
925  for(int j=0;j<N[0];j++){
926  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
927  ptr+=n_tuples;
928  }
929  }
930  std::cout<<'\n';
931  MPI_Barrier(T_plan->comm);
932  }
933  }
934  //PCOUT<<" ============================================= "<<std::endl;
935  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
936  //PCOUT<<" ============================================= "<<std::endl;
937  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
938  reshuffle_time-=MPI_Wtime();
939  ptr=0;
940  //for (int proc=0;proc<nprocs_0;++proc)
941  // for(int h=0;h<howmany;++h){
942  // for(int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
943  // //memcpy(&data_cpu[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples);
944  // cudaMemcpy( &data[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples,cudaMemcpyHostToDevice);
945  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" i=("<<i<<") data_ptr= "<<h*odist+(i*local_n1)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*local_n1_proc[proc] <<std::endl;
946  // ptr+=n_tuples*local_n1;
947  // //for(int j=0*local_1_start_proc[proc];j<0*local_1_start_proc[proc]+local_n1;++j){
948  // // memcpy(&data[h*odist+(i*local_n1+j)*n_tuples],&send_recv[ptr],sizeof(double)*n_tuples);
949  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
950  // // ptr+=n_tuples;
951  // //}
952  // }
953  // }
954  //memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_cpu);
955  memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_d);
956  cudaDeviceSynchronize();
957 
958  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
959  if(VERBOSE>=2){
960  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
961  for(int id=0;id<nprocs_1;++id){
962  if(procid==id)
963  for(int h=0;h<howmany;h++)
964  for(int i=0;i<N[0];i++){
965  std::cout<<std::endl;
966  for(int j=0;j<local_n1;j++){
967  ptr=h*odist+(i*local_n1+j)*n_tuples;
968  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
969  }
970  }
971  std::cout<<'\n';
972  MPI_Barrier(T_plan->comm);
973  }
974  }
975  // Right now the data is in transposed out format.
976  // If the user did not want this layout, transpose again.
977  if(Flags[1]==0){
978 #pragma omp parallel for
979  for(int h=0;h<howmany;h++)
980  local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
981 
982  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
983  if(VERBOSE>=2){
984  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
985  for(int id=0;id<nprocs_1;++id){
986  if(procid==id)
987  for(int h=0;h<howmany;h++)
988  for(int i=0;i<N[0];i++){
989  std::cout<<std::endl;
990  for(int j=0;j<local_n1;j++){
991  ptr=h*odist+(i*local_n1+j)*n_tuples;
992  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
993  }
994  }
995  std::cout<<'\n';
996  MPI_Barrier(T_plan->comm);
997  }
998  }
999  }
1000 
1001  reshuffle_time+=MPI_Wtime();
1002  MPI_Barrier(T_plan->comm);
1003  delete [] request;
1004  delete [] s_request;
1005 
1006 
1007  if(VERBOSE>=1){
1008  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1009  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1010  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1011  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1012  }
1013  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1014  timings[1]+=shuffle_time;
1015  timings[2]+=comm_time;
1016  timings[3]+=reshuffle_time;
1017  return;
1018 }
1019 
1020 void fast_transpose_cuda_v1_2_h(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
1021 
1022  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
1023  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If nprocs==1 and Flags==Transposed_Out return
1024  MPI_Barrier(T_plan->comm);
1025  return;
1026  }
1027  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
1028  transpose_cuda_v5(T_plan,(double*)data,timings,flags,howmany,tag);
1029  MPI_Barrier(T_plan->comm);
1030  return;
1031  }
1032  timings[0]-=MPI_Wtime();
1033  int nprocs, procid;
1034  int nprocs_0, nprocs_1;
1035  nprocs=T_plan->nprocs;
1036  procid=T_plan->procid;
1037  nprocs_0=T_plan->nprocs_0;
1038  nprocs_1=T_plan->nprocs_1;
1039  ptrdiff_t *N=T_plan->N;
1040  ptrdiff_t local_n0=T_plan->local_n0;
1041  ptrdiff_t local_n1=T_plan->local_n1;
1042  ptrdiff_t n_tuples=T_plan->n_tuples;
1043 
1044  double * data_cpu=T_plan->buffer; //pinned
1045  double * send_recv_cpu = T_plan->buffer_2;//pinned
1046  double * send_recv_d = T_plan->buffer_d;
1047 
1048  int idist=N[1]*local_n0*n_tuples;
1049  int odist=N[0]*local_n1*n_tuples;
1050 
1051  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1052 
1053  int ptr=0;
1054  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
1055  if(VERBOSE>=2){
1056  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1057  for(int h=0;h<howmany;h++)
1058  for(int id=0;id<nprocs;++id){
1059  if(procid==id)
1060  for(int i=0;i<local_n0;i++){
1061  std::cout<<std::endl;
1062  for(int j=0;j<N[1];j++){
1063  ptr=h*idist+(i*N[1]+j)*n_tuples;
1064  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
1065  }
1066  }
1067  std::cout<<'\n';
1068  MPI_Barrier(T_plan->comm);
1069  }
1070  }
1071  //PCOUT<<" ============================================= "<<std::endl;
1072  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
1073  //PCOUT<<" ============================================= "<<std::endl;
1074  ptr=0;
1075  ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1076  ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1077  ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1078  ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1079  shuffle_time-=MPI_Wtime();
1080  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1081 #pragma omp parallel for
1082  for(int h=0;h<howmany;h++)
1083  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
1084  }
1085  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1086 #pragma omp parallel for
1087  for(int h=0;h<howmany;h++)
1088  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
1089  }
1090  if(nprocs==1){ // Transpose is done!
1091  shuffle_time+=MPI_Wtime();
1092  timings[0]+=MPI_Wtime();
1093  timings[0]+=shuffle_time;
1094  timings[1]+=shuffle_time;
1095  timings[2]+=0;
1096  timings[3]+=0;
1097  MPI_Barrier(T_plan->comm);
1098  return;
1099  }
1100 
1101  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
1102  ptr=0;
1103 
1104  //for (int proc=0;proc<nprocs_1;++proc)
1105  // for(int h=0;h<howmany;++h){
1106  // for(int i=0;i<local_n0;++i){
1107  // //for(int j=local_1_start_proc[proc];j<local_1_start_proc[proc]+local_n1_proc[proc];++j){
1108  // // memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+j)*n_tuples],sizeof(double)*n_tuples);
1109  // // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1110  // // ptr+=n_tuples;
1111  // //}
1112  // //memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],sizeof(double)*n_tuples*local_n1_proc[proc]);
1113  // cudaMemcpy(&send_recv_d[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples] , sizeof(double)*n_tuples*local_n1_proc[proc] , cudaMemcpyDeviceToDevice);
1114  // ptr+=n_tuples*local_n1_proc[proc];
1115  // }
1116  // }
1117  memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
1118  cudaDeviceSynchronize();
1119 
1120 
1121  shuffle_time+=MPI_Wtime();
1122  ptr=0;
1123  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
1124  if(VERBOSE>=2){
1125  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1126  for(int id=0;id<nprocs;++id){
1127  for(int h=0;h<howmany;h++)
1128  if(procid==id)
1129  for(int i=0;i<N[1];i++){
1130  std::cout<<std::endl;
1131  for(int j=0;j<local_n0;j++){
1132  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
1133  ptr+=n_tuples;
1134  }
1135  }
1136  std::cout<<'\n';
1137  MPI_Barrier(T_plan->comm);
1138  }
1139  }
1140 
1141 
1142  //PCOUT<<" ============================================= "<<std::endl;
1143  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
1144  //PCOUT<<" ============================================= "<<std::endl;
1145 
1146  int* scount_proc= T_plan->scount_proc;
1147  int* rcount_proc= T_plan->rcount_proc;
1148  int* soffset_proc= T_plan->soffset_proc;
1149  int* roffset_proc= T_plan->roffset_proc;
1150 
1151  MPI_Barrier(T_plan->comm);
1152 
1153  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1154  comm_time-=MPI_Wtime();
1155 
1156  int soffset=0,roffset=0;
1157  MPI_Status ierr;
1158  MPI_Request * s_request= new MPI_Request[nprocs];
1159  MPI_Request * request= new MPI_Request[nprocs];
1160  int flag[nprocs],color[nprocs];
1161  memset(flag,0,sizeof(int)*nprocs);
1162  memset(color,0,sizeof(int)*nprocs);
1163  int counter=1;
1164 #pragma omp parallel for
1165  for (int proc=0;proc<nprocs;++proc){
1166  request[proc]=MPI_REQUEST_NULL;
1167  s_request[proc]=MPI_REQUEST_NULL;
1168  }
1169 
1170  double *s_buf, *r_buf;
1171  s_buf=data_cpu; r_buf=send_recv_cpu;
1172  double *r_buf_d=send_recv_d;
1173  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> send_recv_d --Th--> data_d
1174 
1175 
1176  // SEND
1177  //cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1178  for (int proc=0;proc<nprocs;++proc){
1179  if(proc!=procid){
1180  soffset=soffset_proc[proc];
1181  cudaMemcpy(&s_buf[soffset*howmany], &send_recv_d[soffset*howmany],howmany*sizeof(double)*scount_proc[proc], cudaMemcpyDeviceToHost);
1182  MPI_Isend(&s_buf[soffset*howmany],scount_proc[proc]*howmany, MPI_DOUBLE,proc, tag,
1183  T_plan->comm, &s_request[proc]);
1184 
1185  }
1186  else{ // copy your own part directly
1187  //cudaMemcpy(&r_buf_d[roffset_proc[procid]*howmany], &send_recv_d[soffset_proc[procid]*howmany],howmany*sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
1188  // Note that because if Flags[1]=0 the send_d and recv_d are the same, D2D should not be used
1189  // Otherwise it will corrupt the data in the unbalanced cases
1190  cudaMemcpy(&r_buf[roffset_proc[procid]*howmany], &send_recv_d[soffset_proc[procid]*howmany],howmany*sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToHost);
1191  flag[procid]=1;
1192  }
1193  }
1194  for (int proc=0;proc<nprocs;++proc){
1195  if(proc!=procid){
1196  roffset=roffset_proc[proc];
1197  MPI_Irecv(&r_buf[roffset*howmany],rcount_proc[proc]*howmany,MPI_DOUBLE, proc,
1198  tag, T_plan->comm, &request[proc]);
1199  }
1200  }
1201  while(counter!=nprocs+1){
1202 
1203  for (int proc=0;proc<nprocs;++proc){
1204  MPI_Test(&request[proc], &flag[proc],&ierr);
1205  if(flag[proc]==1 && color[proc]==0){
1206  cudaMemcpyAsync(&r_buf_d[roffset_proc[proc]*howmany],&r_buf[roffset_proc[proc]*howmany],howmany*sizeof(double)*rcount_proc[proc],cudaMemcpyHostToDevice);
1207  color[proc]=1;
1208  counter+=1;
1209  }
1210  }
1211 
1212  }
1213  //cudaMemcpy(r_buf_d, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
1214  //cudaMemcpy(&r_buf_d[roffset_proc[procid]*howmany], &send_recv_d[soffset_proc[procid]*howmany],howmany*sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
1215  cudaDeviceSynchronize();
1216  comm_time+=MPI_Wtime();
1217 
1218 
1219 
1220 
1221 
1222  ptr=0;
1223  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
1224  if(VERBOSE>=2){
1225  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1226  for(int id=0;id<nprocs;++id){
1227  if(procid==id)
1228  for(int h=0;h<howmany;h++)
1229  for(int i=0;i<local_n1;i++){
1230  std::cout<<std::endl;
1231  for(int j=0;j<N[0];j++){
1232  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
1233  ptr+=n_tuples;
1234  }
1235  }
1236  std::cout<<'\n';
1237  MPI_Barrier(T_plan->comm);
1238  }
1239  }
1240  //PCOUT<<" ============================================= "<<std::endl;
1241  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
1242  //PCOUT<<" ============================================= "<<std::endl;
1243  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
1244  reshuffle_time-=MPI_Wtime();
1245  ptr=0;
1246  //for (int proc=0;proc<nprocs_0;++proc)
1247  // for(int h=0;h<howmany;++h){
1248  // for(int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
1249  // //memcpy(&data_cpu[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples);
1250  // cudaMemcpy( &data[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples,cudaMemcpyHostToDevice);
1251  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" i=("<<i<<") data_ptr= "<<h*odist+(i*local_n1)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*local_n1_proc[proc] <<std::endl;
1252  // ptr+=n_tuples*local_n1;
1253  // //for(int j=0*local_1_start_proc[proc];j<0*local_1_start_proc[proc]+local_n1;++j){
1254  // // memcpy(&data[h*odist+(i*local_n1+j)*n_tuples],&send_recv[ptr],sizeof(double)*n_tuples);
1255  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1256  // // ptr+=n_tuples;
1257  // //}
1258  // }
1259  // }
1260  //memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_cpu);
1261  memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_d);
1262  cudaDeviceSynchronize();
1263 
1264  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1265  if(VERBOSE>=2){
1266  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1267  for(int id=0;id<nprocs_1;++id){
1268  if(procid==id)
1269  for(int h=0;h<howmany;h++)
1270  for(int i=0;i<N[0];i++){
1271  std::cout<<std::endl;
1272  for(int j=0;j<local_n1;j++){
1273  ptr=h*odist+(i*local_n1+j)*n_tuples;
1274  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
1275  }
1276  }
1277  std::cout<<'\n';
1278  MPI_Barrier(T_plan->comm);
1279  }
1280  }
1281  // Right now the data is in transposed out format.
1282  // If the user did not want this layout, transpose again.
1283  if(Flags[1]==0){
1284 #pragma omp parallel for
1285  for(int h=0;h<howmany;h++)
1286  local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
1287 
1288  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1289  if(VERBOSE>=2){
1290  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1291  for(int id=0;id<nprocs_1;++id){
1292  if(procid==id)
1293  for(int h=0;h<howmany;h++)
1294  for(int i=0;i<N[0];i++){
1295  std::cout<<std::endl;
1296  for(int j=0;j<local_n1;j++){
1297  ptr=h*odist+(i*local_n1+j)*n_tuples;
1298  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
1299  }
1300  }
1301  std::cout<<'\n';
1302  MPI_Barrier(T_plan->comm);
1303  }
1304  }
1305  }
1306 
1307  reshuffle_time+=MPI_Wtime();
1308  MPI_Barrier(T_plan->comm);
1309  delete [] request;
1310  delete [] s_request;
1311 
1312 
1313  if(VERBOSE>=1){
1314  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1315  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1316  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1317  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1318  }
1319  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1320  timings[1]+=shuffle_time;
1321  timings[2]+=comm_time;
1322  timings[3]+=reshuffle_time;
1323  return;
1324 }
/*
 * Distributed 2D transpose of GPU-resident data, "fast v1_3" variant with
 * `howmany` interleaved fields. Differs from v1_2 by using blocking
 * MPI_Sendrecv pairs instead of non-blocking Isend/Irecv with overlap.
 *
 * Data flow (as established by the calls below):
 *   data (device) --memcpy_v1_h1--> send_recv_d
 *   send_recv_d --cudaMemcpy D2H--> data_cpu (pinned, s_buf)
 *   s_buf --MPI_Sendrecv--> send_recv_cpu (pinned, r_buf)
 *   r_buf --single bulk cudaMemcpy H2D--> send_recv_d
 *   send_recv_d --memcpy_v1_h2--> data (device)
 * If Flags[1]==0 (Transposed_Out not requested) a final local transpose
 * restores the non-transposed layout.
 *
 * Parameters:
 *   T_plan  - transpose plan: communicator, counts/offsets, pinned & device buffers
 *   data    - device pointer; input layout local_n0 x N[1] x n_tuples per field
 *   timings - accumulates [0]=total, [1]=shuffle, [2]=comm, [3]=reshuffle times
 *   flags   - bit 0 = Transposed_In (unsupported here, falls back to
 *             transpose_cuda_v5), bit 1 = Transposed_Out
 *   howmany - number of interleaved transforms
 *   tag     - MPI message tag
 */
void fast_transpose_cuda_v1_3_h(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){

 std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If nprocs==1 and Flags==Transposed_Out return
 MPI_Barrier(T_plan->comm);
 return;
 }
 if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
 transpose_cuda_v5(T_plan,(double*)data,timings,flags,howmany,tag);
 MPI_Barrier(T_plan->comm);
 return;
 }
 timings[0]-=MPI_Wtime();
 int nprocs, procid;
 int nprocs_0, nprocs_1;
 nprocs=T_plan->nprocs;
 procid=T_plan->procid;
 nprocs_0=T_plan->nprocs_0;
 nprocs_1=T_plan->nprocs_1;
 ptrdiff_t *N=T_plan->N;
 ptrdiff_t local_n0=T_plan->local_n0;
 ptrdiff_t local_n1=T_plan->local_n1;
 ptrdiff_t n_tuples=T_plan->n_tuples;

 double * data_cpu=T_plan->buffer; //pinned
 double * send_recv_cpu = T_plan->buffer_2;//pinned
 double * send_recv_d = T_plan->buffer_d;

 // Distance between successive fields in the input/output layouts.
 int idist=N[1]*local_n0*n_tuples;
 int odist=N[0]*local_n1*n_tuples;

 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

 int ptr=0;
 if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
 if(VERBOSE>=2){
 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
 for(int h=0;h<howmany;h++)
 for(int id=0;id<nprocs;++id){
 if(procid==id)
 for(int i=0;i<local_n0;i++){
 std::cout<<std::endl;
 for(int j=0;j<N[1];j++){
 ptr=h*idist+(i*N[1]+j)*n_tuples;
 std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
 }
 }
 std::cout<<'\n';
 MPI_Barrier(T_plan->comm);
 }
 }
 // ============== Local Transpose (shuffle phase) ==============
 ptr=0;
 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
 shuffle_time-=MPI_Wtime();
 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
#pragma omp parallel for
 for(int h=0;h<howmany;h++)
 local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
 }
 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
#pragma omp parallel for
 for(int h=0;h<howmany;h++)
 local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
 }
 if(nprocs==1){ // Transpose is done!
 shuffle_time+=MPI_Wtime();
 // NOTE(review): timings[0] receives both the elapsed total and shuffle_time
 // here, apparently double-counting; same pattern as the sibling variants.
 timings[0]+=MPI_Wtime();
 timings[0]+=shuffle_time;
 timings[1]+=shuffle_time;
 timings[2]+=0;
 timings[3]+=0;
 MPI_Barrier(T_plan->comm);
 return;
 }

 // Pack the device data into per-destination contiguous chunks in send_recv_d.
 // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
 ptr=0;

 memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
 cudaDeviceSynchronize();


 shuffle_time+=MPI_Wtime();
 ptr=0;
 if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
 if(VERBOSE>=2){
 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
 for(int id=0;id<nprocs;++id){
 for(int h=0;h<howmany;h++)
 if(procid==id)
 for(int i=0;i<N[1];i++){
 std::cout<<std::endl;
 for(int j=0;j<local_n0;j++){
 std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
 ptr+=n_tuples;
 }
 }
 std::cout<<'\n';
 MPI_Barrier(T_plan->comm);
 }
 }


 // ============== Communication phase (blocking Sendrecv) ==============

 int* scount_proc= T_plan->scount_proc;
 int* rcount_proc= T_plan->rcount_proc;
 int* soffset_proc= T_plan->soffset_proc;
 int* roffset_proc= T_plan->roffset_proc;

 MPI_Barrier(T_plan->comm);

 comm_time-=MPI_Wtime();

 int soffset=0,roffset=0;
 MPI_Status ierr;

 double *s_buf, *r_buf;
 s_buf=data_cpu; r_buf=send_recv_cpu;
 double *r_buf_d=send_recv_d;
 // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> send_recv_d --Th--> data_d


 // SEND: for each peer, stage its chunk D2H then exchange with a blocking
 // MPI_Sendrecv; offsets/counts are scaled by `howmany` interleaved fields.
 for (int proc=0;proc<nprocs;++proc){
 if(proc!=procid){
 soffset=soffset_proc[proc];
 roffset=roffset_proc[proc];
 cudaMemcpy(&s_buf[soffset*howmany], &send_recv_d[soffset*howmany],howmany*sizeof(double)*scount_proc[proc], cudaMemcpyDeviceToHost);
 MPI_Sendrecv(&s_buf[soffset*howmany],scount_proc[proc]*howmany, MPI_DOUBLE,
 proc, tag,
 &r_buf[roffset*howmany],howmany*rcount_proc[proc], MPI_DOUBLE,
 proc, tag,
 T_plan->comm,&ierr);

 }
 else{ // copy your own part directly
 // Note that because if Flags[1]=0 the send_d and recv_d are the same, D2D should not be used
 // Otherwise it will corrupt the data in the unbalanced cases
 cudaMemcpy(&r_buf[roffset_proc[procid]*howmany], &send_recv_d[soffset_proc[procid]*howmany],howmany*sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToHost);
 }
 }

 // One bulk upload of the fully assembled receive buffer back to the device.
 cudaMemcpy(r_buf_d,r_buf,T_plan->alloc_local,cudaMemcpyHostToDevice);
 cudaDeviceSynchronize();
 comm_time+=MPI_Wtime();


 ptr=0;
 if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
 if(VERBOSE>=2){
 cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
 for(int id=0;id<nprocs;++id){
 if(procid==id)
 for(int h=0;h<howmany;h++)
 for(int i=0;i<local_n1;i++){
 std::cout<<std::endl;
 for(int j=0;j<N[0];j++){
 std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
 ptr+=n_tuples;
 }
 }
 std::cout<<'\n';
 MPI_Barrier(T_plan->comm);
 }
 }
 // ============== 2nd Local Transpose (reshuffle phase) ==============
 // Unpack the received chunks from send_recv_d back into `data`.
 // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
 reshuffle_time-=MPI_Wtime();
 ptr=0;
 memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_d);
 cudaDeviceSynchronize();

 if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
 if(VERBOSE>=2){
 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
 for(int id=0;id<nprocs_1;++id){
 if(procid==id)
 for(int h=0;h<howmany;h++)
 for(int i=0;i<N[0];i++){
 std::cout<<std::endl;
 for(int j=0;j<local_n1;j++){
 ptr=h*odist+(i*local_n1+j)*n_tuples;
 std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
 }
 }
 std::cout<<'\n';
 MPI_Barrier(T_plan->comm);
 }
 }
 // Right now the data is in transposed out format.
 // If the user did not want this layout, transpose again.
 if(Flags[1]==0){
#pragma omp parallel for
 for(int h=0;h<howmany;h++)
 local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );

 if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
 if(VERBOSE>=2){
 cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
 for(int id=0;id<nprocs_1;++id){
 if(procid==id)
 for(int h=0;h<howmany;h++)
 for(int i=0;i<N[0];i++){
 std::cout<<std::endl;
 for(int j=0;j<local_n1;j++){
 ptr=h*odist+(i*local_n1+j)*n_tuples;
 std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
 }
 }
 std::cout<<'\n';
 MPI_Barrier(T_plan->comm);
 }
 }
 }

 reshuffle_time+=MPI_Wtime();
 MPI_Barrier(T_plan->comm);

 if(VERBOSE>=1){
 PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
 PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
 PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
 PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
 }
 timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
 timings[1]+=shuffle_time;
 timings[2]+=comm_time;
 timings[3]+=reshuffle_time;
 return;
}
1599 
1600 void fast_transpose_cuda_v1(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
1601 
1602  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
1603  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
1604  MPI_Barrier(T_plan->comm);
1605  return;
1606  }
1607  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
1608  transpose_cuda_v5(T_plan,(double*)data,timings,flags,howmany,tag);
1609  MPI_Barrier(T_plan->comm);
1610  return;
1611  }
1612  timings[0]-=MPI_Wtime();
1613  int nprocs, procid;
1614  nprocs=T_plan->nprocs;
1615  procid=T_plan->procid;
1616  int nprocs_1=T_plan->nprocs_1;
1617  ptrdiff_t *N=T_plan->N;
1618 
1619  double * data_cpu=T_plan->buffer; //pinned
1620  double * send_recv_cpu = T_plan->buffer_2;//pinned
1621  double * send_recv_d = T_plan->buffer_d;
1622 
1623 
1624  ptrdiff_t local_n0=T_plan->local_n0;
1625  ptrdiff_t local_n1=T_plan->local_n1;
1626  ptrdiff_t n_tuples=T_plan->n_tuples;
1627 
1628  int idist=N[1]*local_n0*n_tuples;
1629  int odist=N[0]*local_n1*n_tuples;
1630 
1631  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1632 
1633  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
1634  if(VERBOSE>=2){
1635  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1636  for(int h=0;h<howmany;h++)
1637  for(int id=0;id<nprocs;++id){
1638  if(procid==id)
1639  for(int i=0;i<local_n0;i++){
1640  std::cout<<std::endl;
1641  for(int j=0;j<N[1];j++){
1642  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
1643  }
1644  }
1645  std::cout<<'\n';
1646  MPI_Barrier(T_plan->comm);
1647  }
1648  }
1649  //PCOUT<<" ============================================= "<<std::endl;
1650  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
1651  //PCOUT<<" ============================================= "<<std::endl;
1652  int ptr=0;
1653  shuffle_time-=MPI_Wtime();
1654 
1655 
1656  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1657 #pragma omp parallel for
1658  for(int h=0;h<howmany;h++)
1659  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
1660  }
1661  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1662 #pragma omp parallel for
1663  for(int h=0;h<howmany;h++)
1664  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
1665  }
1666  if(nprocs==1){ // Transpose is done!
1667  shuffle_time+=MPI_Wtime();
1668  timings[0]+=MPI_Wtime();
1669  timings[0]+=shuffle_time;
1670  timings[1]+=shuffle_time;
1671  timings[2]+=0;
1672  timings[3]+=0;
1673  MPI_Barrier(T_plan->comm);
1674  return;
1675  }
1676 
1677  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
1678  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
1679  local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
1680 
1681  shuffle_time+=MPI_Wtime();
1682  ptr=0;
1683  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
1684  if(VERBOSE>=2){
1685  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1686  for(int h=0;h<howmany;h++)
1687  for(int id=0;id<nprocs;++id){
1688  if(procid==id)
1689  for(int i=0;i<N[1];i++){
1690  std::cout<<std::endl;
1691  for(int j=0;j<local_n0;j++){
1692  std::cout<<'\t'<<data_cpu[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
1693  ptr+=n_tuples;
1694  }
1695  }
1696  std::cout<<'\n';
1697  MPI_Barrier(T_plan->comm);
1698  }
1699  }
1700 
1701  //PCOUT<<" ============================================= "<<std::endl;
1702  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
1703  //PCOUT<<" ============================================= "<<std::endl;
1704 
1705  int* scount_proc= T_plan->scount_proc;
1706  int* rcount_proc= T_plan->rcount_proc;
1707  int* soffset_proc= T_plan->soffset_proc;
1708  int* roffset_proc= T_plan->roffset_proc;
1709 
1710  MPI_Barrier(T_plan->comm);
1711 
1712  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1713  comm_time-=MPI_Wtime();
1714 
1715  int soffset=0,roffset=0;
1716  MPI_Status ierr;
1717  MPI_Request * s_request= new MPI_Request[nprocs];
1718  MPI_Request * request= new MPI_Request[nprocs];
1719 #pragma omp parallel for
1720  for (int proc=0;proc<nprocs;++proc){
1721  request[proc]=MPI_REQUEST_NULL;
1722  s_request[proc]=MPI_REQUEST_NULL;
1723  }
1724  double *s_buf, *r_buf;
1725  s_buf=data_cpu; r_buf=send_recv_cpu;
1726 
1727  // SEND
1728  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
1729  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
1730  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1731  for (int proc=0;proc<nprocs;++proc){
1732  if(proc!=procid){
1733  soffset=soffset_proc[proc];
1734  roffset=roffset_proc[proc];
1735  MPI_Isend(&s_buf[soffset],scount_proc[proc],MPI_DOUBLE,proc, tag,
1736  T_plan->comm, &s_request[proc]);
1737  MPI_Irecv(&r_buf[roffset],rcount_proc[proc],MPI_DOUBLE, proc,
1738  tag, T_plan->comm, &request[proc]);
1739  }
1740  }
1741  // Copy Your own part. See the note below for the if condition
1742  soffset=soffset_proc[procid];//aoffset_proc[proc];//proc*count_proc[proc];
1743  roffset=roffset_proc[procid];
1744  for(int h=0;h<howmany;h++)
1745  memcpy(&r_buf[h*odist+roffset],&s_buf[h*idist+soffset],sizeof(double)*scount_proc[procid]);
1746 
1747 
1748  // If the output is to be transposed locally, then everything is done in sendrecv. just copy it
1749  // Otherwise you have to perform the copy via the multiple stride transpose
1750  for (int proc=0;proc<nprocs;++proc){
1751  MPI_Wait(&request[proc], &ierr);
1752  MPI_Wait(&s_request[proc], &ierr);
1753  }
1754 
1755  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
1756  if(Flags[1]==1)
1757  cudaMemcpy(data, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
1758  else
1759  cudaMemcpy(send_recv_d, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
1760  comm_time+=MPI_Wtime();
1761 
1762  ptr=0;
1763  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
1764  if(VERBOSE>=2){
1765  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1766  for(int h=0;h<howmany;h++)
1767  for(int id=0;id<nprocs;++id){
1768  if(procid==id)
1769  for(int i=0;i<local_n1;i++){
1770  std::cout<<std::endl;
1771  for(int j=0;j<N[0];j++){
1772  std::cout<<'\t'<<data_cpu[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
1773  ptr+=n_tuples;
1774  }
1775  }
1776  std::cout<<'\n';
1777  MPI_Barrier(T_plan->comm);
1778  }
1779  }
1780  //PCOUT<<" ============================================= "<<std::endl;
1781  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
1782  //PCOUT<<" ============================================= "<<std::endl;
1783  reshuffle_time-=MPI_Wtime();
1784  ptr=0;
1785  if(Flags[1]==0)
1786  local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
1787 
1788 
1789 
1790 
1791  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1792  if(VERBOSE>=2){
1793  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
1794  for(int h=0;h<howmany;h++)
1795  for(int id=0;id<nprocs_1;++id){
1796  if(procid==id)
1797  for(int i=0;i<local_n1;i++){
1798  std::cout<<std::endl;
1799  for(int j=0;j<N[0];j++){
1800  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
1801  }
1802  }
1803  std::cout<<'\n';
1804  MPI_Barrier(T_plan->comm);
1805  }
1806  }
1807 
1808  reshuffle_time+=MPI_Wtime();
1809  delete [] request;
1810  delete [] s_request;
1811 
1812 
1813  if(VERBOSE>=1){
1814  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1815  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1816  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1817  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1818  }
1819  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1820  timings[1]+=shuffle_time;
1821  timings[2]+=comm_time;
1822  timings[3]+=reshuffle_time;
1823  return;
1824 
1825 
1826 }
1827 
// Forward global transpose on the GPU, variant v1_2 (overlapped pipeline).
//
// Transposes a row-block-distributed N[0] x N[1] array of n_tuples-double
// elements (howmany batched arrays) into its column-block-distributed
// transpose.  Pipeline (see the Flags notes inline):
//   data (device) --pack kernel--> send_recv_d
//                 --per-chunk D2H--> data_cpu (pinned)
//                 --nonblocking MPI--> send_recv_cpu (pinned)
//                 --async H2D as each receive lands--> data / send_recv_d
// The distinguishing feature of this variant is the MPI_Test polling loop
// that starts each chunk's host-to-device copy as soon as its receive
// completes, so PCIe transfers overlap the remaining MPI traffic.
//
// Parameters:
//   T_plan  - transpose plan (communicator, counts/offsets, pinned buffers).
//   data    - DEVICE pointer; input and output (transposed in place
//             logically; staged through plan buffers).
//   timings - accumulators: [0] total, [1] shuffle, [2] comm, [3] reshuffle.
//   flags   - bit 0 = Transposed_In (not handled here; forwarded to
//             transpose_cuda_v5), bit 1 = Transposed_Out (skip the final
//             local transpose).
//   howmany - number of batched arrays.
//   tag     - MPI tag for the point-to-point exchange.
void fast_transpose_cuda_v1_2(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
    transpose_cuda_v5(T_plan,(double*)data,timings,flags,howmany,tag);
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  int nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;

  double * data_cpu=T_plan->buffer; //pinned host staging buffer (send side)
  double * send_recv_cpu = T_plan->buffer_2;//pinned host staging buffer (recv side)
  double * send_recv_d = T_plan->buffer_d; // device scratch buffer


  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;

  int idist=N[1]*local_n0*n_tuples; // stride between batched input slabs
  int odist=N[0]*local_n1*n_tuples; // stride between batched output slabs

  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2){
    // Debug dump of the input, rank by rank (D2H copy is debug-only).
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  int ptr=0;
  shuffle_time-=MPI_Wtime();


  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
    // Single process, Transposed_In and Transposed_Out: one in-place local
    // transpose per batched slab finishes the job.
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
  }
  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
  }
  if(nprocs==1){ // Transpose is done!
    shuffle_time+=MPI_Wtime();
    // NOTE(review): timings[0] already spans the shuffle via the -=/+=
    // MPI_Wtime pair; the extra "+=shuffle_time" looks like it double-counts
    // the shuffle phase in the total — confirm against the CPU version.
    timings[0]+=MPI_Wtime();
    timings[0]+=shuffle_time;
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    MPI_Barrier(T_plan->comm);
    return;
  }

  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
  // Pack columns into per-destination contiguous chunks on the device; the
  // last rank may own fewer columns (last_local_n1) in unbalanced cases.
  local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );

  shuffle_time+=MPI_Wtime();
  ptr=0;
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<data_cpu[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
              ptr+=n_tuples;
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }

  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  int* scount_proc= T_plan->scount_proc;
  int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;

  MPI_Barrier(T_plan->comm);

  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
  comm_time-=MPI_Wtime();

  int soffset=0,roffset=0;
  MPI_Status ierr; // reused as the status argument of MPI_Test below
  MPI_Request * s_request= new MPI_Request[nprocs];
  MPI_Request * request= new MPI_Request[nprocs];
  // flag[p]  : receive from rank p has completed (own rank pre-flagged).
  // color[p] : the H2D copy for rank p's chunk has already been issued.
  // NOTE(review): these are C99-style VLAs — a compiler extension in C++.
  int flag[nprocs],color[nprocs];
  memset(flag,0,sizeof(int)*nprocs);
  memset(color,0,sizeof(int)*nprocs);
  int counter=1; // counts chunks whose H2D copy has been issued, 1-based
#pragma omp parallel for
  for (int proc=0;proc<nprocs;++proc){
    request[proc]=MPI_REQUEST_NULL;
    s_request[proc]=MPI_REQUEST_NULL;
  }
  double *s_buf, *r_buf;
  s_buf=data_cpu; r_buf=send_recv_cpu;
  double * r_buf_d;
  if(Flags[1]==1)
    r_buf_d=data;        // Transposed_Out: receive lands in final layout
  else
    r_buf_d=send_recv_d; // otherwise one more local transpose follows

  // SEND
  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy-->send_recv_d --Th--> data_d


  // Post all receives first so matching sends can complete eagerly.
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      roffset=roffset_proc[proc];
      MPI_Irecv(&r_buf[roffset],rcount_proc[proc], MPI_DOUBLE, proc,
          tag, T_plan->comm, &request[proc]);
    }
  }
  // SEND
  //cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      soffset=soffset_proc[proc];
      // Stage only this destination's chunk to pinned memory, then send it.
      cudaMemcpy(&s_buf[soffset], &send_recv_d[soffset],sizeof(double)*scount_proc[proc], cudaMemcpyDeviceToHost);
      MPI_Isend(&s_buf[soffset],scount_proc[proc], MPI_DOUBLE,proc, tag,
          T_plan->comm, &s_request[proc]);

    }
    else{ // copy your own part directly
      //cudaMemcpy(&r_buf_d[roffset_proc[procid]], &send_recv_d[soffset_proc[procid]],sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
      // Note that because if Flags[1]=0 the send_d and recv_d are the same, D2D should not be used
      // Otherwise it will corrupt the data in the unbalanced cases
      cudaMemcpy(&r_buf[roffset_proc[procid]], &send_recv_d[soffset_proc[procid]],sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToHost);
      flag[procid]=1;
    }
  }
  // Poll the receives; as soon as rank proc's chunk has arrived, issue its
  // host-to-device copy asynchronously so the PCIe transfer overlaps the
  // remaining MPI traffic.  MPI_Test on the own rank's MPI_REQUEST_NULL
  // returns flag==1 immediately, so the local chunk is picked up here too.
  while(counter!=nprocs+1){

    for (int proc=0;proc<nprocs;++proc){
      MPI_Test(&request[proc], &flag[proc],&ierr);
      if(flag[proc]==1 && color[proc]==0){
        cudaMemcpyAsync(&r_buf_d[roffset_proc[proc]],&r_buf[roffset_proc[proc]],sizeof(double)*rcount_proc[proc],cudaMemcpyHostToDevice);
        color[proc]=1; // never issue the copy twice
        counter+=1;
      }
    }

  }
  cudaDeviceSynchronize(); // drain the async H2D copies before timing stops
  comm_time+=MPI_Wtime();








  ptr=0;
  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data_cpu[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
              ptr+=n_tuples;
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== 2nd Local Transpose ========== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  reshuffle_time-=MPI_Wtime();
  ptr=0;
  // If the output was not requested transposed, one more local transpose
  // moves send_recv_d into the final layout in data.
  if(Flags[1]==0)
    local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );




  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }

  reshuffle_time+=MPI_Wtime();
  delete [] request;
  delete [] s_request;


  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;


}
2081 void fast_transpose_cuda_v1_3(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
2082 
2083  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
2084  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
2085  MPI_Barrier(T_plan->comm);
2086  return;
2087  }
2088  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
2089  transpose_cuda_v5(T_plan,(double*)data,timings,flags,howmany,tag);
2090  MPI_Barrier(T_plan->comm);
2091  return;
2092  }
2093  timings[0]-=MPI_Wtime();
2094  int nprocs, procid;
2095  nprocs=T_plan->nprocs;
2096  procid=T_plan->procid;
2097  int nprocs_1=T_plan->nprocs_1;
2098  ptrdiff_t *N=T_plan->N;
2099 
2100  double * data_cpu=T_plan->buffer; //pinned
2101  double * send_recv_cpu = T_plan->buffer_2;//pinned
2102  double * send_recv_d = T_plan->buffer_d;
2103 
2104 
2105  ptrdiff_t local_n0=T_plan->local_n0;
2106  ptrdiff_t local_n1=T_plan->local_n1;
2107  ptrdiff_t n_tuples=T_plan->n_tuples;
2108 
2109  int idist=N[1]*local_n0*n_tuples;
2110  int odist=N[0]*local_n1*n_tuples;
2111 
2112  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
2113 
2114  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
2115  if(VERBOSE>=2){
2116  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2117  for(int h=0;h<howmany;h++)
2118  for(int id=0;id<nprocs;++id){
2119  if(procid==id)
2120  for(int i=0;i<local_n0;i++){
2121  std::cout<<std::endl;
2122  for(int j=0;j<N[1];j++){
2123  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2124  }
2125  }
2126  std::cout<<'\n';
2127  MPI_Barrier(T_plan->comm);
2128  }
2129  }
2130  //PCOUT<<" ============================================= "<<std::endl;
2131  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
2132  //PCOUT<<" ============================================= "<<std::endl;
2133  int ptr=0;
2134  shuffle_time-=MPI_Wtime();
2135 
2136 
2137  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2138 #pragma omp parallel for
2139  for(int h=0;h<howmany;h++)
2140  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2141  }
2142  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2143 #pragma omp parallel for
2144  for(int h=0;h<howmany;h++)
2145  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
2146  }
2147  if(nprocs==1){ // Transpose is done!
2148  shuffle_time+=MPI_Wtime();
2149  timings[0]+=MPI_Wtime();
2150  timings[0]+=shuffle_time;
2151  timings[1]+=shuffle_time;
2152  timings[2]+=0;
2153  timings[3]+=0;
2154  MPI_Barrier(T_plan->comm);
2155  return;
2156  }
2157 
2158  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2159  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2160  local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2161 
2162  shuffle_time+=MPI_Wtime();
2163  ptr=0;
2164  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
2165  if(VERBOSE>=2){
2166  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2167  for(int h=0;h<howmany;h++)
2168  for(int id=0;id<nprocs;++id){
2169  if(procid==id)
2170  for(int i=0;i<N[1];i++){
2171  std::cout<<std::endl;
2172  for(int j=0;j<local_n0;j++){
2173  std::cout<<'\t'<<data_cpu[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
2174  ptr+=n_tuples;
2175  }
2176  }
2177  std::cout<<'\n';
2178  MPI_Barrier(T_plan->comm);
2179  }
2180  }
2181 
2182  //PCOUT<<" ============================================= "<<std::endl;
2183  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
2184  //PCOUT<<" ============================================= "<<std::endl;
2185 
2186  int* scount_proc= T_plan->scount_proc;
2187  int* rcount_proc= T_plan->rcount_proc;
2188  int* soffset_proc= T_plan->soffset_proc;
2189  int* roffset_proc= T_plan->roffset_proc;
2190 
2191  MPI_Barrier(T_plan->comm);
2192 
2193  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
2194  comm_time-=MPI_Wtime();
2195 
2196  int soffset=0,roffset=0;
2197  MPI_Status ierr;
2198  MPI_Request * s_request= new MPI_Request[nprocs];
2199  MPI_Request * request= new MPI_Request[nprocs];
2200  int flag[nprocs],color[nprocs];
2201  memset(flag,0,sizeof(int)*nprocs);
2202  memset(color,0,sizeof(int)*nprocs);
2203 #pragma omp parallel for
2204  for (int proc=0;proc<nprocs;++proc){
2205  request[proc]=MPI_REQUEST_NULL;
2206  s_request[proc]=MPI_REQUEST_NULL;
2207  }
2208  double *s_buf, *r_buf;
2209  s_buf=data_cpu; r_buf=send_recv_cpu;
2210  double * r_buf_d;
2211  if(Flags[1]==1)
2212  r_buf_d=data;
2213  else
2214  r_buf_d=send_recv_d;
2215 
2216  // SEND
2217  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2218  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2219 
2220 
2221  for (int proc=0;proc<nprocs;++proc){
2222  if(proc!=procid){
2223  soffset=soffset_proc[proc];
2224  roffset=roffset_proc[proc];
2225  cudaMemcpy(&s_buf[soffset], &send_recv_d[soffset],sizeof(double)*scount_proc[proc], cudaMemcpyDeviceToHost);
2226  MPI_Sendrecv(&s_buf[soffset],scount_proc[proc], MPI_DOUBLE,
2227  proc, tag,
2228  &r_buf[roffset],rcount_proc[proc], MPI_DOUBLE,
2229  proc, tag,
2230  T_plan->comm,&ierr);
2231 
2232  }
2233  else{ // copy your own part directly
2234  //cudaMemcpyAsync(&r_buf_d[roffset_proc[procid]], &send_recv_d[soffset_proc[procid]],sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
2235  // Note that because if Flags[1]=0 the send_d and recv_d are the same, D2D should not be used
2236  // Otherwise it will corrupt the data in the unbalanced cases
2237  cudaMemcpy(&r_buf[roffset_proc[procid]], &send_recv_d[soffset_proc[procid]],sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToHost);
2238  flag[procid]=1;
2239  }
2240  if(Flags[1]==1)
2241  cudaMemcpyAsync(&r_buf_d[roffset_proc[proc]],&r_buf[roffset_proc[proc]],sizeof(double)*rcount_proc[proc],cudaMemcpyHostToDevice);
2242  }
2243  if(Flags[1]==0)
2244  cudaMemcpy(r_buf_d,r_buf,T_plan->alloc_local,cudaMemcpyHostToDevice);
2245  cudaDeviceSynchronize();
2246  comm_time+=MPI_Wtime();
2247 
2248 
2249  ptr=0;
2250  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
2251  if(VERBOSE>=2){
2252  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2253  for(int h=0;h<howmany;h++)
2254  for(int id=0;id<nprocs;++id){
2255  if(procid==id)
2256  for(int i=0;i<local_n1;i++){
2257  std::cout<<std::endl;
2258  for(int j=0;j<N[0];j++){
2259  std::cout<<'\t'<<data_cpu[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
2260  ptr+=n_tuples;
2261  }
2262  }
2263  std::cout<<'\n';
2264  MPI_Barrier(T_plan->comm);
2265  }
2266  }
2267  //PCOUT<<" ============================================= "<<std::endl;
2268  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
2269  //PCOUT<<" ============================================= "<<std::endl;
2270  reshuffle_time-=MPI_Wtime();
2271  ptr=0;
2272  if(Flags[1]==0)
2273  local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2274 
2275 
2276 
2277 
2278  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
2279  if(VERBOSE>=2){
2280  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2281  for(int h=0;h<howmany;h++)
2282  for(int id=0;id<nprocs_1;++id){
2283  if(procid==id)
2284  for(int i=0;i<local_n1;i++){
2285  std::cout<<std::endl;
2286  for(int j=0;j<N[0];j++){
2287  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2288  }
2289  }
2290  std::cout<<'\n';
2291  MPI_Barrier(T_plan->comm);
2292  }
2293  }
2294 
2295  reshuffle_time+=MPI_Wtime();
2296  delete [] request;
2297  delete [] s_request;
2298 
2299 
2300  if(VERBOSE>=1){
2301  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
2302  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
2303  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
2304  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2305  }
2306  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
2307  timings[1]+=shuffle_time;
2308  timings[2]+=comm_time;
2309  timings[3]+=reshuffle_time;
2310  return;
2311 
2312 
2313 }
2314 
2315 
2316 
2317 
2318 void fast_transpose_cuda_v2(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
2319  // This function handles cases where howmany=1 (it is more optimal)
2320 
2321 
2322  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
2323  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
2324  MPI_Barrier(T_plan->comm);
2325  return;
2326  }
2327  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
2328  transpose_cuda_v6(T_plan,(double*)data,timings,flags,howmany,tag);
2329  MPI_Barrier(T_plan->comm);
2330  return;
2331  }
2332  timings[0]-=MPI_Wtime();
2333  int nprocs, procid;
2334  nprocs=T_plan->nprocs;
2335  procid=T_plan->procid;
2336  int nprocs_1=T_plan->nprocs_1;
2337  ptrdiff_t *N=T_plan->N;
2338 
2339  double * data_cpu=T_plan->buffer; //pinned
2340  double * send_recv_cpu = T_plan->buffer_2;//pinned
2341  double * send_recv_d = T_plan->buffer_d;
2342 
2343 
2344  ptrdiff_t local_n0=T_plan->local_n0;
2345  ptrdiff_t local_n1=T_plan->local_n1;
2346  ptrdiff_t n_tuples=T_plan->n_tuples;
2347 
2348  int idist=N[1]*local_n0*n_tuples;
2349  int odist=N[0]*local_n1*n_tuples;
2350 
2351  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
2352 
2353  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
2354  if(VERBOSE>=2){
2355  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2356  for(int h=0;h<howmany;h++)
2357  for(int id=0;id<nprocs;++id){
2358  if(procid==id)
2359  for(int i=0;i<local_n0;i++){
2360  std::cout<<std::endl;
2361  for(int j=0;j<N[1];j++){
2362  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2363  }
2364  }
2365  std::cout<<'\n';
2366  MPI_Barrier(T_plan->comm);
2367  }
2368  }
2369  //PCOUT<<" ============================================= "<<std::endl;
2370  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
2371  //PCOUT<<" ============================================= "<<std::endl;
2372  int ptr=0;
2373  shuffle_time-=MPI_Wtime();
2374 
2375 
2376  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2377 #pragma omp parallel for
2378  for(int h=0;h<howmany;h++)
2379  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2380  }
2381  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2382 #pragma omp parallel for
2383  for(int h=0;h<howmany;h++)
2384  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
2385  }
2386  if(nprocs==1){ // Transpose is done!
2387  shuffle_time+=MPI_Wtime();
2388  timings[0]+=MPI_Wtime();
2389  timings[0]+=shuffle_time;
2390  timings[1]+=shuffle_time;
2391  timings[2]+=0;
2392  timings[3]+=0;
2393  MPI_Barrier(T_plan->comm);
2394  return;
2395  }
2396 
2397  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2398  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2399  local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2400 
2401  shuffle_time+=MPI_Wtime();
2402  ptr=0;
2403  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
2404  if(VERBOSE>=2){
2405  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2406  for(int h=0;h<howmany;h++)
2407  for(int id=0;id<nprocs;++id){
2408  if(procid==id)
2409  for(int i=0;i<N[1];i++){
2410  std::cout<<std::endl;
2411  for(int j=0;j<local_n0;j++){
2412  std::cout<<'\t'<<data_cpu[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
2413  ptr+=n_tuples;
2414  }
2415  }
2416  std::cout<<'\n';
2417  MPI_Barrier(T_plan->comm);
2418  }
2419  }
2420 
2421  //PCOUT<<" ============================================= "<<std::endl;
2422  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
2423  //PCOUT<<" ============================================= "<<std::endl;
2424 
2425  int* scount_proc_f= T_plan->scount_proc_f;
2426  int* rcount_proc_f= T_plan->rcount_proc_f;
2427  int* soffset_proc_f= T_plan->soffset_proc_f;
2428  int* roffset_proc_f= T_plan->roffset_proc_f;
2429 
2430  MPI_Barrier(T_plan->comm);
2431 
2432  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
2433  comm_time-=MPI_Wtime();
2434 
2435  MPI_Request * s_request= new MPI_Request[nprocs];
2436  MPI_Request * request= new MPI_Request[nprocs];
2437 #pragma omp parallel for
2438  for (int proc=0;proc<nprocs;++proc){
2439  request[proc]=MPI_REQUEST_NULL;
2440  s_request[proc]=MPI_REQUEST_NULL;
2441  }
2442  double *s_buf, *r_buf;
2443  s_buf=data_cpu; r_buf=send_recv_cpu;
2444 
2445  // SEND
2446  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2447  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2448  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2449  if(T_plan->is_evenly_distributed==0)
2450  MPI_Alltoallv(s_buf,scount_proc_f,
2451  soffset_proc_f, MPI_DOUBLE,r_buf,
2452  rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
2453  T_plan->comm);
2454  else
2455  MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
2456  r_buf, rcount_proc_f[0], MPI_DOUBLE,
2457  T_plan->comm);
2458 
2459 
2460  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2461  if(Flags[1]==1)
2462  cudaMemcpy(data, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2463  else
2464  cudaMemcpy(send_recv_d, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2465  comm_time+=MPI_Wtime();
2466 
2467  ptr=0;
2468  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
2469  if(VERBOSE>=2){
2470  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2471  for(int h=0;h<howmany;h++)
2472  for(int id=0;id<nprocs;++id){
2473  if(procid==id)
2474  for(int i=0;i<local_n1;i++){
2475  std::cout<<std::endl;
2476  for(int j=0;j<N[0];j++){
2477  std::cout<<'\t'<<data_cpu[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
2478  ptr+=n_tuples;
2479  }
2480  }
2481  std::cout<<'\n';
2482  MPI_Barrier(T_plan->comm);
2483  }
2484  }
2485  //PCOUT<<" ============================================= "<<std::endl;
2486  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
2487  //PCOUT<<" ============================================= "<<std::endl;
2488  reshuffle_time-=MPI_Wtime();
2489  ptr=0;
2490  if(Flags[1]==0)
2491  local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2492 
2493 
2494 
2495 
2496  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
2497  if(VERBOSE>=2){
2498  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2499  for(int h=0;h<howmany;h++)
2500  for(int id=0;id<nprocs_1;++id){
2501  if(procid==id)
2502  for(int i=0;i<local_n1;i++){
2503  std::cout<<std::endl;
2504  for(int j=0;j<N[0];j++){
2505  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2506  }
2507  }
2508  std::cout<<'\n';
2509  MPI_Barrier(T_plan->comm);
2510  }
2511  }
2512 
2513  reshuffle_time+=MPI_Wtime();
2514  delete [] request;
2515  delete [] s_request;
2516 
2517 
2518  if(VERBOSE>=1){
2519  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
2520  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
2521  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
2522  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2523  }
2524  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
2525  timings[1]+=shuffle_time;
2526  timings[2]+=comm_time;
2527  timings[3]+=reshuffle_time;
2528  return;
2529 
2530 }
2531 void fast_transpose_cuda_v3(T_Plan_gpu* T_plan, double * data, double *timings,int kway, unsigned flags, int howmany, int tag ){
2532  // This function handles cases where howmany=1 (it is more optimal)
2533 
2534 
2535  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
2536  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
2537  MPI_Barrier(T_plan->comm);
2538  return;
2539  }
2540  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
2541  transpose_cuda_v5(T_plan,(double*)data,timings,flags,howmany,tag);
2542  MPI_Barrier(T_plan->comm);
2543  return;
2544  }
2545  timings[0]-=MPI_Wtime();
2546  int nprocs, procid;
2547  nprocs=T_plan->nprocs;
2548  procid=T_plan->procid;
2549  int nprocs_1=T_plan->nprocs_1;
2550  ptrdiff_t *N=T_plan->N;
2551 
2552  double * data_cpu=T_plan->buffer; //pinned
2553  double * send_recv_cpu = T_plan->buffer_2;//pinned
2554  double * send_recv_d = T_plan->buffer_d;
2555 
2556 
2557  ptrdiff_t local_n0=T_plan->local_n0;
2558  ptrdiff_t local_n1=T_plan->local_n1;
2559  ptrdiff_t n_tuples=T_plan->n_tuples;
2560 
2561  int idist=N[1]*local_n0*n_tuples;
2562  int odist=N[0]*local_n1*n_tuples;
2563 
2564  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
2565 
2566  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
2567  if(VERBOSE>=2){
2568  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2569  for(int h=0;h<howmany;h++)
2570  for(int id=0;id<nprocs;++id){
2571  if(procid==id)
2572  for(int i=0;i<local_n0;i++){
2573  std::cout<<std::endl;
2574  for(int j=0;j<N[1];j++){
2575  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2576  }
2577  }
2578  std::cout<<'\n';
2579  MPI_Barrier(T_plan->comm);
2580  }
2581  }
2582  //PCOUT<<" ============================================= "<<std::endl;
2583  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
2584  //PCOUT<<" ============================================= "<<std::endl;
2585  int ptr=0;
2586  shuffle_time-=MPI_Wtime();
2587 
2588 
2589  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2590 #pragma omp parallel for
2591  for(int h=0;h<howmany;h++)
2592  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2593  }
2594  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2595 #pragma omp parallel for
2596  for(int h=0;h<howmany;h++)
2597  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
2598  }
2599  if(nprocs==1){ // Transpose is done!
2600  shuffle_time+=MPI_Wtime();
2601  timings[0]+=MPI_Wtime();
2602  timings[0]+=shuffle_time;
2603  timings[1]+=shuffle_time;
2604  timings[2]+=0;
2605  timings[3]+=0;
2606  MPI_Barrier(T_plan->comm);
2607  return;
2608  }
2609 
2610  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2611  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2612  local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2613 
2614  shuffle_time+=MPI_Wtime();
2615  ptr=0;
2616  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
2617  if(VERBOSE>=2){
2618  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2619  for(int h=0;h<howmany;h++)
2620  for(int id=0;id<nprocs;++id){
2621  if(procid==id)
2622  for(int i=0;i<N[1];i++){
2623  std::cout<<std::endl;
2624  for(int j=0;j<local_n0;j++){
2625  std::cout<<'\t'<<data_cpu[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
2626  ptr+=n_tuples;
2627  }
2628  }
2629  std::cout<<'\n';
2630  MPI_Barrier(T_plan->comm);
2631  }
2632  }
2633 
2634  //PCOUT<<" ============================================= "<<std::endl;
2635  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
2636  //PCOUT<<" ============================================= "<<std::endl;
2637 
2638  int* scount_proc_f= T_plan->scount_proc_f;
2639  int* rcount_proc_f= T_plan->rcount_proc_f;
2640  int* soffset_proc_f= T_plan->soffset_proc_f;
2641  int* roffset_proc_f= T_plan->roffset_proc_f;
2642 
2643  MPI_Barrier(T_plan->comm);
2644 
2645  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
2646  comm_time-=MPI_Wtime();
2647 
2648  MPI_Request * s_request= new MPI_Request[nprocs];
2649  MPI_Request * request= new MPI_Request[nprocs];
2650 #pragma omp parallel for
2651  for (int proc=0;proc<nprocs;++proc){
2652  request[proc]=MPI_REQUEST_NULL;
2653  s_request[proc]=MPI_REQUEST_NULL;
2654  }
2655  double *s_buf, *r_buf;
2656  s_buf=data_cpu; r_buf=send_recv_cpu;
2657 
2658  // SEND
2659  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2660  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2661  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2662  if(T_plan->kway_async)
2663  par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
2664  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
2665  else
2666  par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
2667  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
2668 
2669  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2670  if(Flags[1]==1)
2671  cudaMemcpy(data, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2672  else
2673  cudaMemcpy(send_recv_d, r_buf, T_plan->alloc_local, cudaMemcpyHostToDevice);
2674  comm_time+=MPI_Wtime();
2675 
2676  ptr=0;
2677  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
2678  if(VERBOSE>=2){
2679  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2680  for(int h=0;h<howmany;h++)
2681  for(int id=0;id<nprocs;++id){
2682  if(procid==id)
2683  for(int i=0;i<local_n1;i++){
2684  std::cout<<std::endl;
2685  for(int j=0;j<N[0];j++){
2686  std::cout<<'\t'<<data_cpu[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
2687  ptr+=n_tuples;
2688  }
2689  }
2690  std::cout<<'\n';
2691  MPI_Barrier(T_plan->comm);
2692  }
2693  }
2694  //PCOUT<<" ============================================= "<<std::endl;
2695  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
2696  //PCOUT<<" ============================================= "<<std::endl;
2697  reshuffle_time-=MPI_Wtime();
2698  ptr=0;
2699  if(Flags[1]==0)
2700  local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2701 
2702 
2703 
2704 
2705  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
2706  if(VERBOSE>=2){
2707  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2708  for(int h=0;h<howmany;h++)
2709  for(int id=0;id<nprocs_1;++id){
2710  if(procid==id)
2711  for(int i=0;i<local_n1;i++){
2712  std::cout<<std::endl;
2713  for(int j=0;j<N[0];j++){
2714  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2715  }
2716  }
2717  std::cout<<'\n';
2718  MPI_Barrier(T_plan->comm);
2719  }
2720  }
2721 
2722  reshuffle_time+=MPI_Wtime();
2723  delete [] request;
2724  delete [] s_request;
2725 
2726 
2727  if(VERBOSE>=1){
2728  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
2729  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
2730  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
2731  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2732  }
2733  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
2734  timings[1]+=shuffle_time;
2735  timings[2]+=comm_time;
2736  timings[3]+=reshuffle_time;
2737  return;
2738 
2739 }
2740 
2741 void fast_transpose_cuda_v3_2(T_Plan_gpu* T_plan, double * data, double *timings,int kway, unsigned flags, int howmany, int tag ){
2742  // This function handles cases where howmany=1 (it is more optimal)
2743 
2744 
2745  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
2746  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
2747  MPI_Barrier(T_plan->comm);
2748  return;
2749  }
2750  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
2751  transpose_cuda_v7_2(T_plan,(double*)data,timings,kway,flags,howmany, tag);
2752  MPI_Barrier(T_plan->comm);
2753  return;
2754  }
2755  if(Flags[1]==0){// because in the communicator part it will need another gpu array.
2756  fast_transpose_cuda_v3(T_plan,(double*)data,timings,kway,flags,howmany, tag);
2757  MPI_Barrier(T_plan->comm);
2758  return;
2759  }
2760  timings[0]-=MPI_Wtime();
2761  int nprocs, procid;
2762  nprocs=T_plan->nprocs;
2763  procid=T_plan->procid;
2764  int nprocs_1=T_plan->nprocs_1;
2765  ptrdiff_t *N=T_plan->N;
2766 
2767  double * data_cpu=T_plan->buffer; //pinned
2768  //double * send_recv_cpu = T_plan->buffer;//pinned
2769  double * send_recv_d = T_plan->buffer_d;
2770 
2771 
2772  ptrdiff_t local_n0=T_plan->local_n0;
2773  ptrdiff_t local_n1=T_plan->local_n1;
2774  ptrdiff_t n_tuples=T_plan->n_tuples;
2775 
2776  int idist=N[1]*local_n0*n_tuples;
2777  int odist=N[0]*local_n1*n_tuples;
2778 
2779  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
2780 
2781  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
2782  if(VERBOSE>=2){
2783  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2784  for(int h=0;h<howmany;h++)
2785  for(int id=0;id<nprocs;++id){
2786  if(procid==id)
2787  for(int i=0;i<local_n0;i++){
2788  std::cout<<std::endl;
2789  for(int j=0;j<N[1];j++){
2790  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
2791  }
2792  }
2793  std::cout<<'\n';
2794  MPI_Barrier(T_plan->comm);
2795  }
2796  }
2797  //PCOUT<<" ============================================= "<<std::endl;
2798  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
2799  //PCOUT<<" ============================================= "<<std::endl;
2800  int ptr=0;
2801  shuffle_time-=MPI_Wtime();
2802 
2803 
2804  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2805 #pragma omp parallel for
2806  for(int h=0;h<howmany;h++)
2807  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
2808  }
2809  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2810 #pragma omp parallel for
2811  for(int h=0;h<howmany;h++)
2812  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
2813  }
2814  if(nprocs==1){ // Transpose is done!
2815  shuffle_time+=MPI_Wtime();
2816  timings[0]+=MPI_Wtime();
2817  timings[0]+=shuffle_time;
2818  timings[1]+=shuffle_time;
2819  timings[2]+=0;
2820  timings[3]+=0;
2821  MPI_Barrier(T_plan->comm);
2822  return;
2823  }
2824 
2825  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2826  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2827  local_transpose_col_cuda(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv_d );
2828 
2829  shuffle_time+=MPI_Wtime();
2830  ptr=0;
2831  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
2832  if(VERBOSE>=2){
2833  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2834  for(int h=0;h<howmany;h++)
2835  for(int id=0;id<nprocs;++id){
2836  if(procid==id)
2837  for(int i=0;i<N[1];i++){
2838  std::cout<<std::endl;
2839  for(int j=0;j<local_n0;j++){
2840  std::cout<<'\t'<<data_cpu[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
2841  ptr+=n_tuples;
2842  }
2843  }
2844  std::cout<<'\n';
2845  MPI_Barrier(T_plan->comm);
2846  }
2847  }
2848 
2849  //PCOUT<<" ============================================= "<<std::endl;
2850  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
2851  //PCOUT<<" ============================================= "<<std::endl;
2852 
2853  int* scount_proc_f= T_plan->scount_proc_f;
2854  int* rcount_proc_f= T_plan->rcount_proc_f;
2855  int* soffset_proc_f= T_plan->soffset_proc_f;
2856  int* roffset_proc_f= T_plan->roffset_proc_f;
2857 
2858  MPI_Barrier(T_plan->comm);
2859 
2860  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
2861  comm_time-=MPI_Wtime();
2862 
2863  MPI_Request * s_request= new MPI_Request[nprocs];
2864  MPI_Request * request= new MPI_Request[nprocs];
2865 #pragma omp parallel for
2866  for (int proc=0;proc<nprocs;++proc){
2867  request[proc]=MPI_REQUEST_NULL;
2868  s_request[proc]=MPI_REQUEST_NULL;
2869  }
2870  double * r_buf_d;
2871  // if Flags[1]==0 the other function should have been called and returned already.
2872  if(Flags[1]==1)
2873  r_buf_d=data;
2874  else
2875  r_buf_d=send_recv_d;
2876 
2877  // SEND
2878  // Flags[1]=1 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --memcpy--> data_d
2879  // Flags[1]=0 data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
2880 
2881  if(T_plan->kway_async)
2882  par::Mpi_Alltoallv_dense_gpu<double,true>(send_recv_d , scount_proc_f, soffset_proc_f,
2883  r_buf_d, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
2884  else
2885  par::Mpi_Alltoallv_dense_gpu<double,false>(send_recv_d , scount_proc_f, soffset_proc_f,
2886  r_buf_d, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
2887 
2888 
2889 
2890  comm_time+=MPI_Wtime();
2891 
2892  ptr=0;
2893  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
2894  if(VERBOSE>=2){
2895  cudaMemcpy(data_cpu,send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2896  for(int h=0;h<howmany;h++)
2897  for(int id=0;id<nprocs;++id){
2898  if(procid==id)
2899  for(int i=0;i<local_n1;i++){
2900  std::cout<<std::endl;
2901  for(int j=0;j<N[0];j++){
2902  std::cout<<'\t'<<data_cpu[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
2903  ptr+=n_tuples;
2904  }
2905  }
2906  std::cout<<'\n';
2907  MPI_Barrier(T_plan->comm);
2908  }
2909  }
2910  //PCOUT<<" ============================================= "<<std::endl;
2911  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
2912  //PCOUT<<" ============================================= "<<std::endl;
2913  reshuffle_time-=MPI_Wtime();
2914  ptr=0;
2915  if(Flags[1]==0)
2916  local_transpose_cuda(N[0],local_n1,n_tuples,send_recv_d,data );
2917 
2918 
2919 
2920 
2921  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
2922  if(VERBOSE>=2){
2923  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
2924  for(int h=0;h<howmany;h++)
2925  for(int id=0;id<nprocs_1;++id){
2926  if(procid==id)
2927  for(int i=0;i<local_n1;i++){
2928  std::cout<<std::endl;
2929  for(int j=0;j<N[0];j++){
2930  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
2931  }
2932  }
2933  std::cout<<'\n';
2934  MPI_Barrier(T_plan->comm);
2935  }
2936  }
2937 
2938  reshuffle_time+=MPI_Wtime();
2939  delete [] request;
2940  delete [] s_request;
2941 
2942 
2943  if(VERBOSE>=1){
2944  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
2945  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
2946  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
2947  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2948  }
2949  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
2950  timings[1]+=shuffle_time;
2951  timings[2]+=comm_time;
2952  timings[3]+=reshuffle_time;
2953  return;
2954 
2955 }
2956 
2957 
2958 
2959 
void fast_transpose_cuda_v2_h(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
  // Batched ("_h", howmany>1) global transpose of the block-distributed
  // N0 x N1 x n_tuples GPU array `data`, staged through pinned host buffers.
  // Pipeline (see the flow diagrams inline below):
  //   data_d --memcpy_v1_h1--> send_recv_d --D2H--> data_cpu
  //     --MPI_Alltoall(v)--> send_recv_cpu --memcpy_v1_h2 (H2D)--> data_d
  // timings[0..3] accumulate {total, shuffle/packing, alltoall, reshuffle}
  // seconds. NOTE(review): `tag` is only forwarded to transpose_cuda_v6 on
  // the Transposed_In fallback path; it is unused otherwise.

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If nprocs==1 and Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
    transpose_cuda_v6(T_plan,(double*)data,timings,flags,howmany,tag);
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;

  double * data_cpu=T_plan->buffer; //pinned
  double * send_recv_cpu = T_plan->buffer_2;//pinned
  double * send_recv_d = T_plan->buffer_d;

  // Strides (in doubles) of one howmany-slab in the input/output layouts.
  int idist=N[1]*local_n0*n_tuples;
  int odist=N[0]*local_n1*n_tuples;

  // NOTE(review): total_time is never used below.
  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  int ptr=0;
  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              ptr=h*idist+(i*N[1]+j)*n_tuples;
              std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  ptr=0;
  // Per-process row/column counts and start offsets of the 1D decomposition.
  ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
  ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
  ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
  ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
  shuffle_time-=MPI_Wtime();
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
  }
  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
  }
  if(nprocs==1){ // Transpose is done!
    shuffle_time+=MPI_Wtime();
    timings[0]+=MPI_Wtime();
    timings[0]+=shuffle_time;
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    MPI_Barrier(T_plan->comm);
    return;
  }

  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
  ptr=0;

  // Disabled reference implementation of the device-side packing performed
  // by memcpy_v1_h1 below: for each destination proc, gather that proc's
  // column block of every (h,i) row contiguously into send_recv_d.
  if(0)
    for (int proc=0;proc<nprocs_1;++proc)
      for(int h=0;h<howmany;++h){
        for(int i=0;i<local_n0;++i){
          //for(int j=local_1_start_proc[proc];j<local_1_start_proc[proc]+local_n1_proc[proc];++j){
          // memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+j)*n_tuples],sizeof(double)*n_tuples);
          // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
          // ptr+=n_tuples;
          //}
          //memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],sizeof(double)*n_tuples*local_n1_proc[proc]);
          cudaMemcpy(&send_recv_d[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples] , sizeof(double)*n_tuples*local_n1_proc[proc] , cudaMemcpyDeviceToDevice);
          ptr+=n_tuples*local_n1_proc[proc];
        }
      }
  // Device-side packing kernel (replaces the if(0) loop above).
  memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
  // Packing must be complete before timing it and before the D2H copy below.
  cudaDeviceSynchronize();

  shuffle_time+=MPI_Wtime();
  ptr=0;
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int id=0;id<nprocs;++id){
      for(int h=0;h<howmany;h++)
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
              ptr+=n_tuples;
            }
          }
      std::cout<<'\n';
      MPI_Barrier(T_plan->comm);
    }
  }


  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  // Precomputed "full" (howmany-scaled) send/recv counts and displacements.
  int* scount_proc_f= T_plan->scount_proc_f;
  int* rcount_proc_f= T_plan->rcount_proc_f;
  int* soffset_proc_f= T_plan->soffset_proc_f;
  int* roffset_proc_f= T_plan->roffset_proc_f;

  MPI_Barrier(T_plan->comm);

  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
  comm_time-=MPI_Wtime();


  double *s_buf, *r_buf;
  s_buf=data_cpu; r_buf=send_recv_cpu;
  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d

  // SEND
  // Stage the packed device buffer into pinned host memory for MPI.
  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
  // Uniform decomposition can use the cheaper fixed-count MPI_Alltoall.
  if(T_plan->is_evenly_distributed==0)
    MPI_Alltoallv(s_buf,scount_proc_f,
        soffset_proc_f, MPI_DOUBLE,r_buf,
        rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
        T_plan->comm);
  else
    MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
        r_buf, rcount_proc_f[0], MPI_DOUBLE,
        T_plan->comm);


  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
  //cudaMemcpy(send_recv_d, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
  comm_time+=MPI_Wtime();


  ptr=0;
  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int id=0;id<nprocs;++id){
      if(procid==id)
        for(int h=0;h<howmany;h++)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
              ptr+=n_tuples;
            }
          }
      std::cout<<'\n';
      MPI_Barrier(T_plan->comm);
    }
  }
  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
  reshuffle_time-=MPI_Wtime();
  ptr=0;
  // Disabled reference implementation of the unpack performed by
  // memcpy_v1_h2 below: scatter each source proc's rows from the pinned
  // receive buffer back into `data` on the device.
  if(0)
    for (int proc=0;proc<nprocs_0;++proc)
      for(int h=0;h<howmany;++h){
        for(int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
          //memcpy(&data_cpu[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples);
          cudaMemcpy( &data[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples,cudaMemcpyHostToDevice);
          //std::cout<<"proc= "<<proc<<" h= "<<h<<" i=("<<i<<") data_ptr= "<<h*odist+(i*local_n1)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*local_n1_proc[proc] <<std::endl;
          ptr+=n_tuples*local_n1;
          //for(int j=0*local_1_start_proc[proc];j<0*local_1_start_proc[proc]+local_n1;++j){
          // memcpy(&data[h*odist+(i*local_n1+j)*n_tuples],&send_recv[ptr],sizeof(double)*n_tuples);
          //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
          // ptr+=n_tuples;
          //}
        }
      }
  // Unpack helper (replaces the if(0) loop above); host->device transfer.
  memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_cpu);
  // Ensure the unpack is complete before the optional final transpose.
  cudaDeviceSynchronize();

  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int id=0;id<nprocs_1;++id){
      if(procid==id)
        for(int h=0;h<howmany;h++)
          for(int i=0;i<N[0];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n1;j++){
              ptr=h*odist+(i*local_n1+j)*n_tuples;
              std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
            }
          }
      std::cout<<'\n';
      MPI_Barrier(T_plan->comm);
    }
  }
  // Right now the data is in transposed out format.
  // If the user did not want this layout, transpose again.
  if(Flags[1]==0){
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );

    if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
    if(VERBOSE>=2){
      cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int h=0;h<howmany;h++)
            for(int i=0;i<N[0];i++){
              std::cout<<std::endl;
              for(int j=0;j<local_n1;j++){
                ptr=h*odist+(i*local_n1+j)*n_tuples;
                std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
              }
            }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
    }
  }

  reshuffle_time+=MPI_Wtime();
  MPI_Barrier(T_plan->comm);


  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
3220 
3221 void fast_transpose_cuda_v3_h(T_Plan_gpu* T_plan, double * data, double *timings,int kway, unsigned flags, int howmany , int tag){
3222 
3223  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
3224  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If nprocs==1 and Flags==Transposed_Out return
3225  MPI_Barrier(T_plan->comm);
3226  return;
3227  }
3228  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
3229  transpose_cuda_v6(T_plan,(double*)data,timings,flags,howmany,tag);
3230  MPI_Barrier(T_plan->comm);
3231  return;
3232  }
3233  timings[0]-=MPI_Wtime();
3234  int nprocs, procid;
3235  int nprocs_0, nprocs_1;
3236  nprocs=T_plan->nprocs;
3237  procid=T_plan->procid;
3238  nprocs_0=T_plan->nprocs_0;
3239  nprocs_1=T_plan->nprocs_1;
3240  ptrdiff_t *N=T_plan->N;
3241  ptrdiff_t local_n0=T_plan->local_n0;
3242  ptrdiff_t local_n1=T_plan->local_n1;
3243  ptrdiff_t n_tuples=T_plan->n_tuples;
3244 
3245  double * data_cpu=T_plan->buffer; //pinned
3246  double * send_recv_cpu = T_plan->buffer_2;//pinned
3247  double * send_recv_d = T_plan->buffer_d;
3248 
3249  int idist=N[1]*local_n0*n_tuples;
3250  int odist=N[0]*local_n1*n_tuples;
3251 
3252  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
3253 
3254  int ptr=0;
3255  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
3256  if(VERBOSE>=2){
3257  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3258  for(int h=0;h<howmany;h++)
3259  for(int id=0;id<nprocs;++id){
3260  if(procid==id)
3261  for(int i=0;i<local_n0;i++){
3262  std::cout<<std::endl;
3263  for(int j=0;j<N[1];j++){
3264  ptr=h*idist+(i*N[1]+j)*n_tuples;
3265  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
3266  }
3267  }
3268  std::cout<<'\n';
3269  MPI_Barrier(T_plan->comm);
3270  }
3271  }
3272  //PCOUT<<" ============================================= "<<std::endl;
3273  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
3274  //PCOUT<<" ============================================= "<<std::endl;
3275  ptr=0;
3276  ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
3277  ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
3278  ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
3279  ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
3280  shuffle_time-=MPI_Wtime();
3281  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
3282 #pragma omp parallel for
3283  for(int h=0;h<howmany;h++)
3284  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
3285  }
3286  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
3287 #pragma omp parallel for
3288  for(int h=0;h<howmany;h++)
3289  local_transpose_cuda(N[0],N[1],n_tuples,&data[h*idist] );
3290  }
3291  if(nprocs==1){ // Transpose is done!
3292  shuffle_time+=MPI_Wtime();
3293  timings[0]+=MPI_Wtime();
3294  timings[0]+=shuffle_time;
3295  timings[1]+=shuffle_time;
3296  timings[2]+=0;
3297  timings[3]+=0;
3298  MPI_Barrier(T_plan->comm);
3299  return;
3300  }
3301 
3302  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
3303  ptr=0;
3304 
3305  if(0)
3306  for (int proc=0;proc<nprocs_1;++proc)
3307  for(int h=0;h<howmany;++h){
3308  for(int i=0;i<local_n0;++i){
3309  //for(int j=local_1_start_proc[proc];j<local_1_start_proc[proc]+local_n1_proc[proc];++j){
3310  // memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+j)*n_tuples],sizeof(double)*n_tuples);
3311  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
3312  // ptr+=n_tuples;
3313  //}
3314  //memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],sizeof(double)*n_tuples*local_n1_proc[proc]);
3315  cudaMemcpy(&send_recv_d[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples] , sizeof(double)*n_tuples*local_n1_proc[proc] , cudaMemcpyDeviceToDevice);
3316  ptr+=n_tuples*local_n1_proc[proc];
3317  }
3318  }
3319  memcpy_v1_h1(nprocs_1,howmany,local_n0,n_tuples,local_n1_proc,send_recv_d,data,idist,N[1],local_1_start_proc);
3320  cudaDeviceSynchronize();
3321 
3322  shuffle_time+=MPI_Wtime();
3323  ptr=0;
3324  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
3325  if(VERBOSE>=2){
3326  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3327  for(int id=0;id<nprocs;++id){
3328  for(int h=0;h<howmany;h++)
3329  if(procid==id)
3330  for(int i=0;i<N[1];i++){
3331  std::cout<<std::endl;
3332  for(int j=0;j<local_n0;j++){
3333  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
3334  ptr+=n_tuples;
3335  }
3336  }
3337  std::cout<<'\n';
3338  MPI_Barrier(T_plan->comm);
3339  }
3340  }
3341 
3342 
3343  //PCOUT<<" ============================================= "<<std::endl;
3344  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
3345  //PCOUT<<" ============================================= "<<std::endl;
3346 
3347  int* scount_proc_f= T_plan->scount_proc_f;
3348  int* rcount_proc_f= T_plan->rcount_proc_f;
3349  int* soffset_proc_f= T_plan->soffset_proc_f;
3350  int* roffset_proc_f= T_plan->roffset_proc_f;
3351 
3352  MPI_Barrier(T_plan->comm);
3353 
3354  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
3355  comm_time-=MPI_Wtime();
3356 
3357  MPI_Request * s_request= new MPI_Request[nprocs];
3358  MPI_Request * request= new MPI_Request[nprocs];
3359 #pragma omp parallel for
3360  for (int proc=0;proc<nprocs;++proc){
3361  request[proc]=MPI_REQUEST_NULL;
3362  s_request[proc]=MPI_REQUEST_NULL;
3363  }
3364 
3365  double *s_buf, *r_buf;
3366  s_buf=data_cpu; r_buf=send_recv_cpu;
3367  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
3368 
3369  // SEND
3370  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3371  if(T_plan->kway_async)
3372  par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
3373  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
3374  else
3375  par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
3376  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
3377 
3378  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
3379  //cudaMemcpy(send_recv_d, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
3380  comm_time+=MPI_Wtime();
3381 
3382 
3383  ptr=0;
3384  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
3385  if(VERBOSE>=2){
3386  cudaMemcpy(data_cpu, send_recv_d, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3387  for(int id=0;id<nprocs;++id){
3388  if(procid==id)
3389  for(int h=0;h<howmany;h++)
3390  for(int i=0;i<local_n1;i++){
3391  std::cout<<std::endl;
3392  for(int j=0;j<N[0];j++){
3393  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
3394  ptr+=n_tuples;
3395  }
3396  }
3397  std::cout<<'\n';
3398  MPI_Barrier(T_plan->comm);
3399  }
3400  }
3401  //PCOUT<<" ============================================= "<<std::endl;
3402  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
3403  //PCOUT<<" ============================================= "<<std::endl;
3404  // data_d --Th--> send_recv_d --memcpy--> data_cpu --alltoall--> send_recv_cpu --Th--> data_d
3405  reshuffle_time-=MPI_Wtime();
3406  ptr=0;
3407  if(0)
3408  for (int proc=0;proc<nprocs_0;++proc)
3409  for(int h=0;h<howmany;++h){
3410  for(int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
3411  //memcpy(&data_cpu[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples);
3412  cudaMemcpy( &data[h*odist+(i*local_n1)*n_tuples],&send_recv_cpu[ptr],local_n1*sizeof(double)*n_tuples,cudaMemcpyHostToDevice);
3413  //std::cout<<"proc= "<<proc<<" h= "<<h<<" i=("<<i<<") data_ptr= "<<h*odist+(i*local_n1)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*local_n1_proc[proc] <<std::endl;
3414  ptr+=n_tuples*local_n1;
3415  //for(int j=0*local_1_start_proc[proc];j<0*local_1_start_proc[proc]+local_n1;++j){
3416  // memcpy(&data[h*odist+(i*local_n1+j)*n_tuples],&send_recv[ptr],sizeof(double)*n_tuples);
3417  //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
3418  // ptr+=n_tuples;
3419  //}
3420  }
3421  }
3422  memcpy_v1_h2(nprocs_0,howmany,local_0_start_proc,local_n0_proc,data,odist,local_n1,n_tuples,send_recv_cpu);
3423  cudaDeviceSynchronize();
3424 
3425  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
3426  if(VERBOSE>=2){
3427  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3428  for(int id=0;id<nprocs_1;++id){
3429  if(procid==id)
3430  for(int h=0;h<howmany;h++)
3431  for(int i=0;i<N[0];i++){
3432  std::cout<<std::endl;
3433  for(int j=0;j<local_n1;j++){
3434  ptr=h*odist+(i*local_n1+j)*n_tuples;
3435  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
3436  }
3437  }
3438  std::cout<<'\n';
3439  MPI_Barrier(T_plan->comm);
3440  }
3441  }
3442  // Right now the data is in transposed out format.
3443  // If the user did not want this layout, transpose again.
3444  if(Flags[1]==0){
3445 #pragma omp parallel for
3446  for(int h=0;h<howmany;h++)
3447  local_transpose_cuda(N[0],local_n1,n_tuples,&data[h*odist] );
3448 
3449  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
3450  if(VERBOSE>=2){
3451  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3452  for(int id=0;id<nprocs_1;++id){
3453  if(procid==id)
3454  for(int h=0;h<howmany;h++)
3455  for(int i=0;i<N[0];i++){
3456  std::cout<<std::endl;
3457  for(int j=0;j<local_n1;j++){
3458  ptr=h*odist+(i*local_n1+j)*n_tuples;
3459  std::cout<<'\t'<<data_cpu[ptr]<<","<<data_cpu[ptr+1];
3460  }
3461  }
3462  std::cout<<'\n';
3463  MPI_Barrier(T_plan->comm);
3464  }
3465  }
3466  }
3467 
3468  reshuffle_time+=MPI_Wtime();
3469  MPI_Barrier(T_plan->comm);
3470  delete [] request;
3471  delete [] s_request;
3472 
3473 
3474  if(VERBOSE>=1){
3475  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
3476  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
3477  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
3478  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
3479  }
3480  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
3481  timings[1]+=shuffle_time;
3482  timings[2]+=comm_time;
3483  timings[3]+=reshuffle_time;
3484  return;
3485 }
3486 
// transpose_cuda_v5: global transpose of a block-row-distributed 2D array of
// n_tuples-sized elements, for GPU-resident data, using nonblocking
// point-to-point MPI staged through host buffers.
//
// Strategy (v5): one bulk cudaMemcpy D2H of the whole local slab, then
// MPI_Irecv/MPI_Isend with the plan's derived datatypes (stype/rtype), the
// self-block copied with memcpy, then one bulk H2D copy and a device-side
// reshuffle transpose.
//
// Parameters:
//   T_plan  - transpose plan (communicator, counts/offsets, datatypes,
//             host buffers `buffer`/`buffer_2`, device buffer `buffer_d`).
//   data    - DEVICE pointer to input/output; overwritten with the transpose.
//   timings - accumulators: [0] total, [1] shuffle, [2] comm, [3] reshuffle.
//   flags   - bit 0: input already locally transposed; bit 1: leave output
//             in transposed layout.
//   howmany - number of independent arrays (batch), each idist/odist apart.
//   tag     - MPI tag used to match this transpose's messages.
void transpose_cuda_v5(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  double* data_cpu=T_plan->buffer;          // host staging buffer for sends
  double * send_recv_cpu = T_plan->buffer_2; // host staging buffer for receives
  double * send_recv = T_plan->buffer_d;     // device scratch buffer
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;
  int idist=N[1]*local_n0*n_tuples;  // stride between batched input slabs
  int odist=N[0]*local_n1*n_tuples;  // stride between batched output slabs

  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  // ====================== Local Transpose =======================
  shuffle_time-=MPI_Wtime();

  if(Flags[0]==0){
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
  }
  cudaDeviceSynchronize();

  shuffle_time+=MPI_Wtime();
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }

  // Single-process case: no communication needed; optionally undo/keep
  // the transposed layout and return.
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
  }
  if(nprocs==1){ // Transpose is done!
    timings[0]+=MPI_Wtime();
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    return;
  }



  // ====================== MPI all-to-all exchange =======================
  int* scount_proc= T_plan->scount_proc;
  //int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;

  comm_time-=MPI_Wtime();

  int soffset=0,roffset=0;
  MPI_Status ierr;
  MPI_Request request[nprocs], s_request[nprocs];
#pragma omp parallel for
  for (int proc=0;proc<nprocs;++proc){
    request[proc]=MPI_REQUEST_NULL;
    s_request[proc]=MPI_REQUEST_NULL;
  }

  MPI_Datatype *stype=T_plan->stype;
  MPI_Datatype *rtype=T_plan->rtype;
  MPI_Barrier(T_plan->comm);

  // Post Receives first so matching sends can complete eagerly.
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      roffset=roffset_proc[proc];
      MPI_Irecv(&send_recv_cpu[roffset],1, rtype[proc], proc,
                tag, T_plan->comm, &request[proc]);
    }
  }
  // SEND: stage the whole device slab to the host, then post sends.
  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      soffset=soffset_proc[proc];
      MPI_Isend(&data_cpu[soffset],1, stype[proc],proc, tag,
                T_plan->comm, &s_request[proc]);
    }
  }

  // Copy the block destined for this process directly (no MPI round-trip).
  soffset=soffset_proc[procid];//aoffset_proc[proc];SNAFU //proc*count_proc[proc];
  roffset=roffset_proc[procid];
  for(int h=0;h<howmany;h++)
    memcpy(&send_recv_cpu[h*odist+roffset],&data_cpu[h*idist+soffset],sizeof(double)*scount_proc[procid]);
  // Complete all receives and sends (self entries are MPI_REQUEST_NULL,
  // so waiting on them is a no-op).
  for (int proc=0;proc<nprocs;++proc){
    MPI_Wait(&request[proc], &ierr);
    MPI_Wait(&s_request[proc], &ierr);
  }

  cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  comm_time+=MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2){
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }



  // ====================== 2nd local transpose (reshuffle) ==================

  reshuffle_time-=MPI_Wtime();

  // int first_local_n0, last_local_n0;
  // first_local_n0=local_n0_proc[0]; last_local_n0=local_n0_proc[nprocs_1-1];

  //local_transpose(nprocs_1,local_n1,n_tuples*local_n0,send_recv,data );

  // The last process may own a smaller block; detect whether all received
  // blocks are equally sized to pick the cheaper transpose path.
  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
  if(local_n1!=0)
    last_ntuples=T_plan->last_recv_count/((int)local_n1);

  for(int h=0;h<howmany;h++){
    if(local_n1==1)
      cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice); // you are done, no additional transpose is needed.
    else if(last_ntuples!=first_ntuples){
      // Uneven distribution: transpose the equal-sized prefix, then merge in
      // the differently-sized last block.
      local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
      local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
    }
    else if(last_ntuples==first_ntuples){
      //local_transpose_cuda(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
      local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
    }
  }
  cudaDeviceSynchronize();



  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  if(Flags[1]==1){ // Transpose output
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
  }
  reshuffle_time+=MPI_Wtime();
  MPI_Barrier(T_plan->comm);
  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
3709 
3710 void transpose_cuda_v5_2(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
3711 
3712  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
3713  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
3714  MPI_Barrier(T_plan->comm);
3715  return;
3716  }
3717  timings[0]-=MPI_Wtime();
3718  int nprocs, procid;
3719  int nprocs_0, nprocs_1;
3720  nprocs=T_plan->nprocs;
3721  procid=T_plan->procid;
3722  nprocs_0=T_plan->nprocs_0;
3723  nprocs_1=T_plan->nprocs_1;
3724  ptrdiff_t *N=T_plan->N;
3725  double* data_cpu=T_plan->buffer;
3726  double * send_recv_cpu = T_plan->buffer_2;
3727  double * send_recv = T_plan->buffer_d;
3728  ptrdiff_t local_n0=T_plan->local_n0;
3729  ptrdiff_t local_n1=T_plan->local_n1;
3730  ptrdiff_t n_tuples=T_plan->n_tuples;
3731  int idist=N[1]*local_n0*n_tuples;
3732  int odist=N[0]*local_n1*n_tuples;
3733 
3734  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
3735 
3736  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
3737  if(VERBOSE>=2){
3738  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3739  for(int h=0;h<howmany;h++)
3740  for(int id=0;id<nprocs;++id){
3741  if(procid==id)
3742  for(int i=0;i<local_n0;i++){
3743  std::cout<<std::endl;
3744  for(int j=0;j<N[1];j++){
3745  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
3746  }
3747  }
3748  std::cout<<'\n';
3749  MPI_Barrier(T_plan->comm);
3750  }
3751  }
3752  //PCOUT<<" ============================================= "<<std::endl;
3753  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
3754  //PCOUT<<" ============================================= "<<std::endl;
3755  shuffle_time-=MPI_Wtime();
3756 
3757  if(Flags[0]==0){
3758  for(int h=0;h<howmany;h++)
3759  local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
3760  }
3761  cudaDeviceSynchronize();
3762 
3763  shuffle_time+=MPI_Wtime();
3764  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
3765  if(VERBOSE>=2){
3766  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3767  for(int h=0;h<howmany;h++)
3768  for(int id=0;id<nprocs;++id){
3769  if(procid==id)
3770  for(int i=0;i<N[1];i++){
3771  std::cout<<std::endl;
3772  for(int j=0;j<local_n0;j++){
3773  std::cout<<'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
3774  }
3775  }
3776  std::cout<<'\n';
3777  MPI_Barrier(T_plan->comm);
3778  }
3779  }
3780  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
3781  for(int h=0;h<howmany;h++)
3782  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
3783  }
3784  if(nprocs==1){ // Transpose is done!
3785  timings[0]+=MPI_Wtime();
3786  timings[1]+=shuffle_time;
3787  timings[2]+=0;
3788  timings[3]+=0;
3789  return;
3790  }
3791 
3792 
3793  //PCOUT<<" ============================================= "<<std::endl;
3794  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
3795  //PCOUT<<" ============================================= "<<std::endl;
3796  int* scount_proc= T_plan->scount_proc;
3797  int* rcount_proc= T_plan->rcount_proc;
3798  int* soffset_proc= T_plan->soffset_proc;
3799  int* roffset_proc= T_plan->roffset_proc;
3800 
3801  comm_time-=MPI_Wtime();
3802 
3803  int soffset=0,roffset=0;
3804  MPI_Status ierr;
3805  MPI_Request request[nprocs], s_request[nprocs];
3806  int flag[nprocs],color[nprocs];
3807  memset(flag,0,sizeof(int)*nprocs);
3808  memset(color,0,sizeof(int)*nprocs);
3809 #pragma omp parallel for
3810  for (int proc=0;proc<nprocs;++proc){
3811  request[proc]=MPI_REQUEST_NULL;
3812  s_request[proc]=MPI_REQUEST_NULL;
3813  }
3814 
3815  int counter=1;
3816  MPI_Datatype *stype=T_plan->stype;
3817  MPI_Datatype *rtype=T_plan->rtype;
3818  // Post Receives
3819  for (int proc=0;proc<nprocs;++proc){
3820  if(proc!=procid){
3821  roffset=roffset_proc[proc];
3822  MPI_Irecv(&send_recv_cpu[roffset],1, rtype[proc], proc,
3823  tag, T_plan->comm, &request[proc]);
3824  }
3825  }
3826  // SEND
3827  //cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3828  for (int proc=0;proc<nprocs;++proc){
3829  if(proc!=procid){
3830  soffset=soffset_proc[proc];
3831  for(int h=0;h<howmany;h++)
3832  cudaMemcpy(&data_cpu[h*idist+soffset], &data[h*idist+soffset],sizeof(double)*scount_proc[proc], cudaMemcpyDeviceToHost);
3833  MPI_Isend(&data_cpu[soffset],1, stype[proc],proc, tag,
3834  T_plan->comm, &s_request[proc]);
3835 
3836  }
3837  else{ // copy your own part directly
3838  for(int h=0;h<howmany;h++)
3839  cudaMemcpy(&send_recv[h*odist+roffset_proc[procid]], &data[h*idist+soffset_proc[procid]],sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
3840  }
3841  }
3842  while(counter!=nprocs){
3843 
3844  for (int proc=0;proc<nprocs;++proc){
3845  MPI_Test(&request[proc], &flag[proc],&ierr);
3846  if(flag[proc]==1 && color[proc]==0 && proc!=procid){
3847  for(int h=0;h<howmany;h++)
3848  cudaMemcpyAsync(&send_recv[h*odist+roffset_proc[proc]],&send_recv_cpu[h*odist+roffset_proc[proc]],sizeof(double)*rcount_proc[proc],cudaMemcpyHostToDevice);
3849  color[proc]=1;
3850  counter+=1;
3851  }
3852  }
3853 
3854  }
3855  cudaDeviceSynchronize();
3856  //cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
3857  comm_time+=MPI_Wtime();
3858 
3859  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
3860  if(VERBOSE>=2){
3861  cudaMemcpy(send_recv_cpu, send_recv, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3862  for(int h=0;h<howmany;h++)
3863  for(int id=0;id<nprocs;++id){
3864  if(procid==id)
3865  for(int i=0;i<local_n1;i++){
3866  std::cout<<std::endl;
3867  for(int j=0;j<N[0];j++){
3868  std::cout<<'\t'<<send_recv_cpu[odist*h+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
3869  }
3870  }
3871  std::cout<<'\n';
3872  MPI_Barrier(T_plan->comm);
3873  }
3874  }
3875 
3876 
3877 
3878  //PCOUT<<" ============================================= "<<std::endl;
3879  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
3880  //PCOUT<<" ============================================= "<<std::endl;
3881 
3882  reshuffle_time-=MPI_Wtime();
3883 
3884  // int first_local_n0, last_local_n0;
3885  // first_local_n0=local_n0_proc[0]; last_local_n0=local_n0_proc[nprocs_1-1];
3886 
3887  //local_transpose(nprocs_1,local_n1,n_tuples*local_n0,send_recv,data );
3888 
3889  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
3890  if(local_n1!=0)
3891  last_ntuples=T_plan->last_recv_count/((int)local_n1);
3892 
3893  for(int h=0;h<howmany;h++){
3894  if(local_n1==1)
3895  cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice); // you are done, no additional transpose is needed.
3896  else if(last_ntuples!=first_ntuples){
3897  local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
3898  local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
3899  }
3900  else if(last_ntuples==first_ntuples){
3901  //local_transpose_cuda(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
3902  local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
3903  }
3904  }
3905  cudaDeviceSynchronize();
3906 
3907 
3908  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
3909  if(VERBOSE>=2){
3910  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
3911  for(int h=0;h<howmany;h++)
3912  for(int id=0;id<nprocs_1;++id){
3913  if(procid==id)
3914  for(int i=0;i<local_n1;i++){
3915  std::cout<<std::endl;
3916  for(int j=0;j<N[0];j++){
3917  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
3918  }
3919  }
3920  std::cout<<'\n';
3921  MPI_Barrier(T_plan->comm);
3922  }
3923  }
3924  if(Flags[1]==1){ // Transpose output
3925  for(int h=0;h<howmany;h++)
3926  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
3927  }
3928  reshuffle_time+=MPI_Wtime();
3929  MPI_Barrier(T_plan->comm);
3930  if(VERBOSE>=1){
3931  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
3932  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
3933  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
3934  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
3935  }
3936  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
3937  timings[1]+=shuffle_time;
3938  timings[2]+=comm_time;
3939  timings[3]+=reshuffle_time;
3940  return;
3941 }
3942 
3943 
// transpose_cuda_v5_3: global transpose of a block-row-distributed 2D array
// of n_tuples-sized elements, for GPU-resident data.
//
// Strategy (v5_3): blocking pairwise exchange — for each peer, copy that
// peer's block D2H, do an MPI_Sendrecv, and immediately start the H2D
// upload of the received block with cudaMemcpyAsync so it overlaps with
// the next peer's exchange.
//
// NOTE(review): the cudaMemcpyAsync calls on T_plan->buffer_2 only overlap
// if that buffer is page-locked (cudaMallocHost) — TODO confirm against the
// Mem_Mgr_gpu allocation.
//
// Parameters:
//   T_plan  - transpose plan (communicator, counts/offsets, datatypes,
//             host buffers `buffer`/`buffer_2`, device buffer `buffer_d`).
//   data    - DEVICE pointer to input/output; overwritten with the transpose.
//   timings - accumulators: [0] total, [1] shuffle, [2] comm, [3] reshuffle.
//   flags   - bit 0: input already locally transposed; bit 1: leave output
//             in transposed layout.
//   howmany - number of independent arrays (batch), each idist/odist apart.
//   tag     - MPI tag used to match this transpose's messages.
void transpose_cuda_v5_3(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany , int tag){

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  double* data_cpu=T_plan->buffer;           // host staging buffer for sends
  double * send_recv_cpu = T_plan->buffer_2; // host staging buffer for receives
  double * send_recv = T_plan->buffer_d;     // device scratch buffer
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;
  int idist=N[1]*local_n0*n_tuples;  // stride between batched input slabs
  int odist=N[0]*local_n1*n_tuples;  // stride between batched output slabs

  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  // ====================== Local Transpose =======================
  shuffle_time-=MPI_Wtime();

  if(Flags[0]==0){
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
  }
  cudaDeviceSynchronize();

  shuffle_time+=MPI_Wtime();
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  // Single-process case: no communication needed.
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
  }
  if(nprocs==1){ // Transpose is done!
    timings[0]+=MPI_Wtime();
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    return;
  }


  // ====================== MPI all-to-all exchange =======================
  int* scount_proc= T_plan->scount_proc;
  int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;

  comm_time-=MPI_Wtime();

  int soffset=0,roffset=0;
  MPI_Status ierr;

  MPI_Datatype *stype=T_plan->stype;
  MPI_Datatype *rtype=T_plan->rtype;

  // Blocking pairwise exchange; both directions of each pair use the same
  // tag and are matched by the Sendrecv.
  //cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      soffset=soffset_proc[proc];
      roffset=roffset_proc[proc];
      for(int h=0;h<howmany;h++)
        cudaMemcpy(&data_cpu[h*idist+soffset], &data[h*idist+soffset],sizeof(double)*scount_proc[proc], cudaMemcpyDeviceToHost);
      MPI_Sendrecv(&data_cpu[soffset],1, stype[proc],
                   proc, tag,
                   &send_recv_cpu[roffset],1, rtype[proc],
                   proc, tag,
                   T_plan->comm,&ierr);
      // Upload the received block while the loop moves on to the next peer.
      for(int h=0;h<howmany;h++)
        cudaMemcpyAsync(&send_recv[h*odist+roffset_proc[proc]],&send_recv_cpu[h*odist+roffset_proc[proc]],sizeof(double)*rcount_proc[proc],cudaMemcpyHostToDevice);

    }
    else{ // copy your own part directly device-to-device
      for(int h=0;h<howmany;h++)
        cudaMemcpyAsync(&send_recv[h*odist+roffset_proc[procid]], &data[h*idist+soffset_proc[procid]],sizeof(double)*scount_proc[procid], cudaMemcpyDeviceToDevice);
    }
  }


  cudaDeviceSynchronize();
  //cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
  comm_time+=MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(send_recv_cpu, send_recv, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<send_recv_cpu[odist*h+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }



  // ====================== 2nd local transpose (reshuffle) ==================

  reshuffle_time-=MPI_Wtime();

  // The last process may own a smaller block; detect whether all received
  // blocks are equally sized to pick the cheaper transpose path.
  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
  if(local_n1!=0)
    last_ntuples=T_plan->last_recv_count/((int)local_n1);

  for(int h=0;h<howmany;h++){
    if(local_n1==1)
      cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice); // you are done, no additional transpose is needed.
    else if(last_ntuples!=first_ntuples){
      // Uneven distribution: transpose the equal-sized prefix, then merge in
      // the differently-sized last block.
      local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
      local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
    }
    else if(last_ntuples==first_ntuples){
      //local_transpose_cuda(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
      local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
    }
  }
  cudaDeviceSynchronize();


  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  if(Flags[1]==1){ // Transpose output
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
  }
  reshuffle_time+=MPI_Wtime();
  MPI_Barrier(T_plan->comm);
  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
4153 
4154 
4155 
// transpose_cuda_v6: global transpose of a block-row-distributed 2D array
// of n_tuples-sized elements, for GPU-resident data, using a single MPI
// collective instead of point-to-point messages.
//
// Strategy (v6): bulk D2H copy, then one of
//   - MPI_Alltoallw  (howmany > 1; per-peer derived datatypes/offsets),
//   - MPI_Alltoallv  (uneven distribution),
//   - MPI_Alltoall   (even distribution),
// followed by a bulk H2D copy and the device-side reshuffle transpose.
//
// Parameters:
//   T_plan  - transpose plan (communicator, counts/offsets, datatypes,
//             host buffers `buffer`/`buffer_2`, device buffer `buffer_d`).
//   data    - DEVICE pointer to input/output; overwritten with the transpose.
//   timings - accumulators: [0] total, [1] shuffle, [2] comm, [3] reshuffle.
//   flags   - bit 0: input already locally transposed; bit 1: leave output
//             in transposed layout.
//   howmany - number of independent arrays (batch), each idist/odist apart.
//   tag     - unused here (collectives need no tag); kept for a signature
//             uniform with the point-to-point variants.
void transpose_cuda_v6(T_Plan_gpu* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  double* data_cpu=T_plan->buffer;           // host staging buffer for sends
  double * send_recv_cpu = T_plan->buffer_2; // host staging buffer for receives
  double * send_recv = T_plan->buffer_d;     // device scratch buffer
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;
  int idist=N[1]*local_n0*n_tuples;  // stride between batched input slabs
  int odist=N[0]*local_n1*n_tuples;  // stride between batched output slabs

  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<1+0*nprocs;++id){ // debug: only rank 0's slab is printed
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  // ====================== Local Transpose =======================
  shuffle_time-=MPI_Wtime();

  if(Flags[0]==0){
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
  }
  cudaDeviceSynchronize();

  shuffle_time+=MPI_Wtime();
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  // Single-process case: no communication needed.
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
  }
  if(nprocs==1){ // Transpose is done!
    timings[0]+=MPI_Wtime();
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    return;
  }


  // ====================== MPI all-to-all exchange =======================
  int* scount_proc= T_plan->scount_proc;
  int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;

  MPI_Datatype *stype=T_plan->stype;
  MPI_Datatype *rtype=T_plan->rtype;
  MPI_Barrier(T_plan->comm);
  comm_time-=MPI_Wtime();
  // SEND: stage the whole device slab to the host, then run the collective.
  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
  if(howmany>1){
    // Batched case needs per-peer derived datatypes to pick the blocks of
    // every batch member out of the interleaved slab.
    MPI_Alltoallw(data_cpu,T_plan->scount_proc_w,
                  T_plan->soffset_proc_w, stype,
                  send_recv_cpu,T_plan->rcount_proc_w, T_plan->roffset_proc_w,
                  rtype, T_plan->comm);
  }
  else if(T_plan->is_evenly_distributed==0)
    MPI_Alltoallv(data_cpu,scount_proc,
                  soffset_proc, MPI_DOUBLE,send_recv_cpu,
                  rcount_proc,roffset_proc, MPI_DOUBLE,
                  T_plan->comm);
  else
    MPI_Alltoall(data_cpu, scount_proc[0], MPI_DOUBLE,
                 send_recv_cpu, rcount_proc[0], MPI_DOUBLE,
                 T_plan->comm);

  cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
  comm_time+=MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }




  // ====================== 2nd local transpose (reshuffle) ==================

  reshuffle_time-=MPI_Wtime();

  // The last process may own a smaller block; detect whether all received
  // blocks are equally sized to pick the cheaper transpose path.
  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
  if(local_n1!=0)
    last_ntuples=T_plan->last_recv_count/((int)local_n1);

  for(int h=0;h<howmany;h++){
    if(local_n1==1)
      cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice); // you are done, no additional transpose is needed.
    else if(last_ntuples!=first_ntuples){
      // Uneven distribution: transpose the equal-sized prefix, then merge in
      // the differently-sized last block.
      local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
      local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
    }
    else if(last_ntuples==first_ntuples){
      //local_transpose_cuda(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
      local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
    }
  }

  cudaDeviceSynchronize();


  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2){
    cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  }
  if(Flags[1]==1){ // Transpose output
    for(int h=0;h<howmany;h++)
      local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
  }
  reshuffle_time+=MPI_Wtime();

  MPI_Barrier(T_plan->comm);
  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
4356 void transpose_cuda_v7(T_Plan_gpu* T_plan, double * data, double *timings,int kway, unsigned flags, int howmany, int tag ){
4357 
4358  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
4359  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
4360  MPI_Barrier(T_plan->comm);
4361  return;
4362  }
4363  timings[0]-=MPI_Wtime();
4364  int nprocs, procid;
4365  int nprocs_0, nprocs_1;
4366  nprocs=T_plan->nprocs;
4367  procid=T_plan->procid;
4368  nprocs_0=T_plan->nprocs_0;
4369  nprocs_1=T_plan->nprocs_1;
4370  ptrdiff_t *N=T_plan->N;
4371  double* data_cpu=T_plan->buffer;
4372  double * send_recv_cpu = T_plan->buffer_2;
4373  double * send_recv = T_plan->buffer_d;
4374  ptrdiff_t local_n0=T_plan->local_n0;
4375  ptrdiff_t local_n1=T_plan->local_n1;
4376  ptrdiff_t n_tuples=T_plan->n_tuples;
4377  int idist=N[1]*local_n0*n_tuples;
4378  int odist=N[0]*local_n1*n_tuples;
4379 
4380  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
4381 
4382  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
4383  if(VERBOSE>=2){
4384  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4385  for(int h=0;h<howmany;h++)
4386  for(int id=0;id<nprocs;++id){
4387  if(procid==id)
4388  for(int i=0;i<local_n0;i++){
4389  std::cout<<std::endl;
4390  for(int j=0;j<N[1];j++){
4391  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
4392  }
4393  }
4394  std::cout<<'\n';
4395  MPI_Barrier(T_plan->comm);
4396  }
4397  }
4398  //PCOUT<<" ============================================= "<<std::endl;
4399  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
4400  //PCOUT<<" ============================================= "<<std::endl;
4401  shuffle_time-=MPI_Wtime();
4402 
4403  if(Flags[0]==0){
4404  for(int h=0;h<howmany;h++)
4405  local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
4406  }
4407  cudaDeviceSynchronize();
4408 
4409  shuffle_time+=MPI_Wtime();
4410  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
4411  if(VERBOSE>=2){
4412  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4413  for(int h=0;h<howmany;h++)
4414  for(int id=0;id<nprocs;++id){
4415  if(procid==id)
4416  for(int i=0;i<N[1];i++){
4417  std::cout<<std::endl;
4418  for(int j=0;j<local_n0;j++){
4419  std::cout<<'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
4420  }
4421  }
4422  std::cout<<'\n';
4423  MPI_Barrier(T_plan->comm);
4424  }
4425  }
4426  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
4427  for(int h=0;h<howmany;h++)
4428  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
4429  }
4430  if(nprocs==1){ // Transpose is done!
4431  timings[0]+=MPI_Wtime();
4432  timings[1]+=shuffle_time;
4433  timings[2]+=0;
4434  timings[3]+=0;
4435  return;
4436  }
4437 
4438 
4439  //PCOUT<<" ============================================= "<<std::endl;
4440  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
4441  //PCOUT<<" ============================================= "<<std::endl;
4442  int* scount_proc= T_plan->scount_proc;
4443  int* rcount_proc= T_plan->rcount_proc;
4444  int* soffset_proc= T_plan->soffset_proc;
4445  int* roffset_proc= T_plan->roffset_proc;
4446 
4447  comm_time-=MPI_Wtime();
4448  // SEND
4449  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4450  //if(T_plan->is_evenly_distributed==0)
4451  //MPI_Alltoallv(data_cpu,scount_proc,
4452  // soffset_proc, MPI_DOUBLE,send_recv_cpu,
4453  // rcount_proc,roffset_proc, MPI_DOUBLE,
4454  // T_plan->comm);
4455  //else
4456  //MPI_Alltoall(data_cpu, scount_proc[0], MPI_DOUBLE,
4457  // send_recv_cpu, rcount_proc[0], MPI_DOUBLE,
4458  // T_plan->comm);
4459  if(T_plan->kway_async)
4460  par::Mpi_Alltoallv_dense<double,true>(data_cpu , scount_proc, soffset_proc,
4461  send_recv_cpu, rcount_proc, roffset_proc, T_plan->comm,kway);
4462  else
4463  par::Mpi_Alltoallv_dense<double,false>(data_cpu , scount_proc, soffset_proc,
4464  send_recv_cpu, rcount_proc, roffset_proc, T_plan->comm,kway);
4465 
4466 
4467  cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
4468  comm_time+=MPI_Wtime();
4469 
4470  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
4471  if(VERBOSE>=2)
4472  for(int h=0;h<howmany;h++)
4473  for(int id=0;id<nprocs;++id){
4474  if(procid==id)
4475  for(int i=0;i<local_n1;i++){
4476  std::cout<<std::endl;
4477  for(int j=0;j<N[0];j++){
4478  std::cout<<'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
4479  }
4480  }
4481  std::cout<<'\n';
4482  MPI_Barrier(T_plan->comm);
4483  }
4484 
4485 
4486 
4487 
4488  //PCOUT<<" ============================================= "<<std::endl;
4489  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
4490  //PCOUT<<" ============================================= "<<std::endl;
4491 
4492  reshuffle_time-=MPI_Wtime();
4493 
4494  // int first_local_n0, last_local_n0;
4495  // first_local_n0=local_n0_proc[0]; last_local_n0=local_n0_proc[nprocs_1-1];
4496 
4497  //local_transpose(nprocs_1,local_n1,n_tuples*local_n0,send_recv,data );
4498 
4499  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
4500  if(local_n1!=0)
4501  last_ntuples=T_plan->last_recv_count/((int)local_n1);
4502 
4503  for(int h=0;h<howmany;h++){
4504  if(local_n1==1)
4505  cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice); // you are done, no additional transpose is needed.
4506  else if(last_ntuples!=first_ntuples){
4507  local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
4508  local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
4509  }
4510  else if(last_ntuples==first_ntuples){
4511  //local_transpose_cuda(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
4512  local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
4513  }
4514  }
4515  cudaDeviceSynchronize();
4516 
4517  if(Flags[1]==1){ // Transpose output
4518  for(int h=0;h<howmany;h++)
4519  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
4520  }
4521  reshuffle_time+=MPI_Wtime();
4522 
4523  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
4524  if(VERBOSE>=2){
4525  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4526  for(int h=0;h<howmany;h++)
4527  for(int id=0;id<nprocs_1;++id){
4528  if(procid==id)
4529  for(int i=0;i<local_n1;i++){
4530  std::cout<<std::endl;
4531  for(int j=0;j<N[0];j++){
4532  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
4533  }
4534  }
4535  std::cout<<'\n';
4536  MPI_Barrier(T_plan->comm);
4537  }
4538  }
4539  MPI_Barrier(T_plan->comm);
4540  if(VERBOSE>=1){
4541  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
4542  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
4543  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
4544  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
4545  }
4546  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
4547  timings[1]+=shuffle_time;
4548  timings[2]+=comm_time;
4549  timings[3]+=reshuffle_time;
4550  return;
4551 }
4552 void transpose_cuda_v7_2(T_Plan_gpu* T_plan, double * data, double *timings,int kway, unsigned flags, int howmany, int tag ){
4553 
4554  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
4555  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
4556  MPI_Barrier(T_plan->comm);
4557  return;
4558  }
4559  timings[0]-=MPI_Wtime();
4560  int nprocs, procid;
4561  int nprocs_0, nprocs_1;
4562  nprocs=T_plan->nprocs;
4563  procid=T_plan->procid;
4564  nprocs_0=T_plan->nprocs_0;
4565  nprocs_1=T_plan->nprocs_1;
4566  ptrdiff_t *N=T_plan->N;
4567  double* data_cpu=T_plan->buffer;
4568  double * send_recv_cpu = T_plan->buffer_2;
4569  double * send_recv = T_plan->buffer_d;
4570  ptrdiff_t local_n0=T_plan->local_n0;
4571  ptrdiff_t local_n1=T_plan->local_n1;
4572  ptrdiff_t n_tuples=T_plan->n_tuples;
4573  int idist=N[1]*local_n0*n_tuples;
4574  int odist=N[0]*local_n1*n_tuples;
4575 
4576  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
4577 
4578  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
4579  if(VERBOSE>=2){
4580  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4581  for(int h=0;h<howmany;h++)
4582  for(int id=0;id<nprocs;++id){
4583  if(procid==id)
4584  for(int i=0;i<local_n0;i++){
4585  std::cout<<std::endl;
4586  for(int j=0;j<N[1];j++){
4587  std::cout<<'\t'<<data_cpu[h*idist+(i*N[1]+j)*n_tuples];
4588  }
4589  }
4590  std::cout<<'\n';
4591  MPI_Barrier(T_plan->comm);
4592  }
4593  }
4594  //PCOUT<<" ============================================= "<<std::endl;
4595  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
4596  //PCOUT<<" ============================================= "<<std::endl;
4597  shuffle_time-=MPI_Wtime();
4598 
4599  if(Flags[0]==0){
4600  for(int h=0;h<howmany;h++)
4601  local_transpose_cuda(local_n0,N[1],n_tuples,&data[h*idist] );
4602  }
4603  cudaDeviceSynchronize();
4604 
4605  shuffle_time+=MPI_Wtime();
4606  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
4607  if(VERBOSE>=2){
4608  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4609  for(int h=0;h<howmany;h++)
4610  for(int id=0;id<nprocs;++id){
4611  if(procid==id)
4612  for(int i=0;i<N[1];i++){
4613  std::cout<<std::endl;
4614  for(int j=0;j<local_n0;j++){
4615  std::cout<<'\t'<<data_cpu[h*idist+(i*local_n0+j)*n_tuples];
4616  }
4617  }
4618  std::cout<<'\n';
4619  MPI_Barrier(T_plan->comm);
4620  }
4621  }
4622  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
4623  for(int h=0;h<howmany;h++)
4624  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*idist] );
4625  }
4626  if(nprocs==1){ // Transpose is done!
4627  timings[0]+=MPI_Wtime();
4628  timings[1]+=shuffle_time;
4629  timings[2]+=0;
4630  timings[3]+=0;
4631  return;
4632  }
4633 
4634 
4635  //PCOUT<<" ============================================= "<<std::endl;
4636  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
4637  //PCOUT<<" ============================================= "<<std::endl;
4638  int* scount_proc= T_plan->scount_proc;
4639  int* rcount_proc= T_plan->rcount_proc;
4640  int* soffset_proc= T_plan->soffset_proc;
4641  int* roffset_proc= T_plan->roffset_proc;
4642 
4643  comm_time-=MPI_Wtime();
4644  // SEND
4645  // cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4646  //if(T_plan->is_evenly_distributed==0)
4647  //MPI_Alltoallv(data_cpu,scount_proc,
4648  // soffset_proc, MPI_DOUBLE,send_recv_cpu,
4649  // rcount_proc,roffset_proc, MPI_DOUBLE,
4650  // T_plan->comm);
4651  //else
4652  //MPI_Alltoall(data_cpu, scount_proc[0], MPI_DOUBLE,
4653  // send_recv_cpu, rcount_proc[0], MPI_DOUBLE,
4654  // T_plan->comm);
4655  if(T_plan->kway_async)
4656  par::Mpi_Alltoallv_dense_gpu<double,true>(data , scount_proc, soffset_proc,
4657  send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
4658  else
4659  par::Mpi_Alltoallv_dense_gpu<double,false>(data , scount_proc, soffset_proc,
4660  send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
4661 
4662 
4663  // cudaMemcpy(send_recv, send_recv_cpu, T_plan->alloc_local, cudaMemcpyHostToDevice);
4664  comm_time+=MPI_Wtime();
4665 
4666  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
4667  if(VERBOSE>=2)
4668  for(int h=0;h<howmany;h++)
4669  for(int id=0;id<nprocs;++id){
4670  if(procid==id)
4671  for(int i=0;i<local_n1;i++){
4672  std::cout<<std::endl;
4673  for(int j=0;j<N[0];j++){
4674  std::cout<<'\t'<<send_recv_cpu[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
4675  }
4676  }
4677  std::cout<<'\n';
4678  MPI_Barrier(T_plan->comm);
4679  }
4680 
4681 
4682 
4683 
4684  //PCOUT<<" ============================================= "<<std::endl;
4685  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
4686  //PCOUT<<" ============================================= "<<std::endl;
4687 
4688  reshuffle_time-=MPI_Wtime();
4689 
4690  // int first_local_n0, last_local_n0;
4691  // first_local_n0=local_n0_proc[0]; last_local_n0=local_n0_proc[nprocs_1-1];
4692 
4693  //local_transpose(nprocs_1,local_n1,n_tuples*local_n0,send_recv,data );
4694 
4695  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
4696  if(local_n1!=0)
4697  last_ntuples=T_plan->last_recv_count/((int)local_n1);
4698 
4699  for(int h=0;h<howmany;h++){
4700  if(local_n1==1)
4701  cudaMemcpy(&data[h*odist],&send_recv[h*odist],T_plan->alloc_local/howmany ,cudaMemcpyDeviceToDevice); // you are done, no additional transpose is needed.
4702  else if(last_ntuples!=first_ntuples){
4703  local_transpose_cuda((nprocs_0-1),local_n1,first_ntuples,&send_recv[h*odist] );
4704  local_transpose_cuda(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[h*odist],&data[h*odist] );
4705  }
4706  else if(last_ntuples==first_ntuples){
4707  //local_transpose_cuda(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
4708  local_transpose_cuda(nprocs_0,local_n1,first_ntuples,&send_recv[h*odist],&data[h*odist] );
4709  }
4710  }
4711  cudaDeviceSynchronize();
4712 
4713  if(Flags[1]==1){ // Transpose output
4714  for(int h=0;h<howmany;h++)
4715  local_transpose_cuda(local_n1,N[0],n_tuples,&data[h*odist] );
4716  }
4717  reshuffle_time+=MPI_Wtime();
4718 
4719  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
4720  if(VERBOSE>=2){
4721  cudaMemcpy(data_cpu, data, T_plan->alloc_local, cudaMemcpyDeviceToHost);
4722  for(int h=0;h<howmany;h++)
4723  for(int id=0;id<nprocs_1;++id){
4724  if(procid==id)
4725  for(int i=0;i<local_n1;i++){
4726  std::cout<<std::endl;
4727  for(int j=0;j<N[0];j++){
4728  std::cout<<'\t'<<data_cpu[h*odist+(i*N[0]+j)*n_tuples];
4729  }
4730  }
4731  std::cout<<'\n';
4732  MPI_Barrier(T_plan->comm);
4733  }
4734  }
4735  MPI_Barrier(T_plan->comm);
4736  if(VERBOSE>=1){
4737  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
4738  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
4739  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
4740  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
4741  }
4742  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
4743  timings[1]+=shuffle_time;
4744  timings[2]+=comm_time;
4745  timings[3]+=reshuffle_time;
4746  return;
4747 }