// AccFFT — transpose.cpp
1 /*
2  * Copyright (c) 2014-2015, Amir Gholami, George Biros
3  * All rights reserved.
4  * This file is part of AccFFT library.
5  *
6  * AccFFT is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * AccFFT is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with AccFFT. If not, see <http://www.gnu.org/licenses/>.
18  *
19 */
20 
21 
22 #include <mpi.h>
23 #include <iostream>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <math.h>
27 #include <cmath>
28 #include <string.h>
29 #include <fftw3.h>
30 #include <vector>
31 #include <bitset>
32 #include "transpose.h"
33 #include "parUtils.h"
34 
35 #define PCOUT if(procid==0) std::cout
36 #define VERBOSE 0
37 
// Returns true iff x is a power of two (1, 2, 4, ...).
// A power of two has exactly one bit set, so clearing the lowest set bit
// (x & (x-1)) yields zero. The "x != 0" guard fixes the original edge case
// where 0 was wrongly reported as a power of two.
static bool IsPowerOfTwo(ulong x)
{
  return x != 0 && (x & (x - 1)) == 0;
}
42 Mem_Mgr::Mem_Mgr(int N0, int N1,int tuples, MPI_Comm Comm, int howmany, int specified_alloc_local){
43 
44  N[0]=N0;
45  N[1]=N1;
46  n_tuples=tuples;
47  int procid, nprocs;
48  MPI_Comm_rank(Comm, &procid);
49  MPI_Comm_size(Comm,&nprocs);
50 
51 
52  // Determine local_n0/n1 of each processor
53  if(specified_alloc_local==0){
54  {
55  ptrdiff_t * local_n0_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
56  ptrdiff_t * local_n1_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
57 #pragma omp parallel for
58  for (int proc=0;proc<nprocs;++proc){
59  local_n0_proc[proc]=ceil(N[0]/(double)nprocs);
60  local_n1_proc[proc]=ceil(N[1]/(double)nprocs);
61 
62 
63  if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
64  if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}
65 
66 
67  }
68 
69  local_n0=local_n0_proc[procid];
70  local_n1=local_n1_proc[procid];
71  free(local_n0_proc);
72  free(local_n1_proc);
73  }
74 
75 
76  // Determine alloc local based on maximum size of input and output distribution
77  alloc_local=local_n0*N[1]*n_tuples*sizeof(double);
78  if(alloc_local<local_n1*N[0]*n_tuples*sizeof(double))
79  alloc_local=local_n1*N[0]*n_tuples*sizeof(double);
80 
81  alloc_local*=howmany;
82  }
83  else{
84  alloc_local=specified_alloc_local;
85  }
86  if( alloc_local<=1.05*std::pow(2,30) )
87  PINNED=1;
88  else
89  PINNED=0;
90 
91  posix_memalign((void **)&buffer,64, alloc_local);
92  posix_memalign((void **)&buffer_2,64, alloc_local);
93  memset( buffer,0, alloc_local );
94  memset( buffer_2,0, alloc_local );
95 
96 }
97 Mem_Mgr::~Mem_Mgr(){
98 
99  free(buffer);
100  free(buffer_2);
101 }
102 
// Builds a transpose plan for an N0 x N1 matrix of n_tuples-sized double
// cells distributed block-row-wise over the ranks of Comm. Computes every
// rank's local slab sizes and start offsets, the send/recv counts and
// offsets for each communication variant (plain, _w for byte-offset
// Alltoallw, _f for flattened howmany fields), and the derived MPI
// datatypes used by the strided (howmany>1) exchanges. The communication
// buffers are borrowed from Mem_mgr (not owned).
T_Plan::T_Plan(int N0, int N1,int tuples, Mem_Mgr * Mem_mgr, MPI_Comm Comm, int howmany){

  N[0]=N0;
  N[1]=N1;
  n_tuples=tuples;
  MPI_Comm_rank(Comm, &procid);
  MPI_Comm_size(Comm,&nprocs);

  local_n0_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
  local_n1_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
  local_0_start_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);
  local_1_start_proc=(ptrdiff_t*) malloc(sizeof(ptrdiff_t)*nprocs);

  // Determine local_n0/n1 of each processor: every rank nominally gets
  // ceil(N/nprocs) rows; trailing ranks get the remainder (possibly zero).
  local_0_start_proc[0]=0;local_1_start_proc[0]=0;
  for (int proc=0;proc<nprocs;++proc){
    local_n0_proc[proc]=ceil(N[0]/(double)nprocs);
    local_n1_proc[proc]=ceil(N[1]/(double)nprocs);

    // Trim the last non-empty rank; the "*= (count>0)" clamps negative
    // leftovers to zero for ranks past the end of the matrix.
    if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
    if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}

    // Prefix sums give each rank's starting row index.
    if(proc!=0){
      local_0_start_proc[proc]=local_0_start_proc[proc-1]+local_n0_proc[proc-1];
      local_1_start_proc[proc]=local_1_start_proc[proc-1]+local_n1_proc[proc-1];
    }

  }

  local_n0=local_n0_proc[procid];
  local_n1=local_n1_proc[procid];
  local_0_start=local_0_start_proc[procid];
  local_1_start=local_1_start_proc[procid];


  alloc_local=Mem_mgr->alloc_local;
  // Determine effective processors taking part in each transpose phase
  // (ranks whose slab is empty do not send/receive on that side).
  nprocs_0=0; nprocs_1=0;
  for (int proc=0;proc<nprocs;++proc){
    if(local_n0_proc[proc]!=0)
      nprocs_0+=1;
    if(local_n1_proc[proc]!=0)
      nprocs_1+=1;
  }



  // Set send recv counts for communication part
  scount_proc=(int*) malloc(sizeof(int)*nprocs);
  rcount_proc=(int*) malloc(sizeof(int)*nprocs);
  soffset_proc=(int*) malloc(sizeof(int)*nprocs);
  roffset_proc=(int*) malloc(sizeof(int)*nprocs);

  scount_proc_f=(int*) malloc(sizeof(int)*nprocs);
  rcount_proc_f=(int*) malloc(sizeof(int)*nprocs);
  soffset_proc_f=(int*) malloc(sizeof(int)*nprocs);
  roffset_proc_f=(int*) malloc(sizeof(int)*nprocs);

  scount_proc_w=(int*) malloc(sizeof(int)*nprocs);
  rcount_proc_w=(int*) malloc(sizeof(int)*nprocs);
  soffset_proc_w=(int*) malloc(sizeof(int)*nprocs);
  roffset_proc_w=(int*) malloc(sizeof(int)*nprocs);

  //scount_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //rcount_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //soffset_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //roffset_proc_v8=(int*) malloc(sizeof(int)*nprocs);
  //memset(scount_proc_v8,0,sizeof(int)*nprocs);
  //memset(rcount_proc_v8,0,sizeof(int)*nprocs);
  //memset(soffset_proc_v8,0,sizeof(int)*nprocs);
  //memset(roffset_proc_v8,0,sizeof(int)*nprocs);

  last_recv_count=0; // Will store the n_tuples of the last received data. In general ~=n_tuples
  // Two cases depending on which phase has more active ranks; counts are in
  // doubles. scount[p] = what we send to p, rcount[p] = what p sends us.
  if(nprocs_1>nprocs_0)
    for (int proc=0;proc<nprocs;++proc){

      scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;

      if(scount_proc[proc]!=0)
        rcount_proc[proc]=local_n1_proc[proc]*local_n0_proc[proc]*n_tuples;//scount_proc[proc];
      else
        rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;//local_n0_proc[proc]*n_tuples; //

      // Offsets are the running prefix sums of the counts.
      soffset_proc[proc]=0;
      roffset_proc[proc]=0;
      if(proc==0){
        soffset_proc[proc]=0;
        roffset_proc[proc]=0;
      }
      else{
        soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
        roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
      }

      if(proc>=nprocs_1){ // in case you have requested too many processes
        rcount_proc[proc]=0;
        scount_proc[proc]=0;
      }
      if(scount_proc[proc]==0) soffset_proc[proc]=0;//local_n0*N[1]*n_tuples-1;

      if(proc>=nprocs_0){
        rcount_proc[proc]=0;
        roffset_proc[proc]=0;
      }
      if(rcount_proc[proc]!=0)
        last_recv_count=rcount_proc[proc];
      if(local_n1_proc[proc]!=0)
        last_local_n1=local_n1_proc[proc];
    }
  else if(nprocs_1<=nprocs_0)
    for (int proc=0;proc<nprocs;++proc){

      scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;
      rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;//scount_proc[proc];



      soffset_proc[proc]=0;
      roffset_proc[proc]=0;
      if(proc==0){
        soffset_proc[proc]=0;
        roffset_proc[proc]=0;
      }
      else{
        soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
        roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
      }

      if(proc>=nprocs_0){ // in case you have requested too many processes
        rcount_proc[proc]=0;
        scount_proc[proc]=0;
        roffset_proc[proc]=0;
        soffset_proc[proc]=0;
      }

      if(scount_proc[proc]==0) soffset_proc[proc]=0;//local_n0*N[1]*n_tuples-1;

      if(proc>=nprocs_1){
        scount_proc[proc]=0;
        soffset_proc[proc]=0;
      }
      if(rcount_proc[proc]!=0)
        last_recv_count=rcount_proc[proc];

      if(local_n1_proc[proc]!=0)
        last_local_n1=local_n1_proc[proc];
    }


  // An evenly distributed matrix (all ranks active, no remainders) can use
  // the cheaper MPI_Alltoall instead of MPI_Alltoallv.
  is_evenly_distributed=0; // use alltoallv
  if((local_n0*nprocs_0-N[0])==0 && (local_n1*nprocs_1-N[1])==0 && nprocs_0==nprocs_1 && nprocs_0==nprocs){
    is_evenly_distributed=1; // use alltoall
  }

  method=5; //Default Transpose method
  kway=8; // for transpose_v7
  kway_async=true;
  stype=new MPI_Datatype[nprocs];
  rtype=new MPI_Datatype[nprocs];
  //stype_v8=new MPI_Datatype[nprocs];
  //rtype_v8_=new MPI_Datatype[nprocs];
  //rtype_v8=new MPI_Datatype[nprocs];

  for (int i=0;i<nprocs;i++){
    // Strided datatypes for howmany interleaved fields: one block of
    // scount/rcount doubles per field, separated by the full local slab.
    MPI_Type_vector(howmany,scount_proc[i],local_n0*N[1]*n_tuples, MPI_DOUBLE, &stype[i]);
    MPI_Type_vector(howmany,rcount_proc[i],local_n1*N[0]*n_tuples, MPI_DOUBLE, &rtype[i]);

    MPI_Type_commit(&stype[i]);
    MPI_Type_commit(&rtype[i]);

    soffset_proc_w[i]=soffset_proc[i]*8; // _w offsets are in bytes (8 = sizeof(double)); counts are 1 derived datatype each
    roffset_proc_w[i]=roffset_proc[i]*8;
    scount_proc_w[i]=1;
    rcount_proc_w[i]=1;

    soffset_proc_f[i]=soffset_proc[i]*howmany; // _f counts/offsets are in doubles, scaled for howmany flattened fields
    roffset_proc_f[i]=roffset_proc[i]*howmany;
    scount_proc_f[i]= scount_proc[i]*howmany;
    rcount_proc_f[i]= rcount_proc[i]*howmany;


    /*
    if(local_n0!=0){
      soffset_proc_v8[i]=soffset_proc[i]/local_n0;
      scount_proc_v8[i]=scount_proc[i]/local_n0;
    }
    if(local_n1!=0){
      roffset_proc_v8[i]=roffset_proc[i]/local_n1;
      rcount_proc_v8[i]=rcount_proc[i]/local_n1;
    }

    MPI_Type_vector(local_n0,scount_proc_v8[i],N[1]*n_tuples, MPI_DOUBLE, &stype_v8[i]);
    MPI_Type_vector(local_n1,1*n_tuples,N[0]*n_tuples, MPI_DOUBLE, &rtype_v8_[i]);
    MPI_Type_hvector(rcount_proc_v8[i],1,n_tuples*sizeof(double),rtype_v8_[i], &rtype_v8[i]);

    MPI_Type_commit(&stype_v8[i]);
    MPI_Type_commit(&rtype_v8[i]);
    */

  }

  comm=Comm; // MPI Communicator
  // Borrowed (not owned) buffers from the memory manager.
  buffer=Mem_mgr->buffer;
  buffer_2=Mem_mgr->buffer_2;
  buffer_d=Mem_mgr->buffer_d;
  //data_cpu=Mem_mgr->data_cpu;

}
// Auto-tuner: times transpose_v5, transpose_v6 and (for power-of-two
// process counts > 511) transpose_v7 with several kway factors, reduces
// the timings with MPI_MAX across ranks, and records the fastest variant
// in T_plan->method / kway / kway_async. `data` is used as scratch input
// and is clobbered. Timing-slot layout (unused slots stay at 1000 s):
//   [0] v5, [1] v6, [2..2+5] v7 async kways, [2+log2(p)..] v7 sync kways,
//   [2*log2(p)+2] v8 (currently disabled).
void T_Plan::which_method(T_Plan* T_plan,double* data){

  double dummy[4]={0};
  double * time= (double*) malloc(sizeof(double)*(2*(int)log2(nprocs)+3));
  double * g_time= (double*) malloc(sizeof(double)*(2*(int)log2(nprocs)+3));
  for (int i=0;i<2*(int)log2(nprocs)+3;i++)
    time[i]=1000;

  // Each candidate is run once as warmup (to exclude first-touch/setup
  // costs) and once timed.
  transpose_v5(T_plan,(double*)data,dummy); // Warmup
  time[0]=-MPI_Wtime();
  transpose_v5(T_plan,(double*)data,dummy);
  time[0]+=MPI_Wtime();

  transpose_v6(T_plan,(double*)data,dummy); // Warmup
  time[1]=-MPI_Wtime();
  transpose_v6(T_plan,(double*)data,dummy);
  time[1]+=MPI_Wtime();

  if(IsPowerOfTwo(nprocs) && nprocs>511){
    kway_async=true;
#ifndef TORUS_TOPOL
    // Try kway = nprocs, nprocs/2, ..., nprocs/32 with async exchanges.
    for (int i=0;i<6;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      transpose_v7(T_plan,(double*)data,dummy,kway); // Warmup
      time[2+i]=-MPI_Wtime();
      transpose_v7(T_plan,(double*)data,dummy,kway);
      time[2+i]+=MPI_Wtime();
    }
#endif

#ifdef TORUS_TOPOL
    // On torus networks only the synchronous variant is timed.
    kway_async=false;
    for (int i=0;i<6;i++){
      kway=nprocs/std::pow(2,i);
      MPI_Barrier(T_plan->comm);
      transpose_v7(T_plan,(double*)data,dummy,kway); // Warmup
      time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
      transpose_v7(T_plan,(double*)data,dummy,kway);
      time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
    }
#endif
  }

  //transpose_v8(T_plan,(double*)data,dummy); // Warmup
  //time[2*(int)log2(nprocs)+2]=-MPI_Wtime();
  //transpose_v8(T_plan,(double*)data,dummy);
  //time[2*(int)log2(nprocs)+2]+=MPI_Wtime();

  // All ranks must agree on the winner, so reduce with MPI_MAX (worst rank
  // dominates) before comparing.
  MPI_Allreduce(time,g_time,(2*(int)log2(nprocs)+3),MPI_DOUBLE,MPI_MAX, T_plan->comm);
  if(VERBOSE>=1)
    if(T_plan->procid==0){
      for(int i=0;i<2*(int)log2(nprocs)+3;++i)
        std::cout<<" time["<<i<<"]= "<<g_time[i]<<" , ";
      std::cout<<'\n';
    }

  double smallest=1000;
  for (int i=0;i<2*(int)log2(nprocs)+3;i++)
    smallest=std::min(smallest,g_time[i]);

  // Map the winning slot back to a method (ties resolve in v5,v6,v8,v7 order).
  if(g_time[0]==smallest){
    T_plan->method=5;
  }
  else if(g_time[1]==smallest){
    T_plan->method=6;
  }
  else if(g_time[2*(int)log2(nprocs)+2]==smallest){
    T_plan->method=8;
  }
  else{
    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+i]==smallest){
        T_plan->method=7;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=true;
        break;
      }
    for (int i=0;i<(int)log2(nprocs);i++)
      if(g_time[2+(int)log2(nprocs)+i]==smallest){
        T_plan->method=7;
        T_plan->kway=nprocs/std::pow(2,i);
        T_plan->kway_async=false;
        break;
      }
  }

  if(VERBOSE>=1){
    PCOUT<<"smallest= "<<smallest<<std::endl;
    PCOUT<<"Using transpose v"<<method<<" kway= "<<T_plan->kway<<" kway_async="<<T_plan->kway_async<<std::endl;
  }
  free(time);
  free(g_time);
  MPI_Barrier(T_plan->comm);

  return;
}
410 void T_Plan::which_fast_method(T_Plan* T_plan,double* data){
411 
412  double dummy[4]={0};
413  double * time= (double*) malloc(sizeof(double)*(2*(int)log2(nprocs)+3));
414  double * g_time= (double*) malloc(sizeof(double)*(2*(int)log2(nprocs)+3));
415  for (int i=0;i<2*(int)log2(nprocs)+3;i++)
416  time[i]=1000;
417 
418  fast_transpose_v1(T_plan,(double*)data,dummy,2); // Warmup
419  time[0]=-MPI_Wtime();
420  fast_transpose_v1(T_plan,(double*)data,dummy,2);
421  time[0]+=MPI_Wtime();
422 
423  fast_transpose_v2(T_plan,(double*)data,dummy,2); // Warmup
424  time[1]=-MPI_Wtime();
425  fast_transpose_v2(T_plan,(double*)data,dummy,2);
426  time[1]+=MPI_Wtime();
427 
428  if(IsPowerOfTwo(nprocs) && nprocs>511){
429  kway_async=true;
430  for (int i=0;i<6;i++){
431  kway=nprocs/std::pow(2,i);
432  MPI_Barrier(T_plan->comm);
433  fast_transpose_v3(T_plan,(double*)data,dummy,kway,2); // Warmup
434  time[2+i]=-MPI_Wtime();
435  fast_transpose_v3(T_plan,(double*)data,dummy,kway,2);
436  time[2+i]+=MPI_Wtime();
437  }
438 
439  kway_async=false;
440  for (int i=0;i<6;i++){
441  kway=nprocs/std::pow(2,i);
442  MPI_Barrier(T_plan->comm);
443  fast_transpose_v3(T_plan,(double*)data,dummy,kway); // Warmup
444  time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
445  fast_transpose_v3(T_plan,(double*)data,dummy,kway);
446  time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
447  }
448  }
449 
450  //transpose_v8(T_plan,(double*)data,dummy); // Warmup
451  //time[2*(int)log2(nprocs)+2]=-MPI_Wtime();
452  //transpose_v8(T_plan,(double*)data,dummy);
453  //time[2*(int)log2(nprocs)+2]+=MPI_Wtime();
454 
455  MPI_Allreduce(time,g_time,(2*(int)log2(nprocs)+3),MPI_DOUBLE,MPI_MAX, T_plan->comm);
456  if(VERBOSE>=1){
457  if(T_plan->procid==0){
458  for(int i=0;i<2*(int)log2(nprocs)+3;++i)
459  std::cout<<" time["<<i<<"]= "<<g_time[i]<<" , ";
460  std::cout<<'\n';
461  }
462  }
463 
464  double smallest=1000;
465  for (int i=0;i<2*(int)log2(nprocs)+3;i++)
466  smallest=std::min(smallest,g_time[i]);
467 
468  if(g_time[0]==smallest){
469  T_plan->method=1;
470  }
471  else if(g_time[1]==smallest){
472  T_plan->method=2;
473  }
474  else if(g_time[2*(int)log2(nprocs)+2]==smallest){
475  T_plan->method=8;
476  }
477  else{
478  for (int i=0;i<(int)log2(nprocs);i++)
479  if(g_time[2+i]==smallest){
480  T_plan->method=3;
481  T_plan->kway=nprocs/std::pow(2,i);
482  T_plan->kway_async=true;
483  break;
484  }
485  for (int i=0;i<(int)log2(nprocs);i++)
486  if(g_time[2+(int)log2(nprocs)+i]==smallest){
487  T_plan->method=3;
488  T_plan->kway=nprocs/std::pow(2,i);
489  T_plan->kway_async=false;
490  break;
491  }
492  }
493 
494  if(VERBOSE>=1){
495  PCOUT<<"smallest= "<<smallest<<std::endl;
496  PCOUT<<"Using transpose v"<<method<<" kway= "<<T_plan->kway<<" kway_async="<<T_plan->kway_async<<std::endl;
497  }
498  free(time);
499  free(g_time);
500  MPI_Barrier(T_plan->comm);
501 
502  return;
503 }
504 void T_Plan::execute(T_Plan* T_plan,double* data,double *timings, unsigned flags, int howmany, int tag){
505 
506 
507  if(howmany==1){
508  if(method==1)
509  fast_transpose_v1(T_plan,(double*)data,timings,flags,howmany,tag);
510  if(method==2)
511  fast_transpose_v2(T_plan,(double*)data,timings,flags,howmany,tag);
512  if(method==3)
513  fast_transpose_v3(T_plan,(double*)data,timings,kway,flags,howmany,tag);
514  }
515  else
516  {
517  if(method==1)
518  fast_transpose_v1_h(T_plan,(double*)data,timings,flags,howmany,tag);
519  if(method==2)
520  fast_transpose_v2_h(T_plan,(double*)data,timings,flags,howmany,tag);
521  if(method==3)
522  fast_transpose_v3_h(T_plan,(double*)data,timings,kway,flags,howmany,tag);
523  }
524  if(method==5)
525  transpose_v5(T_plan,(double*)data,timings,flags,howmany,tag);
526  if(method==6)
527  transpose_v6(T_plan,(double*)data,timings,flags,howmany);
528  if(method==7)
529  transpose_v7(T_plan,(double*)data,timings,kway,flags,howmany);
530  if(method==8)
531  transpose_v8(T_plan,(double*)data,timings,flags,howmany,tag);
532 
533  return;
534 }
535 
536 
537 T_Plan::~T_Plan(){
538  free(local_n0_proc);
539  free(local_n1_proc);
540  free(local_0_start_proc);
541  free(local_1_start_proc);
542  free(scount_proc);
543  free(rcount_proc);
544  free(soffset_proc);
545  free(roffset_proc);
546 
547  free(scount_proc_w);
548  free(rcount_proc_w);
549  free(soffset_proc_w);
550  free(roffset_proc_w);
551 
552  free(scount_proc_f);
553  free(rcount_proc_f);
554  free(soffset_proc_f);
555  free(roffset_proc_f);
556 
557  //free(scount_proc_v8);
558  //free(rcount_proc_v8);
559  //free(soffset_proc_v8);
560  //free(roffset_proc_v8);
561  delete [] stype;
562  delete [] rtype;
563  //delete [] stype_v8;
564  //delete [] rtype_v8;
565  //delete [] rtype_v8_;
566 }
567 
568 // outplace local transpose
// Out-of-place local transpose: treats `in` as an r x c matrix whose cells
// are n_tuples contiguous doubles, and writes its transpose (c x r, same
// cell size) into `out`. `in` and `out` must not overlap. No-op when either
// dimension is zero.
// Fix vs. original: offsets are computed in size_t, so slabs larger than
// INT_MAX doubles (easy to hit for big N * n_tuples) no longer overflow
// the int index arithmetic.
void local_transpose(int r, int c, int n_tuples, double * in, double *out ){

  if(r==0 || c==0) return;
  const size_t cell = (size_t)n_tuples;           // doubles per cell
  const size_t cell_bytes = cell*sizeof(double);
  size_t ptr=0;
  for(int j=0;j<c;j++){
    for(int i=0;i<r;i++){
      // input cell (i,j) becomes the next contiguous output cell
      memcpy (&out[ptr],&in[((size_t)i*c+j)*cell],cell_bytes);
      ptr+=cell;
    }
  }
  return;

}
582 // outplace local transpose multiple n_tuples for the last row
// Out-of-place local transpose of an r x c cell matrix where the LAST ROW
// carries n_tuples2 doubles per cell instead of n_tuples (ragged trailing
// row). Cells are emitted column-by-column into `out`, which must not
// overlap `in`. No-op when either dimension is zero.
void local_transpose(int r, int c, int n_tuples, int n_tuples2,double * in, double *out ){

  if(r==0 || c==0) return;
  const int last_row=r-1;
  int ptr=0;
  for(int j=0;j<c;j++){
    for(int i=0;i<r;i++){
      // The last row is packed with the smaller stride n_tuples2.
      const bool ragged=(i==last_row);
      const double* src = ragged ? &in[i*c*n_tuples+j*n_tuples2]
                                 : &in[(i*c+j)*n_tuples];
      const int len = ragged ? n_tuples2 : n_tuples;
      memcpy (&out[ptr],src,sizeof(double)*len);
      ptr+=len;
    }
  }
  return;

}
603 // outplace local transpose multiple n_tuples for the last col
// Out-of-place local transpose of an r x c cell matrix where the LAST
// COLUMN carries n_tuples2 doubles per cell instead of n_tuples (ragged
// trailing column). Each input row therefore occupies
// (c-1)*n_tuples + n_tuples2 doubles. `in` and `out` must not overlap.
// No-op when either dimension is zero.
void local_transpose_col(int r, int c, int n_tuples, int n_tuples2,double * in, double *out ){

  if(r==0 || c==0) return;
  const int row_stride=(c-1)*n_tuples+n_tuples2; // doubles per input row
  int ptr=0;
  for(int j=0;j<c;j++){
    // Only the cell length differs for the last column; the source offset
    // formula is the same for every column.
    const int len=(j==c-1)?n_tuples2:n_tuples;
    for(int i=0;i<r;i++){
      memcpy (&out[ptr],&in[i*row_stride+j*n_tuples],sizeof(double)*len);
      ptr+=len;
    }
  }
  return;

}
624 
625 // in place local transpose
// In-place local transpose of an r x c matrix whose cells are n_tuples
// contiguous doubles, using cycle-following so only one cell of scratch
// space is needed. After the call A holds the c x r transpose.
// Fixes vs. original: the loop bounds no longer mix size_t counters with
// int limits (a negative r/c previously turned into a huge unsigned bound),
// non-positive dimensions are rejected up front, and the scratch-cell
// malloc is checked instead of dereferencing a possible NULL.
void local_transpose(int r, int c, int n_tuples, double* A){
  if(r<=0 || c<=0 || n_tuples<=0) return; // nothing to move
  const size_t R=(size_t)r, C=(size_t)c;

  double* buff=(double*)malloc(n_tuples*sizeof(double));
  if(buff==NULL){
    fprintf(stderr,"local_transpose: scratch allocation failed\n");
    abort(); // cannot proceed without corrupting A
  }

  size_t i_, j_;
  for(size_t i=0;i<R;i++)
    for(size_t j=0;j<C;j++){
      size_t src=j+C*i;      // index of cell (i,j) in the input layout
      size_t trg=i+R*j;      // its index in the transposed layout
      if(src==trg) continue; // fixed point: nothing to be done

      // Follow the permutation forward; only the minimal element of each
      // cycle (src < every index on the cycle) processes it, so each cycle
      // is rotated exactly once.
      size_t cycle_len=0;
      while(src<trg){ // find cycle
        i_=trg/C;
        j_=trg%C;
        trg=i_+R*j_;
        cycle_len++;
      }
      if(src!=trg) continue; // not this cycle's minimal element

      // Rotate the cycle backwards, one cell memcpy per step.
      memcpy(buff, A+trg*n_tuples, n_tuples*sizeof(double));
      for(size_t k=0;k<cycle_len;k++){ // reverse cycle
        j_=trg/R;
        i_=trg%R;
        src=j_+C*i_;
        memcpy(A+trg*n_tuples, A+src*n_tuples, n_tuples*sizeof(double));
        trg=src;
      }
      memcpy(A+trg*n_tuples, buff, n_tuples*sizeof(double));
    }
  free(buff);
}
658 
659 
// Global transpose of the distributed matrix in `data` using non-blocking
// point-to-point exchange (MPI_Isend/MPI_Irecv per peer). Three phases:
//   1) local column-transpose into a staging buffer (shuffle),
//   2) pairwise exchange of slabs (comm),
//   3) local transpose of the received slabs back into `data` (reshuffle).
// `timings` accumulates [total, shuffle, comm, reshuffle]. `flags` bit 0 =
// Transposed_In (not supported here; falls back to transpose_v5), bit 1 =
// Transposed_Out (skips phase 3). NOTE(review): this path appears intended
// for howmany==1 (the _h variant handles batches) — confirm against callers.
void fast_transpose_v1(T_Plan* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
    transpose_v5(T_plan,(double*)data,timings,flags,howmany,tag);
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  double * send_recv = T_plan->buffer;
  double * buffer_2= T_plan->buffer_2;
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;

  // Per-field distances in the input (row slabs) and output (column slabs).
  int idist=N[1]*local_n0*n_tuples;
  int odist=N[0]*local_n1*n_tuples;

  // "0*MPI_Wtime()" just initializes the accumulators to zero.
  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              std::cout<<'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  int ptr=0;
  shuffle_time-=MPI_Wtime();

  // Single-process shortcuts: the whole transpose is local.
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
  }
  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
  }
  if(nprocs==1){ // Transpose is done!
    shuffle_time+=MPI_Wtime();
    timings[0]+=MPI_Wtime();
    timings[0]+=shuffle_time;
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    MPI_Barrier(T_plan->comm);
    return;
  }

  // The idea is this: If The Output is to be transposed, then only one buffer is needed. The algorithm would be:
  // data --T-> send_recv ... send_recv --alltoall--> data
  // Otherwise buffer2 needs to be used as well:
  // data --T-> buffer_2 ... buffer_2 --alltoall--> send_recv ... send_recv -T-> data
  // The last column block may be narrower, hence the _col variant with
  // last_local_n1 as the ragged width.
  if(Flags[1]==1){
    local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv );
  }
  else if(Flags[0]==0 && Flags[1]==0){
    local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,T_plan->buffer_2 );
  }

  shuffle_time+=MPI_Wtime();
  ptr=0;
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<T_plan->buffer_2[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
              ptr+=n_tuples;
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }


  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  int* scount_proc= T_plan->scount_proc;
  int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;

  MPI_Datatype *stype=T_plan->stype;
  MPI_Datatype *rtype=T_plan->rtype;
  MPI_Barrier(T_plan->comm);

  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
  comm_time-=MPI_Wtime();

  int soffset=0,roffset=0;
  MPI_Status ierr;
  MPI_Request * s_request= new MPI_Request[nprocs];
  MPI_Request * request= new MPI_Request[nprocs];
#pragma omp parallel for
  for (int proc=0;proc<nprocs;++proc){
    request[proc]=MPI_REQUEST_NULL;
    s_request[proc]=MPI_REQUEST_NULL;
  }
  int counter=1;

  // Choose source/destination per the buffering scheme described above.
  // (Flags[0]==1 returned early, so exactly one branch applies.)
  double *s_buf, *r_buf;
  if(Flags[1]==1){
    s_buf=send_recv; r_buf=data;
  }
  else if(Flags[0]==0 && Flags[1]==0){
    s_buf=buffer_2; r_buf=send_recv;
  }
  // SEND
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      soffset=soffset_proc[proc];
      roffset=roffset_proc[proc];
      MPI_Isend(&s_buf[soffset],scount_proc[proc],MPI_DOUBLE,proc, tag,
          T_plan->comm, &s_request[proc]);
      MPI_Irecv(&r_buf[roffset],rcount_proc[proc],MPI_DOUBLE, proc,
          tag, T_plan->comm, &request[proc]);
    }
  }
  // Copy Your own part. See the note below for the if condition
  soffset=soffset_proc[procid];//aoffset_proc[proc];//proc*count_proc[proc];
  roffset=roffset_proc[procid];
  for(int h=0;h<howmany;h++)
    memcpy(&r_buf[h*odist+roffset],&s_buf[h*idist+soffset],sizeof(double)*scount_proc[procid]);


  // If the output is to be transposed locally, then everything is done in sendrecv. just copy it
  // Otherwise you have to perform the copy via the multiple stride transpose
  for (int proc=0;proc<nprocs;++proc){
    MPI_Wait(&request[proc], &ierr);
    MPI_Wait(&s_request[proc], &ierr);
  }
  comm_time+=MPI_Wtime();

  ptr=0;
  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<send_recv[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
              ptr+=n_tuples;
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  reshuffle_time-=MPI_Wtime();
  ptr=0;
  // Phase 3: only needed when the caller wants untransposed output layout.
  if(Flags[1]==0)
    local_transpose(N[0],local_n1,n_tuples,send_recv,data );




  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }


  reshuffle_time+=MPI_Wtime();
  MPI_Barrier(T_plan->comm);
  delete [] request;
  delete [] s_request;


  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
886 void fast_transpose_v2(T_Plan* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
887 
888  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
889  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
890  MPI_Barrier(T_plan->comm);
891  return;
892  }
893  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
894  transpose_v6(T_plan,(double*)data,timings,flags,howmany);
895  MPI_Barrier(T_plan->comm);
896  return;
897  }
898  timings[0]-=MPI_Wtime();
899  int nprocs, procid;
900  int nprocs_0, nprocs_1;
901  nprocs=T_plan->nprocs;
902  procid=T_plan->procid;
903  nprocs_0=T_plan->nprocs_0;
904  nprocs_1=T_plan->nprocs_1;
905  ptrdiff_t *N=T_plan->N;
906  double * send_recv = T_plan->buffer;
907  double * buffer_2= T_plan->buffer_2;
908  ptrdiff_t local_n0=T_plan->local_n0;
909  ptrdiff_t local_n1=T_plan->local_n1;
910  ptrdiff_t n_tuples=T_plan->n_tuples;
911 
912  int idist=N[1]*local_n0*n_tuples;
913  int odist=N[0]*local_n1*n_tuples;
914 
915  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
916 
917  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
918  if(VERBOSE>=2)
919  for(int h=0;h<howmany;h++)
920  for(int id=0;id<nprocs;++id){
921  if(procid==id)
922  for(int i=0;i<local_n0;i++){
923  std::cout<<std::endl;
924  for(int j=0;j<N[1];j++){
925  std::cout<<'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
926  }
927  }
928  std::cout<<'\n';
929  MPI_Barrier(T_plan->comm);
930  }
931 
932  //PCOUT<<" ============================================= "<<std::endl;
933  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
934  //PCOUT<<" ============================================= "<<std::endl;
935  int ptr=0;
936  shuffle_time-=MPI_Wtime();
937 
938 
939  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
940 #pragma omp parallel for
941  for(int h=0;h<howmany;h++)
942  local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
943  }
944  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
945 #pragma omp parallel for
946  for(int h=0;h<howmany;h++)
947  local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
948  }
949  if(nprocs==1){ // Transpose is done!
950  shuffle_time+=MPI_Wtime();
951  timings[0]+=MPI_Wtime();
952  timings[0]+=shuffle_time;
953  timings[1]+=shuffle_time;
954  timings[2]+=0;
955  timings[3]+=0;
956  MPI_Barrier(T_plan->comm);
957  return;
958  }
959 
960  // The idea is this: If The Output is to be transposed, then only one buffer is needed. The algorithm would be:
961  // data --T-> send_recv ... send_recv --alltoall--> data
962  // Otherwise buffer2 needs to be used as well:
963  // data --T-> buffer_2 ... buffer_2 --alltoall--> send_recv ... send_recv -T-> data
964  if(Flags[1]==1){
965  local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv );
966  }
967  else if(Flags[0]==0 && Flags[1]==0){
968  local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,T_plan->buffer_2 );
969  }
970 
971  shuffle_time+=MPI_Wtime();
972  ptr=0;
973  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
974  if(VERBOSE>=2)
975  for(int h=0;h<howmany;h++)
976  for(int id=0;id<nprocs;++id){
977  if(procid==id)
978  for(int i=0;i<N[1];i++){
979  std::cout<<std::endl;
980  for(int j=0;j<local_n0;j++){
981  std::cout<<'\t'<<T_plan->buffer_2[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
982  ptr+=n_tuples;
983  }
984  }
985  std::cout<<'\n';
986  MPI_Barrier(T_plan->comm);
987  }
988 
989 
990  //PCOUT<<" ============================================= "<<std::endl;
991  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
992  //PCOUT<<" ============================================= "<<std::endl;
993 
994  int* scount_proc_f= T_plan->scount_proc_f;
995  int* rcount_proc_f= T_plan->rcount_proc_f;
996  int* soffset_proc_f= T_plan->soffset_proc_f;
997  int* roffset_proc_f= T_plan->roffset_proc_f;
998 
999  MPI_Datatype *stype=T_plan->stype;
1000  MPI_Datatype *rtype=T_plan->rtype;
1001  MPI_Barrier(T_plan->comm);
1002 
1003  double *s_buf, *r_buf;
1004  if(Flags[1]==1){
1005  s_buf=send_recv; r_buf=data;
1006  }
1007  else if(Flags[0]==0 && Flags[1]==0){
1008  s_buf=buffer_2; r_buf=send_recv;
1009  }
1010  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1011  comm_time-=MPI_Wtime();
1012  if(T_plan->is_evenly_distributed==0)
1013  MPI_Alltoallv(s_buf,scount_proc_f,
1014  soffset_proc_f, MPI_DOUBLE,r_buf,
1015  rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
1016  T_plan->comm);
1017  else
1018  MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
1019  r_buf, rcount_proc_f[0], MPI_DOUBLE,
1020  T_plan->comm);
1021 
1022  comm_time+=MPI_Wtime();
1023 
1024  ptr=0;
1025  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
1026  if(VERBOSE>=2)
1027  for(int h=0;h<howmany;h++)
1028  for(int id=0;id<nprocs;++id){
1029  if(procid==id)
1030  for(int i=0;i<local_n1;i++){
1031  std::cout<<std::endl;
1032  for(int j=0;j<N[0];j++){
1033  std::cout<<'\t'<<send_recv[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
1034  ptr+=n_tuples;
1035  }
1036  }
1037  std::cout<<'\n';
1038  MPI_Barrier(T_plan->comm);
1039  }
1040 
1041  //PCOUT<<" ============================================= "<<std::endl;
1042  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
1043  //PCOUT<<" ============================================= "<<std::endl;
1044  reshuffle_time-=MPI_Wtime();
1045  ptr=0;
1046  if(Flags[1]==0)
1047  local_transpose(N[0],local_n1,n_tuples,send_recv,data );
1048 
1049 
1050 
1051 
1052  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1053  if(VERBOSE>=2)
1054  for(int h=0;h<howmany;h++)
1055  for(int id=0;id<nprocs_1;++id){
1056  if(procid==id)
1057  for(int i=0;i<local_n1;i++){
1058  std::cout<<std::endl;
1059  for(int j=0;j<N[0];j++){
1060  std::cout<<'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
1061  }
1062  }
1063  std::cout<<'\n';
1064  MPI_Barrier(T_plan->comm);
1065  }
1066 
1067 
1068  reshuffle_time+=MPI_Wtime();
1069  MPI_Barrier(T_plan->comm);
1070 
1071 
1072  if(VERBOSE>=1){
1073  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1074  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1075  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1076  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1077  }
1078  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1079  timings[1]+=shuffle_time;
1080  timings[2]+=comm_time;
1081  timings[3]+=reshuffle_time;
1082  return;
1083 }
1084 void fast_transpose_v3(T_Plan* T_plan, double * data, double *timings, int kway, unsigned flags, int howmany, int tag ){
1085 
1086  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
1087  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
1088  MPI_Barrier(T_plan->comm);
1089  return;
1090  }
1091  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
1092  transpose_v7(T_plan,(double*)data,timings,kway,flags,howmany);
1093  MPI_Barrier(T_plan->comm);
1094  return;
1095  }
1096  timings[0]-=MPI_Wtime();
1097  int nprocs, procid;
1098  int nprocs_0, nprocs_1;
1099  nprocs=T_plan->nprocs;
1100  procid=T_plan->procid;
1101  nprocs_0=T_plan->nprocs_0;
1102  nprocs_1=T_plan->nprocs_1;
1103  ptrdiff_t *N=T_plan->N;
1104  double * send_recv = T_plan->buffer;
1105  double * buffer_2= T_plan->buffer_2;
1106  ptrdiff_t local_n0=T_plan->local_n0;
1107  ptrdiff_t local_n1=T_plan->local_n1;
1108  ptrdiff_t n_tuples=T_plan->n_tuples;
1109 
1110  int idist=N[1]*local_n0*n_tuples;
1111  int odist=N[0]*local_n1*n_tuples;
1112 
1113  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1114 
1115  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
1116  if(VERBOSE>=2)
1117  for(int h=0;h<howmany;h++)
1118  for(int id=0;id<nprocs;++id){
1119  if(procid==id)
1120  for(int i=0;i<local_n0;i++){
1121  std::cout<<std::endl;
1122  for(int j=0;j<N[1];j++){
1123  std::cout<<'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
1124  }
1125  }
1126  std::cout<<'\n';
1127  MPI_Barrier(T_plan->comm);
1128  }
1129 
1130  //PCOUT<<" ============================================= "<<std::endl;
1131  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
1132  //PCOUT<<" ============================================= "<<std::endl;
1133  int ptr=0;
1134  shuffle_time-=MPI_Wtime();
1135 
1136 
1137  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1138 #pragma omp parallel for
1139  for(int h=0;h<howmany;h++)
1140  local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
1141  }
1142  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1143 #pragma omp parallel for
1144  for(int h=0;h<howmany;h++)
1145  local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
1146  }
1147  if(nprocs==1){ // Transpose is done!
1148  shuffle_time+=MPI_Wtime();
1149  timings[0]+=MPI_Wtime();
1150  timings[0]+=shuffle_time;
1151  timings[1]+=shuffle_time;
1152  timings[2]+=0;
1153  timings[3]+=0;
1154  MPI_Barrier(T_plan->comm);
1155  return;
1156  }
1157 
1158  // The idea is this: If The Output is to be transposed, then only one buffer is needed. The algorithm would be:
1159  // data --T-> send_recv ... send_recv --alltoall--> data
1160  // Otherwise buffer2 needs to be used as well:
1161  // data --T-> buffer_2 ... buffer_2 --alltoall--> send_recv ... send_recv -T-> data
1162  if(Flags[1]==1){
1163  local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv );
1164  }
1165  else if(Flags[0]==0 && Flags[1]==0){
1166  local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,T_plan->buffer_2 );
1167  }
1168 
1169  shuffle_time+=MPI_Wtime();
1170  ptr=0;
1171  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
1172  if(VERBOSE>=2)
1173  for(int h=0;h<howmany;h++)
1174  for(int id=0;id<nprocs;++id){
1175  if(procid==id)
1176  for(int i=0;i<N[1];i++){
1177  std::cout<<std::endl;
1178  for(int j=0;j<local_n0;j++){
1179  std::cout<<'\t'<<T_plan->buffer_2[ptr];//data[h*idist+(i*local_n0+j)*n_tuples];
1180  ptr+=n_tuples;
1181  }
1182  }
1183  std::cout<<'\n';
1184  MPI_Barrier(T_plan->comm);
1185  }
1186 
1187 
1188  //PCOUT<<" ============================================= "<<std::endl;
1189  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
1190  //PCOUT<<" ============================================= "<<std::endl;
1191 
1192  int* scount_proc_f= T_plan->scount_proc_f;
1193  int* rcount_proc_f= T_plan->rcount_proc_f;
1194  int* soffset_proc_f= T_plan->soffset_proc_f;
1195  int* roffset_proc_f= T_plan->roffset_proc_f;
1196 
1197  MPI_Datatype *stype=T_plan->stype;
1198  MPI_Datatype *rtype=T_plan->rtype;
1199  MPI_Barrier(T_plan->comm);
1200 
1201  double *s_buf, *r_buf;
1202  if(Flags[1]==1){
1203  s_buf=send_recv; r_buf=data;
1204  }
1205  else if(Flags[0]==0 && Flags[1]==0){
1206  s_buf=buffer_2; r_buf=send_recv;
1207  }
1208  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1209  comm_time-=MPI_Wtime();
1210  if(T_plan->kway_async)
1211  par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
1212  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1213  else
1214  par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
1215  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1216 
1217  comm_time+=MPI_Wtime();
1218 
1219  ptr=0;
1220  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
1221  if(VERBOSE>=2)
1222  for(int h=0;h<howmany;h++)
1223  for(int id=0;id<nprocs;++id){
1224  if(procid==id)
1225  for(int i=0;i<local_n1;i++){
1226  std::cout<<std::endl;
1227  for(int j=0;j<N[0];j++){
1228  std::cout<<'\t'<<send_recv[ptr];//send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
1229  ptr+=n_tuples;
1230  }
1231  }
1232  std::cout<<'\n';
1233  MPI_Barrier(T_plan->comm);
1234  }
1235 
1236  //PCOUT<<" ============================================= "<<std::endl;
1237  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
1238  //PCOUT<<" ============================================= "<<std::endl;
1239  reshuffle_time-=MPI_Wtime();
1240  ptr=0;
1241  if(Flags[1]==0)
1242  local_transpose(N[0],local_n1,n_tuples,send_recv,data );
1243 
1244 
1245 
1246 
1247  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1248  if(VERBOSE>=2)
1249  for(int h=0;h<howmany;h++)
1250  for(int id=0;id<nprocs_1;++id){
1251  if(procid==id)
1252  for(int i=0;i<local_n1;i++){
1253  std::cout<<std::endl;
1254  for(int j=0;j<N[0];j++){
1255  std::cout<<'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
1256  }
1257  }
1258  std::cout<<'\n';
1259  MPI_Barrier(T_plan->comm);
1260  }
1261 
1262 
1263  reshuffle_time+=MPI_Wtime();
1264  MPI_Barrier(T_plan->comm);
1265 
1266 
1267  if(VERBOSE>=1){
1268  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1269  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1270  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1271  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1272  }
1273  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1274  timings[1]+=shuffle_time;
1275  timings[2]+=comm_time;
1276  timings[3]+=reshuffle_time;
1277  return;
1278 }
1279 
1280 void fast_transpose_v1_h(T_Plan* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
1281 
1282  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
1283  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
1284  MPI_Barrier(T_plan->comm);
1285  return;
1286  }
1287  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
1288  transpose_v5(T_plan,(double*)data,timings,flags,howmany,tag);
1289  MPI_Barrier(T_plan->comm);
1290  return;
1291  }
1292  timings[0]-=MPI_Wtime();
1293  int nprocs, procid;
1294  int nprocs_0, nprocs_1;
1295  nprocs=T_plan->nprocs;
1296  procid=T_plan->procid;
1297  nprocs_0=T_plan->nprocs_0;
1298  nprocs_1=T_plan->nprocs_1;
1299  ptrdiff_t *N=T_plan->N;
1300  double * send_recv = T_plan->buffer;
1301  double * buffer_2= T_plan->buffer_2;
1302  ptrdiff_t local_n0=T_plan->local_n0;
1303  ptrdiff_t local_n1=T_plan->local_n1;
1304  ptrdiff_t n_tuples=T_plan->n_tuples;
1305 
1306  int idist=N[1]*local_n0*n_tuples;
1307  int odist=N[0]*local_n1*n_tuples;
1308 
1309  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1310 
1311  int ptr=0;
1312  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
1313  if(VERBOSE>=2)
1314  for(int h=0;h<howmany;h++)
1315  for(int id=0;id<nprocs;++id){
1316  if(procid==id)
1317  for(int i=0;i<local_n0;i++){
1318  std::cout<<std::endl;
1319  for(int j=0;j<N[1];j++){
1320  ptr=h*idist+(i*N[1]+j)*n_tuples;
1321  std::cout<<'\t'<<data[ptr]<<","<<data[ptr+1];
1322  }
1323  }
1324  std::cout<<'\n';
1325  MPI_Barrier(T_plan->comm);
1326  }
1327 
1328  //PCOUT<<" ============================================= "<<std::endl;
1329  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
1330  //PCOUT<<" ============================================= "<<std::endl;
1331  ptr=0;
1332  ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1333  ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1334  ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1335  ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1336  shuffle_time-=MPI_Wtime();
1337  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1338 #pragma omp parallel for
1339  for(int h=0;h<howmany;h++)
1340  local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
1341  }
1342  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1343 #pragma omp parallel for
1344  for(int h=0;h<howmany;h++)
1345  local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
1346  }
1347  if(nprocs==1){ // Transpose is done!
1348  shuffle_time+=MPI_Wtime();
1349  timings[0]+=MPI_Wtime();
1350  timings[0]+=shuffle_time;
1351  timings[1]+=shuffle_time;
1352  timings[2]+=0;
1353  timings[3]+=0;
1354  MPI_Barrier(T_plan->comm);
1355  return;
1356  }
1357 
1358  // The idea is this: If The Output is to be transposed, then only one buffer is needed. The algorithm would be:
1359  // data --T-> send_recv ... send_recv --alltoall--> data
1360  // Otherwise buffer2 needs to be used as well:
1361  // data --T-> buffer_2 ... buffer_2 --alltoall--> send_recv ... send_recv -T-> data
1362  ptr=0;
1363 
1364  for (int proc=0;proc<nprocs_1;++proc)
1365  for(int h=0;h<howmany;++h){
1366  for(int i=0;i<local_n0;++i){
1367  //for(int j=local_1_start_proc[proc];j<local_1_start_proc[proc]+local_n1_proc[proc];++j){
1368  // memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+j)*n_tuples],sizeof(double)*n_tuples);
1369  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1370  // ptr+=n_tuples;
1371  //}
1372  memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],sizeof(double)*n_tuples*local_n1_proc[proc]);
1373  ptr+=n_tuples*local_n1_proc[proc];
1374  }
1375  }
1376 
1377  shuffle_time+=MPI_Wtime();
1378  ptr=0;
1379  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
1380  if(VERBOSE>=2)
1381  for(int id=0;id<nprocs;++id){
1382  for(int h=0;h<howmany;h++)
1383  if(procid==id)
1384  for(int i=0;i<N[1];i++){
1385  std::cout<<std::endl;
1386  for(int j=0;j<local_n0;j++){
1387  std::cout<<'\t'<<buffer_2[ptr]<<","<<buffer_2[ptr+1];
1388  ptr+=n_tuples;
1389  }
1390  }
1391  std::cout<<'\n';
1392  MPI_Barrier(T_plan->comm);
1393  }
1394 
1395 
1396 
1397  //PCOUT<<" ============================================= "<<std::endl;
1398  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
1399  //PCOUT<<" ============================================= "<<std::endl;
1400 
1401  int* scount_proc= T_plan->scount_proc;
1402  int* rcount_proc= T_plan->rcount_proc;
1403  int* soffset_proc= T_plan->soffset_proc;
1404  int* roffset_proc= T_plan->roffset_proc;
1405 
1406  MPI_Barrier(T_plan->comm);
1407 
1408  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1409  comm_time-=MPI_Wtime();
1410 
1411  int soffset=0,roffset=0;
1412  MPI_Status ierr;
1413  MPI_Request * s_request= new MPI_Request[nprocs];
1414  MPI_Request * request= new MPI_Request[nprocs];
1415 #pragma omp parallel for
1416  for (int proc=0;proc<nprocs;++proc){
1417  request[proc]=MPI_REQUEST_NULL;
1418  s_request[proc]=MPI_REQUEST_NULL;
1419  }
1420  int counter=1;
1421 
1422  double *s_buf, *r_buf;
1423  s_buf=buffer_2; r_buf=send_recv;
1424  // SEND
1425  for (int proc=0;proc<nprocs;++proc){
1426  if(proc!=procid){
1427  soffset=soffset_proc[proc];
1428  roffset=roffset_proc[proc];
1429  MPI_Isend(&s_buf[soffset*howmany],scount_proc[proc]*howmany,MPI_DOUBLE,proc, tag,
1430  T_plan->comm, &s_request[proc]);
1431  MPI_Irecv(&r_buf[roffset*howmany],rcount_proc[proc]*howmany,MPI_DOUBLE, proc,
1432  tag, T_plan->comm, &request[proc]);
1433  }
1434  }
1435  soffset=soffset_proc[procid];//aoffset_proc[proc];//proc*count_proc[proc];
1436  roffset=roffset_proc[procid];
1437  memcpy(&r_buf[roffset*howmany],&s_buf[soffset*howmany],howmany*sizeof(double)*scount_proc[procid]);
1438 
1439 
1440  // If the output is to be transposed locally, then everything is done in sendrecv. just copy it
1441  // Otherwise you have to perform the copy via the multiple stride transpose
1442  for (int proc=0;proc<nprocs;++proc){
1443  MPI_Wait(&request[proc], &ierr);
1444  MPI_Wait(&s_request[proc], &ierr);
1445  }
1446  comm_time+=MPI_Wtime();
1447 
1448  ptr=0;
1449  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
1450  if(VERBOSE>=2)
1451  for(int id=0;id<nprocs;++id){
1452  if(procid==id)
1453  for(int h=0;h<howmany;h++)
1454  for(int i=0;i<local_n1;i++){
1455  std::cout<<std::endl;
1456  for(int j=0;j<N[0];j++){
1457  std::cout<<'\t'<<send_recv[ptr]<<","<<send_recv[ptr+1];
1458  ptr+=n_tuples;
1459  }
1460  }
1461  std::cout<<'\n';
1462  MPI_Barrier(T_plan->comm);
1463  }
1464 
1465  //PCOUT<<" ============================================= "<<std::endl;
1466  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
1467  //PCOUT<<" ============================================= "<<std::endl;
1468  reshuffle_time-=MPI_Wtime();
1469  ptr=0;
1470  //memset(data,0,T_plan->alloc_local);
1471  for (int proc=0;proc<nprocs_0;++proc)
1472  for(int h=0;h<howmany;++h){
1473  for(int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
1474  memcpy(&data[h*odist+(i*local_n1)*n_tuples],&send_recv[ptr],local_n1*sizeof(double)*n_tuples);
1475  //std::cout<<"proc= "<<proc<<" h= "<<h<<" i=("<<i<<") data_ptr= "<<h*odist+(i*local_n1)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*local_n1_proc[proc] <<std::endl;
1476  ptr+=n_tuples*local_n1;
1477  //for(int j=0*local_1_start_proc[proc];j<0*local_1_start_proc[proc]+local_n1;++j){
1478  // memcpy(&data[h*odist+(i*local_n1+j)*n_tuples],&send_recv[ptr],sizeof(double)*n_tuples);
1479  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1480  // ptr+=n_tuples;
1481  //}
1482  }
1483  }
1484 
1485  // Right now the data is in transposed out format.
1486  // If the user did not want this layout, transpose again.
1487  if(Flags[1]==0){
1488 #pragma omp parallel for
1489  for(int h=0;h<howmany;h++)
1490  local_transpose(N[0],local_n1,n_tuples,&data[h*odist] );
1491  }
1492 
1493  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1494  if(VERBOSE>=2)
1495  for(int id=0;id<nprocs_1;++id){
1496  if(procid==id)
1497  for(int h=0;h<howmany;h++)
1498  for(int i=0;i<N[0];i++){
1499  std::cout<<std::endl;
1500  for(int j=0;j<local_n1;j++){
1501  ptr=h*odist+(i*local_n1+j)*n_tuples;
1502  std::cout<<'\t'<<data[ptr]<<","<<data[ptr+1];
1503  }
1504  }
1505  std::cout<<'\n';
1506  MPI_Barrier(T_plan->comm);
1507  }
1508 
1509 
1510  reshuffle_time+=MPI_Wtime();
1511  MPI_Barrier(T_plan->comm);
1512  delete [] request;
1513  delete [] s_request;
1514 
1515 
1516  if(VERBOSE>=1){
1517  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1518  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1519  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1520  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1521  }
1522  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1523  timings[1]+=shuffle_time;
1524  timings[2]+=comm_time;
1525  timings[3]+=reshuffle_time;
1526  return;
1527 }
1528 void fast_transpose_v2_h(T_Plan* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
1529 
1530  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
1531  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
1532  MPI_Barrier(T_plan->comm);
1533  return;
1534  }
1535  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
1536  transpose_v6(T_plan,(double*)data,timings,flags,howmany);
1537  MPI_Barrier(T_plan->comm);
1538  return;
1539  }
1540  timings[0]-=MPI_Wtime();
1541  int nprocs, procid;
1542  int nprocs_0, nprocs_1;
1543  nprocs=T_plan->nprocs;
1544  procid=T_plan->procid;
1545  nprocs_0=T_plan->nprocs_0;
1546  nprocs_1=T_plan->nprocs_1;
1547  ptrdiff_t *N=T_plan->N;
1548  double * send_recv = T_plan->buffer;
1549  double * buffer_2= T_plan->buffer_2;
1550  ptrdiff_t local_n0=T_plan->local_n0;
1551  ptrdiff_t local_n1=T_plan->local_n1;
1552  ptrdiff_t n_tuples=T_plan->n_tuples;
1553 
1554  int idist=N[1]*local_n0*n_tuples;
1555  int odist=N[0]*local_n1*n_tuples;
1556 
1557  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1558 
1559  int ptr=0;
1560  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
1561  if(VERBOSE>=2)
1562  for(int h=0;h<howmany;h++)
1563  for(int id=0;id<nprocs;++id){
1564  if(procid==id)
1565  for(int i=0;i<local_n0;i++){
1566  std::cout<<std::endl;
1567  for(int j=0;j<N[1];j++){
1568  ptr=h*idist+(i*N[1]+j)*n_tuples;
1569  std::cout<<'\t'<<data[ptr]<<","<<data[ptr+1];
1570  }
1571  }
1572  std::cout<<'\n';
1573  MPI_Barrier(T_plan->comm);
1574  }
1575 
1576  //PCOUT<<" ============================================= "<<std::endl;
1577  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
1578  //PCOUT<<" ============================================= "<<std::endl;
1579  ptr=0;
1580  ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1581  ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1582  ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1583  ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1584  shuffle_time-=MPI_Wtime();
1585  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1586 #pragma omp parallel for
1587  for(int h=0;h<howmany;h++)
1588  local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
1589  }
1590  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1591 #pragma omp parallel for
1592  for(int h=0;h<howmany;h++)
1593  local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
1594  }
1595  if(nprocs==1){ // Transpose is done!
1596  shuffle_time+=MPI_Wtime();
1597  timings[0]+=MPI_Wtime();
1598  timings[0]+=shuffle_time;
1599  timings[1]+=shuffle_time;
1600  timings[2]+=0;
1601  timings[3]+=0;
1602  MPI_Barrier(T_plan->comm);
1603  return;
1604  }
1605 
1606  // The idea is this: If The Output is to be transposed, then only one buffer is needed. The algorithm would be:
1607  // data --T-> send_recv ... send_recv --alltoall--> data
1608  // Otherwise buffer2 needs to be used as well:
1609  // data --T-> buffer_2 ... buffer_2 --alltoall--> send_recv ... send_recv -T-> data
1610  ptr=0;
1611 
1612  for (int proc=0;proc<nprocs_1;++proc)
1613  for(int h=0;h<howmany;++h){
1614  for(int i=0;i<local_n0;++i){
1615  //for(int j=local_1_start_proc[proc];j<local_1_start_proc[proc]+local_n1_proc[proc];++j){
1616  // memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+j)*n_tuples],sizeof(double)*n_tuples);
1617  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1618  // ptr+=n_tuples;
1619  //}
1620  memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],sizeof(double)*n_tuples*local_n1_proc[proc]);
1621  ptr+=n_tuples*local_n1_proc[proc];
1622  }
1623  }
1624 
1625  shuffle_time+=MPI_Wtime();
1626  ptr=0;
1627  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
1628  if(VERBOSE>=2)
1629  for(int id=0;id<nprocs;++id){
1630  for(int h=0;h<howmany;h++)
1631  if(procid==id)
1632  for(int i=0;i<N[1];i++){
1633  std::cout<<std::endl;
1634  for(int j=0;j<local_n0;j++){
1635  std::cout<<'\t'<<buffer_2[ptr]<<","<<buffer_2[ptr+1];
1636  ptr+=n_tuples;
1637  }
1638  }
1639  std::cout<<'\n';
1640  MPI_Barrier(T_plan->comm);
1641  }
1642 
1643 
1644 
1645  //PCOUT<<" ============================================= "<<std::endl;
1646  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
1647  //PCOUT<<" ============================================= "<<std::endl;
1648 
1649  int* scount_proc_f= T_plan->scount_proc_f;
1650  int* rcount_proc_f= T_plan->rcount_proc_f;
1651  int* soffset_proc_f= T_plan->soffset_proc_f;
1652  int* roffset_proc_f= T_plan->roffset_proc_f;
1653 
1654  MPI_Barrier(T_plan->comm);
1655 
1656  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1657  comm_time-=MPI_Wtime();
1658 
1659  int soffset=0,roffset=0;
1660  MPI_Status ierr;
1661  int counter=1;
1662 
1663  double *s_buf, *r_buf;
1664  s_buf=buffer_2; r_buf=send_recv;
1665  // SEND
1666  if(T_plan->is_evenly_distributed==0)
1667  MPI_Alltoallv(s_buf,scount_proc_f,
1668  soffset_proc_f, MPI_DOUBLE,r_buf,
1669  rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
1670  T_plan->comm);
1671  else
1672  MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
1673  r_buf, rcount_proc_f[0], MPI_DOUBLE,
1674  T_plan->comm);
1675 
1676  comm_time+=MPI_Wtime();
1677 
1678 
1679  ptr=0;
1680  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
1681  if(VERBOSE>=2)
1682  for(int id=0;id<nprocs;++id){
1683  if(procid==id)
1684  for(int h=0;h<howmany;h++)
1685  for(int i=0;i<local_n1;i++){
1686  std::cout<<std::endl;
1687  for(int j=0;j<N[0];j++){
1688  std::cout<<'\t'<<send_recv[ptr]<<","<<send_recv[ptr+1];
1689  ptr+=n_tuples;
1690  }
1691  }
1692  std::cout<<'\n';
1693  MPI_Barrier(T_plan->comm);
1694  }
1695 
1696  //PCOUT<<" ============================================= "<<std::endl;
1697  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
1698  //PCOUT<<" ============================================= "<<std::endl;
1699  reshuffle_time-=MPI_Wtime();
1700  ptr=0;
1701  for (int proc=0;proc<nprocs_0;++proc)
1702  for(int h=0;h<howmany;++h){
1703  for(int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
1704  memcpy(&data[h*odist+(i*local_n1)*n_tuples],&send_recv[ptr],local_n1*sizeof(double)*n_tuples);
1705  ptr+=n_tuples*local_n1;
1706  //for(int j=0*local_1_start_proc[proc];j<0*local_1_start_proc[proc]+local_n1;++j){
1707  // memcpy(&data[h*odist+(i*local_n1+j)*n_tuples],&send_recv[ptr],sizeof(double)*n_tuples);
1708  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1709  // ptr+=n_tuples;
1710  //}
1711  }
1712  }
1713 
1714  // Right now the data is in transposed out format.
1715  // If the user did not want this layout, transpose again.
1716  if(Flags[1]==0){
1717 #pragma omp parallel for
1718  for(int h=0;h<howmany;h++)
1719  local_transpose(N[0],local_n1,n_tuples,&data[h*odist] );
1720  }
1721 
1722  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1723  if(VERBOSE>=2)
1724  for(int id=0;id<nprocs_1;++id){
1725  if(procid==id)
1726  for(int h=0;h<howmany;h++)
1727  for(int i=0;i<N[0];i++){
1728  std::cout<<std::endl;
1729  for(int j=0;j<local_n1;j++){
1730  ptr=h*odist+(i*local_n1+j)*n_tuples;
1731  std::cout<<'\t'<<data[ptr]<<","<<data[ptr+1];
1732  }
1733  }
1734  std::cout<<'\n';
1735  MPI_Barrier(T_plan->comm);
1736  }
1737 
1738  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1739 
1740  reshuffle_time+=MPI_Wtime();
1741  MPI_Barrier(T_plan->comm);
1742 
1743 
1744  if(VERBOSE>=1){
1745  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1746  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1747  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1748  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1749  }
1750  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1751  timings[1]+=shuffle_time;
1752  timings[2]+=comm_time;
1753  timings[3]+=reshuffle_time;
1754  return;
1755 }
1756 void fast_transpose_v3_h(T_Plan* T_plan, double * data, double *timings,int kway, unsigned flags, int howmany, int tag ){
1757 
1758  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
1759  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
1760  MPI_Barrier(T_plan->comm);
1761  return;
1762  }
1763  if(Flags[0]==1){ // If Flags==Transposed_In This function can not handle it, call other versions
1764  transpose_v7(T_plan,(double*)data,timings,kway,flags,howmany);
1765  MPI_Barrier(T_plan->comm);
1766  return;
1767  }
1768  timings[0]-=MPI_Wtime();
1769  int nprocs, procid;
1770  int nprocs_0, nprocs_1;
1771  nprocs=T_plan->nprocs;
1772  procid=T_plan->procid;
1773  nprocs_0=T_plan->nprocs_0;
1774  nprocs_1=T_plan->nprocs_1;
1775  ptrdiff_t *N=T_plan->N;
1776  double * send_recv = T_plan->buffer;
1777  double * buffer_2= T_plan->buffer_2;
1778  ptrdiff_t local_n0=T_plan->local_n0;
1779  ptrdiff_t local_n1=T_plan->local_n1;
1780  ptrdiff_t n_tuples=T_plan->n_tuples;
1781 
1782  int idist=N[1]*local_n0*n_tuples;
1783  int odist=N[0]*local_n1*n_tuples;
1784 
1785  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
1786 
1787  int ptr=0;
1788  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
1789  if(VERBOSE>=2)
1790  for(int h=0;h<howmany;h++)
1791  for(int id=0;id<nprocs;++id){
1792  if(procid==id)
1793  for(int i=0;i<local_n0;i++){
1794  std::cout<<std::endl;
1795  for(int j=0;j<N[1];j++){
1796  ptr=h*idist+(i*N[1]+j)*n_tuples;
1797  std::cout<<'\t'<<data[ptr]<<","<<data[ptr+1];
1798  }
1799  }
1800  std::cout<<'\n';
1801  MPI_Barrier(T_plan->comm);
1802  }
1803 
1804  //PCOUT<<" ============================================= "<<std::endl;
1805  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
1806  //PCOUT<<" ============================================= "<<std::endl;
1807  ptr=0;
1808  ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1809  ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1810  ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1811  ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1812  shuffle_time-=MPI_Wtime();
1813  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1814 #pragma omp parallel for
1815  for(int h=0;h<howmany;h++)
1816  local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
1817  }
1818  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1819 #pragma omp parallel for
1820  for(int h=0;h<howmany;h++)
1821  local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
1822  }
1823  if(nprocs==1){ // Transpose is done!
1824  shuffle_time+=MPI_Wtime();
1825  timings[0]+=MPI_Wtime();
1826  timings[0]+=shuffle_time;
1827  timings[1]+=shuffle_time;
1828  timings[2]+=0;
1829  timings[3]+=0;
1830  MPI_Barrier(T_plan->comm);
1831  return;
1832  }
1833 
1834  // The idea is this: If The Output is to be transposed, then only one buffer is needed. The algorithm would be:
1835  // data --T-> send_recv ... send_recv --alltoall--> data
1836  // Otherwise buffer2 needs to be used as well:
1837  // data --T-> buffer_2 ... buffer_2 --alltoall--> send_recv ... send_recv -T-> data
1838  ptr=0;
1839 
1840  for (int proc=0;proc<nprocs_1;++proc)
1841  for(int h=0;h<howmany;++h){
1842  for(int i=0;i<local_n0;++i){
1843  //for(int j=local_1_start_proc[proc];j<local_1_start_proc[proc]+local_n1_proc[proc];++j){
1844  // memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+j)*n_tuples],sizeof(double)*n_tuples);
1845  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1846  // ptr+=n_tuples;
1847  //}
1848  memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],sizeof(double)*n_tuples*local_n1_proc[proc]);
1849  ptr+=n_tuples*local_n1_proc[proc];
1850  }
1851  }
1852 
1853  shuffle_time+=MPI_Wtime();
1854  ptr=0;
1855  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
1856  if(VERBOSE>=2)
1857  for(int id=0;id<nprocs;++id){
1858  for(int h=0;h<howmany;h++)
1859  if(procid==id)
1860  for(int i=0;i<N[1];i++){
1861  std::cout<<std::endl;
1862  for(int j=0;j<local_n0;j++){
1863  std::cout<<'\t'<<buffer_2[ptr]<<","<<buffer_2[ptr+1];
1864  ptr+=n_tuples;
1865  }
1866  }
1867  std::cout<<'\n';
1868  MPI_Barrier(T_plan->comm);
1869  }
1870 
1871 
1872 
1873  //PCOUT<<" ============================================= "<<std::endl;
1874  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
1875  //PCOUT<<" ============================================= "<<std::endl;
1876 
1877  int* scount_proc_f= T_plan->scount_proc_f;
1878  int* rcount_proc_f= T_plan->rcount_proc_f;
1879  int* soffset_proc_f= T_plan->soffset_proc_f;
1880  int* roffset_proc_f= T_plan->roffset_proc_f;
1881 
1882  MPI_Barrier(T_plan->comm);
1883 
1884  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1885  comm_time-=MPI_Wtime();
1886 
1887  int soffset=0,roffset=0;
1888  MPI_Status ierr;
1889  int counter=1;
1890 
1891  double *s_buf, *r_buf;
1892  s_buf=buffer_2; r_buf=send_recv;
1893 
1894  if(T_plan->kway_async)
1895  par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
1896  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1897  else
1898  par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
1899  r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1900 
1901  comm_time+=MPI_Wtime();
1902 
1903 
1904  ptr=0;
1905  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
1906  if(VERBOSE>=2)
1907  for(int id=0;id<nprocs;++id){
1908  if(procid==id)
1909  for(int h=0;h<howmany;h++)
1910  for(int i=0;i<local_n1;i++){
1911  std::cout<<std::endl;
1912  for(int j=0;j<N[0];j++){
1913  std::cout<<'\t'<<send_recv[ptr]<<","<<send_recv[ptr+1];
1914  ptr+=n_tuples;
1915  }
1916  }
1917  std::cout<<'\n';
1918  MPI_Barrier(T_plan->comm);
1919  }
1920 
1921  //PCOUT<<" ============================================= "<<std::endl;
1922  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
1923  //PCOUT<<" ============================================= "<<std::endl;
1924  reshuffle_time-=MPI_Wtime();
1925  ptr=0;
1926  for (int proc=0;proc<nprocs_0;++proc)
1927  for(int h=0;h<howmany;++h){
1928  for(int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
1929  memcpy(&data[h*odist+(i*local_n1)*n_tuples],&send_recv[ptr],local_n1*sizeof(double)*n_tuples);
1930  ptr+=n_tuples*local_n1;
1931  //for(int j=0*local_1_start_proc[proc];j<0*local_1_start_proc[proc]+local_n1;++j){
1932  // memcpy(&data[h*odist+(i*local_n1+j)*n_tuples],&send_recv[ptr],sizeof(double)*n_tuples);
1933  // //std::cout<<"proc= "<<proc<<" h= "<<h<<" (i,j)=("<<i<<","<<j<<") data_ptr= "<<h*idist+(i*local_n1+j)*n_tuples<< " ptr= "<<ptr <<" cpy= "<<n_tuples*T_plan->local_n1_proc[0] <<std::endl;
1934  // ptr+=n_tuples;
1935  //}
1936  }
1937  }
1938 
1939  // Right now the data is in transposed out format.
1940  // If the user did not want this layout, transpose again.
1941  if(Flags[1]==0){
1942 #pragma omp parallel for
1943  for(int h=0;h<howmany;h++)
1944  local_transpose(N[0],local_n1,n_tuples,&data[h*odist] );
1945  }
1946 
1947  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
1948  if(VERBOSE>=2)
1949  for(int id=0;id<nprocs_1;++id){
1950  if(procid==id)
1951  for(int h=0;h<howmany;h++)
1952  for(int i=0;i<N[0];i++){
1953  std::cout<<std::endl;
1954  for(int j=0;j<local_n1;j++){
1955  ptr=h*odist+(i*local_n1+j)*n_tuples;
1956  std::cout<<'\t'<<data[ptr]<<","<<data[ptr+1];
1957  }
1958  }
1959  std::cout<<'\n';
1960  MPI_Barrier(T_plan->comm);
1961  }
1962 
1963  //PCOUT<<"nprocs_0= "<<nprocs_0<<" nprocs_1= "<<nprocs_1<<std::endl;
1964 
1965  reshuffle_time+=MPI_Wtime();
1966  MPI_Barrier(T_plan->comm);
1967 
1968 
1969  if(VERBOSE>=1){
1970  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
1971  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
1972  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
1973  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
1974  }
1975  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
1976  timings[1]+=shuffle_time;
1977  timings[2]+=comm_time;
1978  timings[3]+=reshuffle_time;
1979  return;
1980 }
1981 
/*
 * Global 2D matrix transpose, version 5: non-blocking point-to-point
 * exchange using one MPI_Isend/MPI_Irecv pair per peer with the derived
 * MPI datatypes (stype/rtype) prepared in the plan.
 *
 * @param T_plan  transpose plan: sizes, counts/offsets, derived datatypes,
 *                scratch buffer, and the communicator.
 * @param data    in/out array; batch h lives at data[h*idist] on input and
 *                data[h*odist] on output.
 * @param timings accumulators: [0] total, [1] shuffle, [2] comm, [3] reshuffle.
 * @param flags   bit 0 = input is already locally transposed,
 *                bit 1 = leave the output in transposed layout.
 * @param howmany number of independent transforms batched together.
 * @param tag     MPI message tag used for this exchange.
 */
void transpose_v5(T_Plan* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  double * send_recv = T_plan->buffer;
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;

  // Strides (in doubles) between consecutive batch elements, before/after.
  int idist=N[1]*local_n0*n_tuples;
  int odist=N[0]*local_n1*n_tuples;

  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              std::cout<<'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  int ptr=0;
  shuffle_time-=MPI_Wtime();

  // First local transpose: bring each local_n0 x N[1] slab into N[1] x local_n0
  // order so that the data destined for each peer is contiguous.
  if(Flags[0]==0){
    if(howmany==1)
      local_transpose(local_n0,N[1],n_tuples,data );
    else{
#pragma omp parallel for
      for(int i=0;i<howmany;i++)
        local_transpose(local_n0,N[1],n_tuples,&data[i*idist] );
    }
  }
  shuffle_time+=MPI_Wtime();
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<data[h*idist+(i*local_n0+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  // Single-process shortcut: no communication needed; at most undo the
  // local transpose to honor the requested output layout.
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
#pragma omp parallel for
    for(int h=0;h<howmany;h++)
      local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
  }
  if(nprocs==1){ // Transpose is done!
    timings[0]+=MPI_Wtime();
    timings[0]+=shuffle_time;
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    MPI_Barrier(T_plan->comm);
    return;
  }


  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  int* scount_proc= T_plan->scount_proc;
  int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;

  // Derived datatypes encode the strided send/recv layout per peer.
  MPI_Datatype *stype=T_plan->stype;
  MPI_Datatype *rtype=T_plan->rtype;
  MPI_Barrier(T_plan->comm);

  comm_time-=MPI_Wtime();

  int soffset=0,roffset=0;
  MPI_Status ierr;
  MPI_Request * s_request= new MPI_Request[nprocs];
  MPI_Request * request= new MPI_Request[nprocs];
#pragma omp parallel for
  for (int proc=0;proc<nprocs;++proc){
    request[proc]=MPI_REQUEST_NULL;
    s_request[proc]=MPI_REQUEST_NULL;
  }

  // SEND: one non-blocking send per remote peer; the self contribution is
  // copied directly in the RECV loop below.
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      soffset=soffset_proc[proc];
      MPI_Isend(&data[soffset],1,stype[proc],proc, tag,
          T_plan->comm, &s_request[proc]);
    }
  }
  // RECV
  for (int proc=0;proc<nprocs;++proc){
    if(proc!=procid){
      roffset=roffset_proc[proc];
      MPI_Irecv(&send_recv[roffset],1, rtype[proc], proc,
          tag, T_plan->comm, &request[proc]);
    }
    else{
      // Local (self) exchange: plain memcpy per batch element.
      soffset=soffset_proc[proc];//aoffset_proc[proc];//proc*count_proc[proc];
      roffset=roffset_proc[proc];
      for(int h=0;h<howmany;h++)
        memcpy(&send_recv[h*odist+roffset],&data[h*idist+soffset],sizeof(double)*scount_proc[proc]);
    }
  }

  // Waiting on MPI_REQUEST_NULL (the self slot) is a no-op, so a single
  // uniform wait loop is safe.
  for (int proc=0;proc<nprocs;++proc){
    MPI_Wait(&request[proc], &ierr);
    MPI_Wait(&s_request[proc], &ierr);
  }
  comm_time+=MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== 2nd Local Transpose ========== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  reshuffle_time-=MPI_Wtime();
  ptr=0;


  // int first_local_n0, last_local_n0;
  // first_local_n0=local_n0_proc[0]; last_local_n0=local_n0_proc[nprocs_1-1];

  //local_transpose(nprocs_1,local_n1,n_tuples*local_n0,send_recv,data );

  // Second local transpose: interleave the per-process chunks in send_recv
  // into row-major order in data. The last process may own fewer rows, so
  // its chunk size (last_ntuples) can differ from the others (first_ntuples).
  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
  if(local_n1!=0)
    last_ntuples=T_plan->last_recv_count/((int)local_n1);
#pragma omp parallel for
  for(int i=0;i<howmany;i++){
    if(local_n1==1)
      memcpy(&data[i*odist],&send_recv[i*odist],T_plan->alloc_local/howmany ); // you are done, no additional transpose is needed.
    else if(last_ntuples!=first_ntuples){
      // Uneven chunks: transpose the first nprocs_0-1 equal chunks, then
      // merge the odd-sized last chunk in a second pass.
      local_transpose((nprocs_0-1),local_n1,first_ntuples,&send_recv[i*odist] );
      local_transpose(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[i*odist],&data[i*odist] );
    }
    else if(last_ntuples==first_ntuples){
      //local_transpose(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
      local_transpose(nprocs_0,local_n1,first_ntuples,&send_recv[i*odist],&data[i*odist] );
    }
  }



  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }


  if(Flags[1]==1){ // Transpose output
    if(howmany==1)
      local_transpose(local_n1,N[0],n_tuples,data );
    else{
#pragma omp parallel for
      for(int h=0;h<howmany;h++)
        local_transpose(local_n1,N[0],n_tuples,&data[h*odist] );
    }
  }
  reshuffle_time+=MPI_Wtime();
  delete [] request;
  delete [] s_request;

  if(VERBOSE>=2) PCOUT<<"Transposed Out"<<std::endl;
  if(VERBOSE>=2)
    for(int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<N[0];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n1;j++){
              std::cout<<'\t'<<data[h*odist+(i*local_n1+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }
  MPI_Barrier(T_plan->comm);

  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
/*
 * Global 2D matrix transpose, version 6: collective exchange. Picks
 * MPI_Alltoallw (batched case), MPI_Alltoallv (uneven distribution), or
 * plain MPI_Alltoall (even distribution).
 *
 * @param T_plan  transpose plan: sizes, counts/offsets, derived datatypes,
 *                scratch buffer, and the communicator.
 * @param data    in/out array; batch h lives at data[h*idist] on input and
 *                data[h*odist] on output.
 * @param timings accumulators: [0] total, [1] shuffle, [2] comm, [3] reshuffle.
 * @param flags   bit 0 = input is already locally transposed,
 *                bit 1 = leave the output in transposed layout.
 * @param howmany number of independent transforms batched together.
 */
void transpose_v6(T_Plan* T_plan, double * data, double *timings, unsigned flags, int howmany ){

  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }
  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  double * send_recv = T_plan->buffer;
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;
  // Strides (in doubles) between consecutive batch elements, before/after.
  int idist=N[1]*local_n0*n_tuples;
  int odist=N[0]*local_n1*n_tuples;

  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2)
    for (int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n0;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[1];j++){
              std::cout<<'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  int ptr=0;
  shuffle_time-=MPI_Wtime();

  // First local transpose so the data destined for each peer is contiguous.
  if(Flags[0]==0){
    for(int i=0;i<howmany;i++)
      local_transpose(local_n0,N[1],n_tuples,&data[i*local_n0*N[1]*n_tuples] );
  }

  shuffle_time+=MPI_Wtime();
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2)
    for (int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<N[1];i++){
            std::cout<<std::endl;
            for(int j=0;j<local_n0;j++){
              std::cout<<'\t'<<data[idist*h+(i*local_n0+j)*n_tuples]<<","<<data[idist*h+(i*local_n0+j)*n_tuples+1];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  // Single-process shortcut: no communication needed.
  // NOTE(review): unlike transpose_v5, this path transposes only the first
  // batch element (no howmany loop) — presumably v6 is used with howmany==1
  // here; verify against callers before relying on batched single-rank use.
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
    local_transpose(local_n1,N[0],n_tuples,data );
  }
  if(nprocs==1){ // Transpose is done!
    timings[0]+=MPI_Wtime();
    timings[0]+=shuffle_time;
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    MPI_Barrier(T_plan->comm);
    return;
  }


  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  int* scount_proc= T_plan->scount_proc;
  int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;
  MPI_Datatype *stype=T_plan->stype;
  MPI_Datatype *rtype=T_plan->rtype;

  MPI_Barrier(T_plan->comm);

  comm_time-=MPI_Wtime();

  MPI_Status status;

  // Collective exchange. Alltoallw allows a distinct derived datatype per
  // peer, which is what the batched (howmany>1) layout requires.
  if(howmany>1){
    MPI_Alltoallw(data, T_plan->scount_proc_w,
        T_plan->soffset_proc_w, stype,
        send_recv,T_plan->rcount_proc_w, T_plan->roffset_proc_w,
        rtype, T_plan->comm);
  }
  else if(T_plan->is_evenly_distributed==0)
    MPI_Alltoallv(data,scount_proc,
        soffset_proc, MPI_DOUBLE,send_recv,
        rcount_proc,roffset_proc, MPI_DOUBLE,
        T_plan->comm);
  else
    MPI_Alltoall(data, scount_proc[0], MPI_DOUBLE,
        send_recv, rcount_proc[0], MPI_DOUBLE,
        T_plan->comm);

  comm_time+=MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2)
    for (int h=0;h<howmany;h++)
      for(int id=0;id<nprocs;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<send_recv[odist*h+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }




  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== 2nd Local Transpose ========== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  reshuffle_time-=MPI_Wtime();
  ptr=0;


  // int first_local_n0, last_local_n0;
  // first_local_n0=local_n0_proc[0]; last_local_n0=local_n0_proc[nprocs_1-1];

  //local_transpose(nprocs_1,local_n1,n_tuples*local_n0,send_recv,data );

  // Second local transpose: interleave the per-process chunks in send_recv
  // into row-major order in data. The last process may own fewer rows, so
  // its chunk size (last_ntuples) can differ from the others (first_ntuples).
  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
  if(local_n1!=0)
    last_ntuples=T_plan->last_recv_count/((int)local_n1);
  for(int i=0;i<howmany;i++){
    if(local_n1==1)
      memcpy(&data[i*odist],&send_recv[i*odist],T_plan->alloc_local/howmany ); // you are done, no additional transpose is needed.
    else if(last_ntuples!=first_ntuples){
      local_transpose((nprocs_0-1),local_n1,first_ntuples,&send_recv[i*odist] );
      local_transpose(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[i*odist],&data[i*odist] );
    }
    else if(last_ntuples==first_ntuples){
      //local_transpose(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
      local_transpose(nprocs_0,local_n1,first_ntuples,&send_recv[i*odist],&data[i*odist] );
    }
  }


  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2)
    for (int h=0;h<howmany;h++)
      for(int id=0;id<nprocs_1;++id){
        if(procid==id)
          for(int i=0;i<local_n1;i++){
            std::cout<<std::endl;
            for(int j=0;j<N[0];j++){
              std::cout<<'\t'<<data[odist*h+(i*N[0]+j)*n_tuples];
            }
          }
        std::cout<<'\n';
        MPI_Barrier(T_plan->comm);
      }

  if(Flags[1]==1){ // Transpose output
    for (int h=0;h<howmany;h++)
      local_transpose(local_n1,N[0],n_tuples,&data[odist*h] );

    if(VERBOSE>=2) PCOUT<<"Transposed Out"<<std::endl;
    if(VERBOSE>=2)
      for (int h=0;h<howmany;h++)
        for(int id=0;id<nprocs_1;++id){
          if(procid==id)
            for(int i=0;i<N[0];i++){
              std::cout<<std::endl;
              for(int j=0;j<local_n1;j++){
                std::cout<<'\t'<<data[odist*h+(i*local_n1+j)*n_tuples];
              }
            }
          std::cout<<'\n';
          MPI_Barrier(T_plan->comm);
        }


  }
  reshuffle_time+=MPI_Wtime();

  MPI_Barrier(T_plan->comm);
  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
/*
 * Global 2D matrix transpose, version 7: exchange via the k-way
 * personalized all-to-all (par::Mpi_Alltoallv_dense), in its async or
 * sync variant depending on T_plan->kway_async.
 *
 * @param T_plan  transpose plan: sizes, counts/offsets, scratch buffer,
 *                and the communicator.
 * @param data    in/out array of doubles.
 * @param timings accumulators: [0] total, [1] shuffle, [2] comm, [3] reshuffle.
 * @param kway    fan-out of the k-way all-to-all exchange.
 * @param flags   bit 0 = input is already locally transposed,
 *                bit 1 = leave the output in transposed layout.
 * @param howmany number of independent transforms batched together.
 */
void transpose_v7(T_Plan* T_plan, double * data, double *timings,int kway, unsigned flags, int howmany ){
  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
    MPI_Barrier(T_plan->comm);
    return;
  }

  timings[0]-=MPI_Wtime();
  int nprocs, procid;
  int nprocs_0, nprocs_1;
  nprocs=T_plan->nprocs;
  procid=T_plan->procid;
  nprocs_0=T_plan->nprocs_0;
  nprocs_1=T_plan->nprocs_1;
  ptrdiff_t *N=T_plan->N;
  double * send_recv = T_plan->buffer;
  ptrdiff_t local_n0=T_plan->local_n0;
  ptrdiff_t local_n1=T_plan->local_n1;
  ptrdiff_t n_tuples=T_plan->n_tuples;
  // Strides (in doubles) between consecutive batch elements, before/after.
  int idist=N[1]*local_n0*n_tuples;
  int odist=N[0]*local_n1*n_tuples;

  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
  if(VERBOSE>=2)
    for(int id=0;id<nprocs;++id){
      if(procid==id)
        for(int i=0;i<local_n0;i++){
          std::cout<<std::endl;
          for(int j=0;j<N[1];j++){
            std::cout<<'\t'<<data[(i*N[1]+j)*n_tuples];
          }
        }
      std::cout<<'\n';
      MPI_Barrier(T_plan->comm);
    }

  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;
  int ptr=0;
  shuffle_time-=MPI_Wtime();

  // First local transpose so the data destined for each peer is contiguous.
  if(Flags[0]==0){
    for(int i=0;i<howmany;i++)
      local_transpose(local_n0,N[1],n_tuples,&data[i*local_n0*N[1]*n_tuples] );
  }

  shuffle_time+=MPI_Wtime();
  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
  if(VERBOSE>=2)
    for(int id=0;id<nprocs;++id){
      if(procid==id)
        for(int i=0;i<N[1];i++){
          std::cout<<std::endl;
          for(int j=0;j<local_n0;j++){
            std::cout<<'\t'<<data[(i*local_n0+j)*n_tuples]<<","<<data[(i*local_n0+j)*n_tuples+1];
          }
        }
      std::cout<<'\n';
      MPI_Barrier(T_plan->comm);
    }
  // Single-process shortcut: no communication needed.
  // NOTE(review): only the first batch element is transposed here (no
  // howmany loop), unlike transpose_v5 — confirm v7 is only called with
  // howmany==1 on a single rank.
  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
    local_transpose(local_n1,N[0],n_tuples,data );
  }

  if(nprocs==1){ // Transpose is done!
    timings[0]+=MPI_Wtime();
    timings[0]+=shuffle_time;
    timings[1]+=shuffle_time;
    timings[2]+=0;
    timings[3]+=0;
    MPI_Barrier(T_plan->comm);
    return;
  }


  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  int* scount_proc= T_plan->scount_proc;
  int* rcount_proc= T_plan->rcount_proc;
  int* soffset_proc= T_plan->soffset_proc;
  int* roffset_proc= T_plan->roffset_proc;

  comm_time-=MPI_Wtime();

  // K-way all-to-all; the template flag selects the asynchronous variant.
  if(T_plan->kway_async)
    par::Mpi_Alltoallv_dense<double,true>(data , scount_proc, soffset_proc,
        send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
  else
    par::Mpi_Alltoallv_dense<double,false>(data , scount_proc, soffset_proc,
        send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
  comm_time+=MPI_Wtime();

  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
  if(VERBOSE>=2)
    for(int id=0;id<nprocs;++id){
      if(procid==id)
        for(int i=0;i<local_n1;i++){
          std::cout<<std::endl;
          for(int j=0;j<N[0];j++){
            std::cout<<'\t'<<send_recv[(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
          }
        }
      std::cout<<'\n';
      MPI_Barrier(T_plan->comm);
    }




  //PCOUT<<" ============================================= "<<std::endl;
  //PCOUT<<" ============== 2nd Local Transpose ========== "<<std::endl;
  //PCOUT<<" ============================================= "<<std::endl;

  reshuffle_time-=MPI_Wtime();
  ptr=0;


  // int first_local_n0, last_local_n0;
  // first_local_n0=local_n0_proc[0]; last_local_n0=local_n0_proc[nprocs_1-1];

  //local_transpose(nprocs_1,local_n1,n_tuples*local_n0,send_recv,data );

  // Second local transpose: interleave the per-process chunks in send_recv
  // into row-major order in data. The last process may own fewer rows, so
  // its chunk size (last_ntuples) can differ from the others (first_ntuples).
  int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
  if(local_n1!=0)
    last_ntuples=T_plan->last_recv_count/((int)local_n1);

  for(int i=0;i<howmany;i++){
    if(local_n1==1)
      memcpy(&data[i*odist],&send_recv[i*odist],T_plan->alloc_local/howmany ); // you are done, no additional transpose is needed.
    else if(last_ntuples!=first_ntuples){
      local_transpose((nprocs_0-1),local_n1,first_ntuples,&send_recv[i*odist] );
      local_transpose(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[i*odist],&data[i*odist] );
    }
    else if(last_ntuples==first_ntuples){
      //local_transpose(nprocs_0,local_n1,n_tuples*local_n0,send_recv );
      local_transpose(nprocs_0,local_n1,first_ntuples,&send_recv[i*odist],&data[i*odist] );
    }
  }


  if(VERBOSE>=2) PCOUT<<"2nd Transpose"<<std::endl;
  if(VERBOSE>=2)
    for(int id=0;id<nprocs_1;++id){
      if(procid==id)
        for(int i=0;i<local_n1;i++){
          std::cout<<std::endl;
          for(int j=0;j<N[0];j++){
            std::cout<<'\t'<<data[(i*N[0]+j)*n_tuples];
          }
        }
      std::cout<<'\n';
      MPI_Barrier(T_plan->comm);
    }

  // NOTE(review): same as above — the final output transpose only covers
  // the first batch element; verify intended for howmany>1.
  if(Flags[1]==1){ // Transpose output
    local_transpose(local_n1,N[0],n_tuples,data );
  }
  reshuffle_time+=MPI_Wtime();
  MPI_Barrier(T_plan->comm);
  if(VERBOSE>=1){
    PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
    PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
    PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
    PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
  }
  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
  timings[1]+=shuffle_time;
  timings[2]+=comm_time;
  timings[3]+=reshuffle_time;
  return;
}
2616 
2617 void transpose_v8(T_Plan* T_plan, double * data, double *timings, unsigned flags, int howmany, int tag ){
2618 
2619  std::bitset<8> Flags(flags); // 1 Transposed in, 2 Transposed out
2620  if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){ // If Flags==Transposed_Out return
2621  MPI_Barrier(T_plan->comm);
2622  return;
2623  }
2624  timings[0]-=MPI_Wtime();
2625  int nprocs, procid;
2626  int nprocs_0, nprocs_1;
2627  nprocs=T_plan->nprocs;
2628  procid=T_plan->procid;
2629 
2630  nprocs_0=T_plan->nprocs_0;
2631  nprocs_1=T_plan->nprocs_1;
2632  ptrdiff_t *N=T_plan->N;
2633  double * send_recv = T_plan->buffer;
2634  ptrdiff_t local_n0=T_plan->local_n0;
2635  ptrdiff_t local_n1=T_plan->local_n1;
2636  ptrdiff_t n_tuples=T_plan->n_tuples;
2637 
2638  int idist=N[1]*local_n0*n_tuples;
2639  int odist=N[0]*local_n1*n_tuples;
2640 
2641  double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
2642 
2643  if(VERBOSE>=2) PCOUT<<"INPUT:"<<std::endl;
2644  if(VERBOSE>=2)
2645  for(int h=0;h<howmany;h++)
2646  for(int id=0;id<nprocs;++id){
2647  if(procid==id)
2648  for(int i=0;i<local_n0;i++){
2649  std::cout<<std::endl;
2650  for(int j=0;j<N[1];j++){
2651  std::cout<<'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
2652  }
2653  }
2654  std::cout<<'\n';
2655  MPI_Barrier(T_plan->comm);
2656  }
2657 
2658  //PCOUT<<" ============================================= "<<std::endl;
2659  //PCOUT<<" ============== Local Transpose============= "<<std::endl;
2660  //PCOUT<<" ============================================= "<<std::endl;
2661  int ptr=0;
2662  shuffle_time-=MPI_Wtime();
2663  if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2664 #pragma omp parallel for
2665  for(int h=0;h<howmany;h++)
2666  local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
2667  }
2668  if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2669 #pragma omp parallel for
2670  for(int h=0;h<howmany;h++)
2671  local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
2672  }
2673  if(nprocs==1){ // Transpose is done!
2674  shuffle_time+=MPI_Wtime();
2675  timings[0]+=MPI_Wtime();
2676  timings[0]+=shuffle_time;
2677  timings[1]+=shuffle_time;
2678  timings[2]+=0;
2679  timings[3]+=0;
2680  MPI_Barrier(T_plan->comm);
2681  return;
2682  }
2683 
2684  PCOUT<<"\n flags="<<flags<<std::endl;
2685  // if the input is transposed, then transpose it to get to non transposed input
2686  if(Flags[0]==1){
2687 #pragma omp parallel for
2688  for(int i=0;i<howmany;i++)
2689  local_transpose(local_n0,N[1],n_tuples,&data[i*idist] );
2690  if(VERBOSE>=2) PCOUT<<"Local Transpose:"<<std::endl;
2691  if(VERBOSE>=2)
2692  for(int h=0;h<howmany;h++)
2693  for(int id=0;id<nprocs;++id){
2694  if(procid==id)
2695  for(int i=0;i<N[1];i++){
2696  std::cout<<std::endl;
2697  for(int j=0;j<local_n0;j++){
2698  std::cout<<'\t'<<data[h*idist+(i*local_n0+j)*n_tuples];
2699  }
2700  }
2701  std::cout<<'\n';
2702  MPI_Barrier(T_plan->comm);
2703  }
2704  }
2705  shuffle_time+=MPI_Wtime();
2706 
2707  // if the input is transposed and the output needs to be transposed locally then do the local transpose
2708 
2709 
2710  //PCOUT<<" ============================================= "<<std::endl;
2711  //PCOUT<<" ============== MPIALLTOALL =============== "<<std::endl;
2712  //PCOUT<<" ============================================= "<<std::endl;
2713 
2714  int* scount_proc= T_plan->scount_proc_v8;
2715  int* rcount_proc= T_plan->rcount_proc_v8;
2716  int* soffset_proc= T_plan->soffset_proc_v8;
2717  int* roffset_proc= T_plan->roffset_proc_v8;
2718 
2719  MPI_Datatype* stype=T_plan->stype_v8;
2720  MPI_Datatype* rtype=T_plan->rtype_v8;
2721  if(0)
2722  for(int proc=0;proc<nprocs;proc++){
2723  if(procid==proc){
2724  std::cout<<std::endl;
2725  std::cout<<"proc= "<<proc<<" l_n0= "<<local_n0<<" l_n1= "<<local_n1<<std::endl;
2726  for (int i=0;i<nprocs;i++){
2727  std::cout<<" sc["<<i<<"]= "<<scount_proc[i];
2728  std::cout<<" so["<<i<<"]= "<<soffset_proc[i]<<std::endl;
2729  }
2730  std::cout<<std::endl;
2731  std::cout<<std::endl;
2732  }
2733  MPI_Barrier(T_plan->comm);
2734  }
2735  MPI_Barrier(T_plan->comm);
2736  comm_time-=MPI_Wtime();
2737 
2738  int soffset=0,roffset=0;
2739  MPI_Status ierr;
2740  MPI_Request * s_request= new MPI_Request[nprocs];
2741  MPI_Request * request= new MPI_Request[nprocs];
2742 #pragma omp parallel for
2743  for (int proc=0;proc<nprocs;++proc){
2744  request[proc]=MPI_REQUEST_NULL;
2745  s_request[proc]=MPI_REQUEST_NULL;
2746  }
2747 
2748  // SEND
2749  for (int proc=0;proc<nprocs;++proc){
2750  soffset=soffset_proc[proc];
2751  MPI_Isend(&data[soffset],1,stype[proc],proc, tag,
2752  T_plan->comm, &s_request[proc]);
2753  }
2754  // RECV
2755  for (int proc=0;proc<nprocs;++proc){
2756  roffset=roffset_proc[proc];
2757  MPI_Irecv(&send_recv[roffset],1, rtype[proc], proc,
2758  tag, T_plan->comm, &request[proc]);
2759  }
2760 
2761  for (int proc=0;proc<nprocs;++proc){
2762  MPI_Wait(&request[proc], &ierr);
2763  MPI_Wait(&s_request[proc], &ierr);
2764  }
2765  comm_time+=MPI_Wtime();
2766  if(0)
2767  for(int i=0;i<local_n1*N[0];i++)
2768  std::cout<<" "<<send_recv[i*n_tuples]<<std::endl;
2769 
2770  if(VERBOSE>=2) PCOUT<<"MPIAlltoAll:"<<std::endl;
2771  if(VERBOSE>=2)
2772  for(int h=0;h<howmany;h++)
2773  for(int id=0;id<nprocs;++id){
2774  if(procid==id)
2775  for(int i=0;i<local_n1;i++){
2776  std::cout<<std::endl;
2777  for(int j=0;j<N[0];j++){
2778  std::cout<<'\t'<<send_recv[h*odist+(i*N[0]+j)*n_tuples];//<<","<<send_recv[(i*N[0]+j)*n_tuples+1];
2779  }
2780  }
2781  std::cout<<'\n';
2782  MPI_Barrier(T_plan->comm);
2783  }
2784 
2785  //PCOUT<<" ============================================= "<<std::endl;
2786  //PCOUT<<" ============== 2nd Local Trnaspose ========== "<<std::endl;
2787  //PCOUT<<" ============================================= "<<std::endl;
2788  reshuffle_time-=MPI_Wtime();
2789  ptr=0;
2790 
2791 
2792  // TODO : interleave this memcpy into the send recvs!
2793  // write a tranposed out dat type.
2794  // implement the how many case.
2795  // If none of these worked try the mpi start!
2796  memcpy(data,send_recv,T_plan->alloc_local);
2797  if(Flags[1]==1){ // Transpose output
2798  if(howmany==1)
2799  local_transpose(local_n1,N[0],n_tuples,data );
2800  else{
2801 #pragma omp parallel for
2802  for(int h=0;h<howmany;h++)
2803  local_transpose(local_n1,N[0],n_tuples,&data[h*odist] );
2804  }
2805 
2806  if(VERBOSE>=2) PCOUT<<"Transposed Out"<<std::endl;
2807  if(VERBOSE>=2)
2808  for(int h=0;h<howmany;h++)
2809  for(int id=0;id<nprocs_1;++id){
2810  if(procid==id)
2811  for(int i=0;i<N[0];i++){
2812  std::cout<<std::endl;
2813  for(int j=0;j<local_n1;j++){
2814  std::cout<<'\t'<<data[h*odist+(i*local_n1+j)*n_tuples];
2815  }
2816  }
2817  std::cout<<'\n';
2818  MPI_Barrier(T_plan->comm);
2819  }
2820  }
2821  reshuffle_time+=MPI_Wtime();
2822  delete [] request;
2823  delete [] s_request;
2824  MPI_Barrier(T_plan->comm);
2825 
2826  if(VERBOSE>=1){
2827  PCOUT<<"Shuffle Time= "<<shuffle_time<<std::endl;
2828  PCOUT<<"Alltoall Time= "<<comm_time<<std::endl;
2829  PCOUT<<"Reshuffle Time= "<<reshuffle_time<<std::endl;
2830  PCOUT<<"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
2831  }
2832  timings[0]+=MPI_Wtime();//timings[0]+=shuffle_time+comm_time+reshuffle_time;
2833  timings[1]+=shuffle_time;
2834  timings[2]+=comm_time;
2835  timings[3]+=reshuffle_time;
2836  return;
2837 }