32 #include "transpose.h"
// Rank-0-only logging helper: expands to `if(procid==0) std::cout`, so a
// variable named `procid` must be in scope at every use site.
// NOTE(review): the expansion is an unbraced `if`; an `else` written after a
// statement that starts with PCOUT would bind to this hidden `if`.
35 #define PCOUT if(procid==0) std::cout
// Returns true when exactly one bit of `x` is set, i.e. x == 2^k for some
// k >= 0.
//
// Zero is rejected explicitly: the bare `(x & (x - 1)) == 0` test is also
// satisfied by x == 0, which is not a power of two.
//
// The parameter is spelled `unsigned long` rather than the non-standard
// `ulong` typedef (the same type on glibc) so the helper does not depend on
// a platform-specific header.
static bool IsPowerOfTwo(unsigned long x)
{
  return x != 0 && (x & (x - 1)) == 0;
}
// Mem_Mgr constructor: determines `alloc_local` (bytes) and allocates the two
// 64-byte-aligned, zero-filled scratch buffers `buffer` and `buffer_2`.
//
// Parameters (as used by the visible statements):
//   Comm                   communicator queried for rank and size.
//   specified_alloc_local  when non-zero it is assigned to `alloc_local`;
//                          when zero the size is derived from the slab
//                          decomposition computed below.
//   N0, N1, tuples, howmany  not referenced by any visible statement.
//
// NOTE(review): this listing carries fused line numbers and appears to be
// missing lines (closing braces, an `else`, the assignments of `N[]` and
// `n_tuples`). The comments below describe only what is visible.
42 Mem_Mgr::Mem_Mgr(
int N0,
int N1,
int tuples, MPI_Comm Comm,
int howmany,
int specified_alloc_local){
// Store this process's rank and the communicator size in members.
48 MPI_Comm_rank(Comm, &procid);
49 MPI_Comm_size(Comm,&nprocs);
// No explicit size requested: compute it from the per-rank slab sizes.
53 if(specified_alloc_local==0){
// Temporary per-rank slab sizes along dimension 0 and dimension 1.
// NOTE(review): no matching free() is visible in this listing - confirm.
55 ptrdiff_t * local_n0_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
56 ptrdiff_t * local_n1_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
// Each iteration writes only element `proc` of the two arrays.
57 #pragma omp parallel for
58 for (
int proc=0;proc<nprocs;++proc){
// Nominal slab size: ceil(N / nprocs).
59 local_n0_proc[proc]=ceil(N[0]/(
double)nprocs);
60 local_n1_proc[proc]=ceil(N[1]/(
double)nprocs);
// If fewer than a full slab remains for this rank, give it the remainder.
// The trailing `*= (int)x > 0` multiplies by 1 when the remainder is
// positive and by 0 otherwise, clamping negative remainders to zero.
63 if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
64 if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}
// This rank's own slab sizes.
69 local_n0=local_n0_proc[procid];
70 local_n1=local_n1_proc[procid];
// alloc_local = max(local_n0*N[1], local_n1*N[0]) * n_tuples * sizeof(double),
// i.e. large enough for the data in either of the two layouts.
77 alloc_local=local_n0*N[1]*n_tuples*
sizeof(double);
78 if(alloc_local<local_n1*N[0]*n_tuples*
sizeof(
double))
79 alloc_local=local_n1*N[0]*n_tuples*
sizeof(
double);
// Caller-supplied size (presumably the `else` branch of the test above; the
// `else` keyword itself is not visible in this listing).
84 alloc_local=specified_alloc_local;
// Size test against roughly 1.05 GiB; the statement it guards is not
// visible here - TODO confirm what it selects.
86 if( alloc_local<=1.05*std::pow(2,30) )
// Allocate both scratch buffers with 64-byte alignment.
// NOTE(review): the posix_memalign return codes are ignored, so an
// allocation failure would only surface in the memset calls below.
91 posix_memalign((
void **)&buffer,64, alloc_local);
92 posix_memalign((
void **)&buffer_2,64, alloc_local);
// Zero-fill both buffers.
93 memset( buffer,0, alloc_local );
94 memset( buffer_2,0, alloc_local );
// T_Plan constructor: builds the per-rank bookkeeping for a distributed
// transpose - slab sizes and start offsets along both dimensions, send and
// receive counts and offsets in three variants (plain, `_f`, `_w`), one pair
// of MPI vector datatypes per rank - and borrows the scratch buffers owned by
// `Mem_mgr`.
//
// NOTE(review): this listing carries fused line numbers and appears to be
// missing lines (closing braces, `else` keywords, several guards and the
// assignments of `N[]` / `n_tuples`). Comments describe visible statements
// only.
103 T_Plan::T_Plan(
int N0,
int N1,
int tuples, Mem_Mgr * Mem_mgr, MPI_Comm Comm,
int howmany){
// Store this process's rank and the communicator size in members.
108 MPI_Comm_rank(Comm, &procid);
109 MPI_Comm_size(Comm,&nprocs);
// Per-rank slab sizes and start indices along dimensions 0 and 1.
111 local_n0_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
112 local_n1_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
113 local_0_start_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
114 local_1_start_proc=(ptrdiff_t*) malloc(
sizeof(ptrdiff_t)*nprocs);
// Rank 0 starts at index 0 in both dimensions.
118 local_0_start_proc[0]=0;local_1_start_proc[0]=0;
119 for (
int proc=0;proc<nprocs;++proc){
// Nominal slab size: ceil(N / nprocs).
120 local_n0_proc[proc]=ceil(N[0]/(
double)nprocs);
121 local_n1_proc[proc]=ceil(N[1]/(
double)nprocs);
// Trailing ranks take the remainder; `*= (int)x > 0` clamps a negative
// remainder to zero.
124 if((N[0]-local_n0_proc[proc]*proc)<local_n0_proc[proc]) {local_n0_proc[proc]=N[0]-local_n0_proc[proc]*proc; local_n0_proc[proc]*=(int) local_n0_proc[proc]>0;}
125 if((N[1]-local_n1_proc[proc]*proc)<local_n1_proc[proc]) {local_n1_proc[proc]=N[1]-local_n1_proc[proc]*proc;local_n1_proc[proc]*=(int) local_n1_proc[proc]>0;}
// Start index = previous rank's start + previous rank's size.
// NOTE(review): these read index proc-1; a guard excluding proc==0 is
// presumably on a line missing from this listing - confirm.
128 local_0_start_proc[proc]=local_0_start_proc[proc-1]+local_n0_proc[proc-1];
129 local_1_start_proc[proc]=local_1_start_proc[proc-1]+local_n1_proc[proc-1];
// This rank's own slab sizes and start indices.
134 local_n0=local_n0_proc[procid];
135 local_n1=local_n1_proc[procid];
136 local_0_start=local_0_start_proc[procid];
137 local_1_start=local_1_start_proc[procid];
// Buffer size in bytes, as computed by the memory manager.
140 alloc_local=Mem_mgr->alloc_local;
// nprocs_0 / nprocs_1 are reset and then updated per rank with a non-empty
// slab; the update statements themselves are not visible (presumably
// increments, giving the number of ranks that hold data - confirm).
142 nprocs_0=0; nprocs_1=0;
143 for (
int proc=0;proc<nprocs;++proc){
144 if(local_n0_proc[proc]!=0)
146 if(local_n1_proc[proc]!=0)
// Send/receive counts and offsets, one entry per rank (element units).
153 scount_proc=(
int*) malloc(
sizeof(
int)*nprocs);
154 rcount_proc=(
int*) malloc(
sizeof(
int)*nprocs);
155 soffset_proc=(
int*) malloc(
sizeof(
int)*nprocs);
156 roffset_proc=(
int*) malloc(
sizeof(
int)*nprocs);
// `_f` variants: the plain values scaled by `howmany` (filled further down).
158 scount_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
159 rcount_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
160 soffset_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
161 roffset_proc_f=(
int*) malloc(
sizeof(
int)*nprocs);
// `_w` variants: only the offsets are filled in the visible code (scaled by
// 8 further down).
163 scount_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
164 rcount_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
165 soffset_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
166 roffset_proc_w=(
int*) malloc(
sizeof(
int)*nprocs);
// Case 1: more ranks hold dimension-1 data than dimension-0 data.
178 if(nprocs_1>nprocs_0)
179 for (
int proc=0;proc<nprocs;++proc){
// Elements sent to `proc`: its dimension-1 slab times this rank's
// dimension-0 slab.
181 scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;
// Two alternative receive counts follow; the `else` separating them is
// not visible in this listing.
183 if(scount_proc[proc]!=0)
184 rcount_proc[proc]=local_n1_proc[proc]*local_n0_proc[proc]*n_tuples;
186 rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;
// Offsets are either zeroed or set to the running sum of the previous
// rank's offset and count; the guards choosing between these
// assignments are not visible - TODO confirm against the full source.
188 soffset_proc[proc]=0;
189 roffset_proc[proc]=0;
191 soffset_proc[proc]=0;
192 roffset_proc[proc]=0;
195 soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
196 roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
// Ranks that are sent nothing get a zero send offset.
203 if(scount_proc[proc]==0) soffset_proc[proc]=0;
207 roffset_proc[proc]=0;
// Remember the last non-zero receive count and dimension-1 slab size seen
// while scanning the ranks in order.
209 if(rcount_proc[proc]!=0)
210 last_recv_count=rcount_proc[proc];
211 if(local_n1_proc[proc]!=0)
212 last_local_n1=local_n1_proc[proc];
// Case 2: at least as many ranks hold dimension-0 data as dimension-1 data.
214 else if(nprocs_1<=nprocs_0)
215 for (
int proc=0;proc<nprocs;++proc){
217 scount_proc[proc]=local_n1_proc[proc]*local_n0*n_tuples;
218 rcount_proc[proc]=local_n1*local_n0_proc[proc]*n_tuples;
// Same offset scheme as in case 1; the selecting guards are not visible.
222 soffset_proc[proc]=0;
223 roffset_proc[proc]=0;
225 soffset_proc[proc]=0;
226 roffset_proc[proc]=0;
229 soffset_proc[proc]=soffset_proc[proc-1]+scount_proc[proc-1];
230 roffset_proc[proc]=roffset_proc[proc-1]+rcount_proc[proc-1];
236 roffset_proc[proc]=0;
237 soffset_proc[proc]=0;
240 if(scount_proc[proc]==0) soffset_proc[proc]=0;
244 soffset_proc[proc]=0;
246 if(rcount_proc[proc]!=0)
247 last_recv_count=rcount_proc[proc];
249 if(local_n1_proc[proc]!=0)
250 last_local_n1=local_n1_proc[proc];
// The layout counts as evenly distributed only when every rank holds data in
// both dimensions and the slab sizes tile N[0] and N[1] exactly.
254 is_evenly_distributed=0;
255 if((local_n0*nprocs_0-N[0])==0 && (local_n1*nprocs_1-N[1])==0 && nprocs_0==nprocs_1 && nprocs_0==nprocs){
256 is_evenly_distributed=1;
// One send and one receive datatype per rank.
// NOTE(review): no MPI_Type_free for these is visible in this listing.
262 stype=
new MPI_Datatype[nprocs];
263 rtype=
new MPI_Datatype[nprocs];
268 for (
int i=0;i<nprocs;i++){
// Vector type: `howmany` blocks of scount/rcount doubles, with a stride of
// one full local slab (local_n0*N[1] or local_n1*N[0] tuples).
269 MPI_Type_vector(howmany,scount_proc[i],local_n0*N[1]*n_tuples, MPI_DOUBLE, &stype[i]);
270 MPI_Type_vector(howmany,rcount_proc[i],local_n1*N[0]*n_tuples, MPI_DOUBLE, &rtype[i]);
272 MPI_Type_commit(&stype[i]);
273 MPI_Type_commit(&rtype[i]);
// `_w` offsets: element offsets times 8 (presumably sizeof(double), i.e.
// byte offsets - confirm).
275 soffset_proc_w[i]=soffset_proc[i]*8;
276 roffset_proc_w[i]=roffset_proc[i]*8;
// `_f` counts and offsets: the plain values scaled by `howmany`.
280 soffset_proc_f[i]=soffset_proc[i]*howmany;
281 roffset_proc_f[i]=roffset_proc[i]*howmany;
282 scount_proc_f[i]= scount_proc[i]*howmany;
283 rcount_proc_f[i]= rcount_proc[i]*howmany;
// Copy the memory manager's buffer pointers; no allocation happens here.
307 buffer=Mem_mgr->buffer;
308 buffer_2=Mem_mgr->buffer_2;
309 buffer_d=Mem_mgr->buffer_d;
// Times the transpose_v5 / transpose_v6 / transpose_v7 variants on `data`,
// reduces the timings with MPI_MAX over T_plan->comm, and records settings
// for the fastest one in the plan (`kway` and `kway_async` are visibly set;
// the statements storing the chosen method are not visible).
//
// Timing slots, each of the two arrays holding 2*(int)log2(nprocs)+3 doubles:
//   [0]                      transpose_v5
//   [1]                      transpose_v6
//   [2 + i]                  transpose_v7 with kway = nprocs / 2^i (first sweep)
//   [2 + (int)log2(nprocs) + i]  transpose_v7, second sweep
//   [2*(int)log2(nprocs) + 2]    tested during selection; no visible
//                                statement in this function measures it.
//
// NOTE(review): this listing carries fused line numbers and is missing lines
// (declarations of `dummy`, `kway`, `method`; loop bodies; what switches
// async mode between the two sweeps; any free() of the timing arrays).
313 void T_Plan::which_method(T_Plan* T_plan,
double* data){
316 double * time= (
double*) malloc(
sizeof(
double)*(2*(int)log2(nprocs)+3));
317 double * g_time= (
double*) malloc(
sizeof(
double)*(2*(int)log2(nprocs)+3));
// Loop over every timing slot; the body is not visible (presumably
// initialises the slots - confirm).
318 for (
int i=0;i<2*(int)log2(nprocs)+3;i++)
// Each variant is called once untimed and then once between MPI_Wtime reads.
321 transpose_v5(T_plan,(
double*)data,dummy);
322 time[0]=-MPI_Wtime();
323 transpose_v5(T_plan,(
double*)data,dummy);
324 time[0]+=MPI_Wtime();
326 transpose_v6(T_plan,(
double*)data,dummy);
327 time[1]=-MPI_Wtime();
328 transpose_v6(T_plan,(
double*)data,dummy);
329 time[1]+=MPI_Wtime();
// The k-way variant is only tried on power-of-two runs with at least 512
// ranks. With (int)log2(nprocs) >= 9, the six slots written by each sweep
// stay inside the arrays.
331 if(IsPowerOfTwo(nprocs) && nprocs>511){
// First sweep: kway = nprocs, nprocs/2, ..., nprocs/32.
334 for (
int i=0;i<6;i++){
335 kway=nprocs/std::pow(2,i);
336 MPI_Barrier(T_plan->comm);
337 transpose_v7(T_plan,(
double*)data,dummy,kway);
338 time[2+i]=-MPI_Wtime();
339 transpose_v7(T_plan,(
double*)data,dummy,kway);
340 time[2+i]+=MPI_Wtime();
// Second sweep over the same kway values, stored in the second slot range.
346 for (
int i=0;i<6;i++){
347 kway=nprocs/std::pow(2,i);
348 MPI_Barrier(T_plan->comm);
349 transpose_v7(T_plan,(
double*)data,dummy,kway);
350 time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
351 transpose_v7(T_plan,(
double*)data,dummy,kway);
352 time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
// Slowest rank decides: element-wise maximum across the communicator.
362 MPI_Allreduce(time,g_time,(2*(
int)log2(nprocs)+3),MPI_DOUBLE,MPI_MAX, T_plan->comm);
// Rank 0 prints every reduced timing.
364 if(T_plan->procid==0){
365 for(
int i=0;i<2*(int)log2(nprocs)+3;++i)
366 std::cout<<
" time["<<i<<
"]= "<<g_time[i]<<
" , ";
// Minimum over all slots, seeded with 1000.
// NOTE(review): if every timing exceeded 1000 s, `smallest` would stay at
// the seed and none of the equality tests below would match.
370 double smallest=1000;
371 for (
int i=0;i<2*(int)log2(nprocs)+3;i++)
372 smallest=std::min(smallest,g_time[i]);
// Select by exact equality with the minimum; the bodies of the first three
// branches are not visible in this listing.
374 if(g_time[0]==smallest){
377 else if(g_time[1]==smallest){
380 else if(g_time[2*(
int)log2(nprocs)+2]==smallest){
// First-sweep slot won: record its kway and mark the plan as async.
384 for (
int i=0;i<(int)log2(nprocs);i++)
385 if(g_time[2+i]==smallest){
387 T_plan->kway=nprocs/std::pow(2,i);
388 T_plan->kway_async=
true;
// Second-sweep slot won: record its kway and mark the plan as non-async.
391 for (
int i=0;i<(int)log2(nprocs);i++)
392 if(g_time[2+(
int)log2(nprocs)+i]==smallest){
394 T_plan->kway=nprocs/std::pow(2,i);
395 T_plan->kway_async=
false;
// Rank 0 reports the choice.
401 PCOUT<<
"smallest= "<<smallest<<std::endl;
402 PCOUT<<
"Using transpose v"<<method<<
" kway= "<<T_plan->kway<<
" kway_async="<<T_plan->kway_async<<std::endl;
406 MPI_Barrier(T_plan->comm);
// Same selection procedure as a timing sweep over the fast_transpose_v1 /
// fast_transpose_v2 / fast_transpose_v3 kernels: each is run once untimed
// and once timed, the timings are reduced with MPI_MAX over T_plan->comm,
// and the plan's `kway` / `kway_async` are set from the fastest slot.
//
// Timing slots (each array holds 2*(int)log2(nprocs)+3 doubles):
//   [0] fast_transpose_v1, [1] fast_transpose_v2,
//   [2 + i] and [2 + (int)log2(nprocs) + i] fast_transpose_v3 with
//   kway = nprocs / 2^i.
//
// NOTE(review): this listing carries fused line numbers and is missing lines
// (declarations of `dummy`, `kway`, `method`; loop bodies; the bodies of the
// first selection branches; any free() of the timing arrays).
410 void T_Plan::which_fast_method(T_Plan* T_plan,
double* data){
413 double * time= (
double*) malloc(
sizeof(
double)*(2*(int)log2(nprocs)+3));
414 double * g_time= (
double*) malloc(
sizeof(
double)*(2*(int)log2(nprocs)+3));
// Loop over every timing slot; the body is not visible (presumably
// initialises the slots - confirm).
415 for (
int i=0;i<2*(int)log2(nprocs)+3;i++)
// Untimed call followed by a timed call; the trailing `2` is passed in the
// `flags` position of fast_transpose_v1 / fast_transpose_v2.
418 fast_transpose_v1(T_plan,(
double*)data,dummy,2);
419 time[0]=-MPI_Wtime();
420 fast_transpose_v1(T_plan,(
double*)data,dummy,2);
421 time[0]+=MPI_Wtime();
423 fast_transpose_v2(T_plan,(
double*)data,dummy,2);
424 time[1]=-MPI_Wtime();
425 fast_transpose_v2(T_plan,(
double*)data,dummy,2);
426 time[1]+=MPI_Wtime();
// The k-way kernel is only tried on power-of-two runs with >= 512 ranks.
428 if(IsPowerOfTwo(nprocs) && nprocs>511){
// First sweep: kway = nprocs / 2^i for i = 0..5, with flags = 2.
430 for (
int i=0;i<6;i++){
431 kway=nprocs/std::pow(2,i);
432 MPI_Barrier(T_plan->comm);
433 fast_transpose_v3(T_plan,(
double*)data,dummy,kway,2);
434 time[2+i]=-MPI_Wtime();
435 fast_transpose_v3(T_plan,(
double*)data,dummy,kway,2);
436 time[2+i]+=MPI_Wtime();
// Second sweep over the same kway values.
// NOTE(review): unlike the first sweep these calls omit the `2` flags
// argument, so they rely on a default declared elsewhere - confirm that
// both sweeps are meant to time the same transpose mode.
440 for (
int i=0;i<6;i++){
441 kway=nprocs/std::pow(2,i);
442 MPI_Barrier(T_plan->comm);
443 fast_transpose_v3(T_plan,(
double*)data,dummy,kway);
444 time[2+(int)log2(nprocs)+i]=-MPI_Wtime();
445 fast_transpose_v3(T_plan,(
double*)data,dummy,kway);
446 time[2+(int)log2(nprocs)+i]+=MPI_Wtime();
// Slowest rank decides: element-wise maximum across the communicator.
455 MPI_Allreduce(time,g_time,(2*(
int)log2(nprocs)+3),MPI_DOUBLE,MPI_MAX, T_plan->comm);
// Rank 0 prints every reduced timing.
457 if(T_plan->procid==0){
458 for(
int i=0;i<2*(int)log2(nprocs)+3;++i)
459 std::cout<<
" time["<<i<<
"]= "<<g_time[i]<<
" , ";
// Minimum over all slots, seeded with 1000.
// NOTE(review): timings above 1000 s would leave no slot equal to `smallest`.
464 double smallest=1000;
465 for (
int i=0;i<2*(int)log2(nprocs)+3;i++)
466 smallest=std::min(smallest,g_time[i]);
// Select by exact equality with the minimum; the bodies of the first three
// branches are not visible in this listing.
468 if(g_time[0]==smallest){
471 else if(g_time[1]==smallest){
474 else if(g_time[2*(
int)log2(nprocs)+2]==smallest){
// First-sweep slot won: record its kway and mark the plan as async.
478 for (
int i=0;i<(int)log2(nprocs);i++)
479 if(g_time[2+i]==smallest){
481 T_plan->kway=nprocs/std::pow(2,i);
482 T_plan->kway_async=
true;
// Second-sweep slot won: record its kway and mark the plan as non-async.
485 for (
int i=0;i<(int)log2(nprocs);i++)
486 if(g_time[2+(
int)log2(nprocs)+i]==smallest){
488 T_plan->kway=nprocs/std::pow(2,i);
489 T_plan->kway_async=
false;
// Rank 0 reports the choice.
495 PCOUT<<
"smallest= "<<smallest<<std::endl;
496 PCOUT<<
"Using transpose v"<<method<<
" kway= "<<T_plan->kway<<
" kway_async="<<T_plan->kway_async<<std::endl;
500 MPI_Barrier(T_plan->comm);
// Runs a transpose on `data` by forwarding to one of the kernel functions
// listed below, passing through `timings`, `flags`, `howmany` and (where the
// kernel takes it) `tag`; the k-way kernels additionally receive the member
// `kway`.
//
// NOTE(review): the conditions that choose among these calls are not visible
// in this listing (presumably a dispatch on the selected method and on
// `howmany`, given the `_h` variants - confirm against the full source).
504 void T_Plan::execute(T_Plan* T_plan,
double* data,
double *timings,
unsigned flags,
int howmany,
int tag){
// fast_transpose_v1 / v2 / v3 group.
509 fast_transpose_v1(T_plan,(
double*)data,timings,flags,howmany,tag);
511 fast_transpose_v2(T_plan,(
double*)data,timings,flags,howmany,tag);
513 fast_transpose_v3(T_plan,(
double*)data,timings,kway,flags,howmany,tag);
// `_h` counterparts of the three kernels above.
518 fast_transpose_v1_h(T_plan,(
double*)data,timings,flags,howmany,tag);
520 fast_transpose_v2_h(T_plan,(
double*)data,timings,flags,howmany,tag);
522 fast_transpose_v3_h(T_plan,(
double*)data,timings,kway,flags,howmany,tag);
// transpose_v5 .. v8 group; note that the v6 and v7 calls do not pass `tag`.
525 transpose_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
527 transpose_v6(T_plan,(
double*)data,timings,flags,howmany);
529 transpose_v7(T_plan,(
double*)data,timings,kway,flags,howmany);
531 transpose_v8(T_plan,(
double*)data,timings,flags,howmany,tag);
// Releases six of the malloc'ed bookkeeping arrays: the per-rank start
// indices and the `_w` / `_f` send and receive offsets.
// NOTE(review): the enclosing function header is not visible in this listing
// (presumably the T_Plan destructor), and neither are the frees for the
// remaining arrays or the release of `stype` / `rtype` - confirm against the
// full source.
540 free(local_0_start_proc);
541 free(local_1_start_proc);
549 free(soffset_proc_w);
550 free(roffset_proc_w);
554 free(soffset_proc_f);
555 free(roffset_proc_f);
// Out-of-place transpose of a row-major r x c matrix whose elements are
// tuples of `n_tuples` doubles.
//
//   r, c      row and column count of the input matrix.
//   n_tuples  doubles per matrix element.
//   in        input, r*c*n_tuples doubles, element (i,j) at (i*c+j)*n_tuples.
//   out       output, r*c*n_tuples doubles, written sequentially in
//             column-major order of the input, so element (i,j) of the input
//             lands at (j*r+i)*n_tuples. Must not overlap `in` (memcpy).
//
// Nothing is written when any extent is zero or negative.
//
// The block as listed used an output cursor `ptr` that was never declared or
// advanced and left its loops unclosed; the cursor is restored here. Index
// arithmetic is done in size_t so (i*c+j)*n_tuples cannot overflow `int` on
// large slabs.
void local_transpose(
int r,
int c,
int n_tuples,
double * in,
double *out ){
  if(r<=0 || c<=0 || n_tuples<=0)
    return;

  const size_t rows=static_cast<size_t>(r);
  const size_t cols=static_cast<size_t>(c);
  const size_t tuple=static_cast<size_t>(n_tuples);
  const size_t tuple_bytes=tuple*sizeof(double);

  // Walk the input column by column; the output cursor simply advances.
  double* dst=out;
  for(size_t j=0;j<cols;j++){
    for(size_t i=0;i<rows;i++){
      memcpy(dst,&in[(i*cols+j)*tuple],tuple_bytes);
      dst+=tuple;
    }
  }
}
// Out-of-place transpose variant taking two tuple widths, `n_tuples` and
// `n_tuples2`. For every (column j, row i) pair, visited column-major, one
// of two copies into out[ptr] is performed:
//   - n_tuples2 doubles read from in[i*c*n_tuples + j*n_tuples2], or
//   - n_tuples  doubles read from in[(i*c+j)*n_tuples].
// Returns immediately when r or c is zero.
//
// NOTE(review): the condition choosing between the two copies, and the
// declaration and advancement of `ptr`, are not visible in this listing
// (presumably the n_tuples2 copy applies to the last column - confirm).
583 void local_transpose(
int r,
int c,
int n_tuples,
int n_tuples2,
double * in,
double *out ){
585 if(r==0 || c==0)
return;
587 for(
int j=0;j<c;j++){
588 for(
int i=0;i<r;i++){
// Copy using the second tuple width.
591 memcpy (&out[ptr],&in[i*c*n_tuples+j*n_tuples2],
sizeof(
double)*n_tuples2);
// Copy using the regular tuple width.
595 memcpy (&out[ptr],&in[(i*c+j)*n_tuples],
sizeof(
double)*n_tuples);
// Out-of-place column regrouping for rows whose length is
// (c-1)*n_tuples + n_tuples2 doubles: c-1 blocks of n_tuples doubles plus
// one block of n_tuples2 doubles. For every (block j, row i) pair, visited
// block-major, the block starting at in[i*rowlen + j*n_tuples] is copied to
// out[ptr] with either n_tuples2 or n_tuples doubles.
// Returns immediately when r or c is zero.
//
// NOTE(review): the condition choosing the copy length, and the declaration
// and advancement of `ptr`, are not visible in this listing (presumably the
// n_tuples2 length applies to the last block, j == c-1 - confirm).
604 void local_transpose_col(
int r,
int c,
int n_tuples,
int n_tuples2,
double * in,
double *out ){
606 if(r==0 || c==0)
return;
608 for(
int j=0;j<c;j++){
609 for(
int i=0;i<r;i++){
// Copy with the second width (n_tuples2 doubles).
612 memcpy (&out[ptr],&in[i*((c-1)*n_tuples+n_tuples2)+(j)*n_tuples],
sizeof(
double)*n_tuples2);
// Copy with the regular width (n_tuples doubles).
616 memcpy (&out[ptr],&in[i*((c-1)*n_tuples+n_tuples2)+j*n_tuples],
sizeof(
double)*n_tuples);
// In-place transpose of an r x c matrix of n_tuples-double elements stored
// in `A`, using a single scratch element `buff`.
//
// From the visible statements: for each (i, j) an index pair `src` / `trg`
// is derived; positions with src == trg are skipped; otherwise the element
// at `trg` is saved to `buff`, `cycle_len` element moves A[src] -> A[trg]
// are performed, and the saved element is written back at the final `trg`.
//
// NOTE(review): the computation of `src`, `trg` and `cycle_len`, the updates
// inside the move loop, and the release of `buff` are not visible in this
// listing. This looks like a cycle-following permutation - confirm against
// the full source. The loop counters are size_t while r and c are int.
626 void local_transpose(
int r,
int c,
int n_tuples,
double* A){
// Scratch space for one element (n_tuples doubles).
629 double* buff=(
double*)malloc(n_tuples*
sizeof(
double));
630 for(
size_t i=0;i<r;i++)
631 for(
size_t j=0;j<c;j++){
// Element already in its final position: nothing to move.
634 if(src==trg)
continue;
// Skips this (i, j) when src and trg differ at this point; the statements
// that update them beforehand are not visible.
643 if(src!=trg)
continue;
// Save the element at trg, shift elements along, then restore the saved one.
645 memcpy(buff, A+trg*n_tuples, n_tuples*
sizeof(
double));
646 for(
size_t k=0;k<cycle_len;k++){
650 memcpy(A+trg*n_tuples, A+src*n_tuples, n_tuples*
sizeof(
double));
653 memcpy(A+trg*n_tuples, buff, n_tuples*
sizeof(
double));
// Distributed transpose using point-to-point MPI_Isend / MPI_Irecv.
//
// Visible phases:
//   1. Flags bit1 set, bit0 clear, single rank: forwards to transpose_v5.
//   2. Single rank with bits (1,1) or (0,0): in-place local_transpose of each
//      of the `howmany` slabs; timings[0] and timings[1] are updated.
//   3. Otherwise: local_transpose_col into a send buffer, one Isend/Irecv pair
//      per rank plus a local memcpy for this rank's own block, MPI_Wait on
//      every request, then a final local_transpose from `send_recv` to `data`.
//   4. timings[0..3] accumulate total, shuffle, communication and reshuffle
//      time.
//
// `flags` is decoded into an 8-bit bitset; only bits 0 and 1 are read.
//
// NOTE(review): this listing carries fused line numbers and is missing lines
// (closing braces, early returns, guards, the declarations of `nprocs`,
// `procid`, `ptr`, `ierr`, and any delete[] of the request arrays).
660 void fast_transpose_v1(T_Plan* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
662 std::bitset<8> Flags(flags);
// Single-rank case with bit1 set and bit0 clear is delegated to transpose_v5.
663 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
664 MPI_Barrier(T_plan->comm);
668 transpose_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
669 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; MPI_Wtime is added back at the end.
672 timings[0]-=MPI_Wtime();
// Local copies of the plan's layout and buffers.
674 int nprocs_0, nprocs_1;
675 nprocs=T_plan->nprocs;
676 procid=T_plan->procid;
677 nprocs_0=T_plan->nprocs_0;
678 nprocs_1=T_plan->nprocs_1;
679 ptrdiff_t *N=T_plan->N;
680 double * send_recv = T_plan->buffer;
681 double * buffer_2= T_plan->buffer_2;
682 ptrdiff_t local_n0=T_plan->local_n0;
683 ptrdiff_t local_n1=T_plan->local_n1;
684 ptrdiff_t n_tuples=T_plan->n_tuples;
// Doubles per input slab and per output slab.
// NOTE(review): ptrdiff_t products are narrowed to int here.
686 int idist=N[1]*local_n0*n_tuples;
687 int odist=N[0]*local_n1*n_tuples;
// All accumulators start at zero (0 * MPI_Wtime()). `total_time` is not
// referenced again in the visible lines.
689 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input slab(s); the guards around the loops are not
// visible (presumably VERBOSE and a per-rank check - confirm).
691 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
693 for(
int h=0;h<howmany;h++)
694 for(
int id=0;
id<nprocs;++id){
696 for(
int i=0;i<local_n0;i++){
697 std::cout<<std::endl;
698 for(
int j=0;j<N[1];j++){
699 std::cout<<
'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
703 MPI_Barrier(T_plan->comm);
// ---- Local shuffle ----
710 shuffle_time-=MPI_Wtime();
// Single rank: no communication needed, transpose each slab in place.
713 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
714 #pragma omp parallel for
715 for(
int h=0;h<howmany;h++)
716 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
718 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
719 #pragma omp parallel for
720 for(
int h=0;h<howmany;h++)
721 local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
// Single-rank bookkeeping.
// NOTE(review): timings[0] receives both the wall-clock delta and
// shuffle_time here, which looks like double counting - confirm.
724 shuffle_time+=MPI_Wtime();
725 timings[0]+=MPI_Wtime();
726 timings[0]+=shuffle_time;
727 timings[1]+=shuffle_time;
730 MPI_Barrier(T_plan->comm);
// Multi-rank: regroup each row into per-destination blocks. The block width
// is taken from rank 0's dimension-1 slab, with last_local_n1 for the final
// block. The destination is `send_recv` or `buffer_2` depending on the
// flags; the first branch's condition is not visible.
739 local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv );
741 else if(Flags[0]==0 && Flags[1]==0){
742 local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,T_plan->buffer_2 );
745 shuffle_time+=MPI_Wtime();
// Debug dump after the local shuffle.
747 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
749 for(
int h=0;h<howmany;h++)
750 for(
int id=0;
id<nprocs;++id){
752 for(
int i=0;i<N[1];i++){
753 std::cout<<std::endl;
754 for(
int j=0;j<local_n0;j++){
755 std::cout<<
'\t'<<T_plan->buffer_2[ptr];
760 MPI_Barrier(T_plan->comm);
// ---- Communication ----
768 int* scount_proc= T_plan->scount_proc;
769 int* rcount_proc= T_plan->rcount_proc;
770 int* soffset_proc= T_plan->soffset_proc;
771 int* roffset_proc= T_plan->roffset_proc;
// The derived datatypes are fetched but not used by any visible statement.
773 MPI_Datatype *stype=T_plan->stype;
774 MPI_Datatype *rtype=T_plan->rtype;
775 MPI_Barrier(T_plan->comm);
778 comm_time-=MPI_Wtime();
780 int soffset=0,roffset=0;
// One send and one receive request per rank, initialised to
// MPI_REQUEST_NULL so waiting on an unused slot returns immediately.
// NOTE(review): no delete[] for these arrays is visible in this listing.
782 MPI_Request * s_request=
new MPI_Request[nprocs];
783 MPI_Request * request=
new MPI_Request[nprocs];
784 #pragma omp parallel for
785 for (
int proc=0;proc<nprocs;++proc){
786 request[proc]=MPI_REQUEST_NULL;
787 s_request[proc]=MPI_REQUEST_NULL;
// Pick send and receive buffers according to the flags (first branch's
// condition is not visible).
791 double *s_buf, *r_buf;
793 s_buf=send_recv; r_buf=data;
795 else if(Flags[0]==0 && Flags[1]==0){
796 s_buf=buffer_2; r_buf=send_recv;
// Post a non-blocking send and receive for each rank. A guard skipping this
// rank's own index is presumably on a missing line, since the own block is
// copied with memcpy below - confirm.
799 for (
int proc=0;proc<nprocs;++proc){
801 soffset=soffset_proc[proc];
802 roffset=roffset_proc[proc];
803 MPI_Isend(&s_buf[soffset],scount_proc[proc],MPI_DOUBLE,proc, tag,
804 T_plan->comm, &s_request[proc]);
805 MPI_Irecv(&r_buf[roffset],rcount_proc[proc],MPI_DOUBLE, proc,
806 tag, T_plan->comm, &request[proc]);
// This rank's own block moves by a plain memcpy, once per slab.
810 soffset=soffset_proc[procid];
811 roffset=roffset_proc[procid];
812 for(
int h=0;h<howmany;h++)
813 memcpy(&r_buf[h*odist+roffset],&s_buf[h*idist+soffset],
sizeof(
double)*scount_proc[procid]);
// Wait for every receive and send to complete.
// NOTE(review): `ierr` is passed where MPI_Wait takes an MPI_Status*; its
// declaration is not visible - confirm its type.
818 for (
int proc=0;proc<nprocs;++proc){
819 MPI_Wait(&request[proc], &ierr);
820 MPI_Wait(&s_request[proc], &ierr);
822 comm_time+=MPI_Wtime();
// Debug dump after the exchange.
825 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
827 for(
int h=0;h<howmany;h++)
828 for(
int id=0;
id<nprocs;++id){
830 for(
int i=0;i<local_n1;i++){
831 std::cout<<std::endl;
832 for(
int j=0;j<N[0];j++){
833 std::cout<<
'\t'<<send_recv[ptr];
838 MPI_Barrier(T_plan->comm);
// ---- Final local transpose from the receive buffer into `data` ----
// (the guard selecting this step is not visible).
844 reshuffle_time-=MPI_Wtime();
847 local_transpose(N[0],local_n1,n_tuples,send_recv,data );
// Debug dump of the result.
852 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
854 for(
int h=0;h<howmany;h++)
855 for(
int id=0;
id<nprocs_1;++id){
857 for(
int i=0;i<local_n1;i++){
858 std::cout<<std::endl;
859 for(
int j=0;j<N[0];j++){
860 std::cout<<
'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
864 MPI_Barrier(T_plan->comm);
868 reshuffle_time+=MPI_Wtime();
869 MPI_Barrier(T_plan->comm);
// Rank-0 timing report (its guard is not visible).
875 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
876 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
877 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
878 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate into the caller's timing array:
// [0] wall clock, [1] shuffle, [2] communication, [3] reshuffle.
880 timings[0]+=MPI_Wtime();
881 timings[1]+=shuffle_time;
882 timings[2]+=comm_time;
883 timings[3]+=reshuffle_time;
// Distributed transpose using a collective exchange: MPI_Alltoallv when the
// plan is not evenly distributed, MPI_Alltoall otherwise.
//
// Visible phases:
//   1. Flags bit1 set, bit0 clear, single rank: forwards to transpose_v6.
//   2. Single rank with bits (1,1) or (0,0): in-place local_transpose of each
//      of the `howmany` slabs; timings[0] and timings[1] are updated.
//   3. Otherwise: local_transpose_col into a send buffer, the collective
//      exchange with the `_f` counts/offsets, then a final local_transpose
//      from `send_recv` to `data`.
//   4. timings[0..3] accumulate total, shuffle, communication and reshuffle
//      time.
//
// `tag` is not referenced by any visible statement.
//
// NOTE(review): this listing carries fused line numbers and is missing lines
// (closing braces, early returns, guards, the tail of the two collective
// calls, the declarations of `nprocs`, `procid`, `ptr`).
886 void fast_transpose_v2(T_Plan* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
888 std::bitset<8> Flags(flags);
// Single-rank case with bit1 set and bit0 clear is delegated to transpose_v6.
889 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
890 MPI_Barrier(T_plan->comm);
894 transpose_v6(T_plan,(
double*)data,timings,flags,howmany);
895 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; MPI_Wtime is added back at the end.
898 timings[0]-=MPI_Wtime();
// Local copies of the plan's layout and buffers.
900 int nprocs_0, nprocs_1;
901 nprocs=T_plan->nprocs;
902 procid=T_plan->procid;
903 nprocs_0=T_plan->nprocs_0;
904 nprocs_1=T_plan->nprocs_1;
905 ptrdiff_t *N=T_plan->N;
906 double * send_recv = T_plan->buffer;
907 double * buffer_2= T_plan->buffer_2;
908 ptrdiff_t local_n0=T_plan->local_n0;
909 ptrdiff_t local_n1=T_plan->local_n1;
910 ptrdiff_t n_tuples=T_plan->n_tuples;
// Doubles per input slab and per output slab.
// NOTE(review): ptrdiff_t products are narrowed to int here.
912 int idist=N[1]*local_n0*n_tuples;
913 int odist=N[0]*local_n1*n_tuples;
// All accumulators start at zero (0 * MPI_Wtime()). `total_time` is not
// referenced again in the visible lines.
915 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input slab(s); surrounding guards are not visible.
917 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
919 for(
int h=0;h<howmany;h++)
920 for(
int id=0;
id<nprocs;++id){
922 for(
int i=0;i<local_n0;i++){
923 std::cout<<std::endl;
924 for(
int j=0;j<N[1];j++){
925 std::cout<<
'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
929 MPI_Barrier(T_plan->comm);
// ---- Local shuffle ----
936 shuffle_time-=MPI_Wtime();
// Single rank: no communication needed, transpose each slab in place.
939 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
940 #pragma omp parallel for
941 for(
int h=0;h<howmany;h++)
942 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
944 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
945 #pragma omp parallel for
946 for(
int h=0;h<howmany;h++)
947 local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
// Single-rank bookkeeping.
// NOTE(review): timings[0] receives both the wall-clock delta and
// shuffle_time here, which looks like double counting - confirm.
950 shuffle_time+=MPI_Wtime();
951 timings[0]+=MPI_Wtime();
952 timings[0]+=shuffle_time;
953 timings[1]+=shuffle_time;
956 MPI_Barrier(T_plan->comm);
// Multi-rank: regroup each row into per-destination blocks, writing to
// `send_recv` or `buffer_2` depending on the flags (the first branch's
// condition is not visible).
965 local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv );
967 else if(Flags[0]==0 && Flags[1]==0){
968 local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,T_plan->buffer_2 );
971 shuffle_time+=MPI_Wtime();
// Debug dump after the local shuffle.
973 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
975 for(
int h=0;h<howmany;h++)
976 for(
int id=0;
id<nprocs;++id){
978 for(
int i=0;i<N[1];i++){
979 std::cout<<std::endl;
980 for(
int j=0;j<local_n0;j++){
981 std::cout<<
'\t'<<T_plan->buffer_2[ptr];
986 MPI_Barrier(T_plan->comm);
// ---- Communication ----
// The `_f` arrays hold the plain counts/offsets scaled by the plan's howmany.
994 int* scount_proc_f= T_plan->scount_proc_f;
995 int* rcount_proc_f= T_plan->rcount_proc_f;
996 int* soffset_proc_f= T_plan->soffset_proc_f;
997 int* roffset_proc_f= T_plan->roffset_proc_f;
// The derived datatypes are fetched but not used by any visible statement.
999 MPI_Datatype *stype=T_plan->stype;
1000 MPI_Datatype *rtype=T_plan->rtype;
1001 MPI_Barrier(T_plan->comm);
// Pick send and receive buffers according to the flags (first branch's
// condition is not visible).
1003 double *s_buf, *r_buf;
1005 s_buf=send_recv; r_buf=data;
1007 else if(Flags[0]==0 && Flags[1]==0){
1008 s_buf=buffer_2; r_buf=send_recv;
1011 comm_time-=MPI_Wtime();
// Uneven layout: per-rank counts and displacements via MPI_Alltoallv.
// Even layout: uniform MPI_Alltoall using the first rank's counts.
// (The final arguments and the `else` are on lines missing from the listing.)
1012 if(T_plan->is_evenly_distributed==0)
1013 MPI_Alltoallv(s_buf,scount_proc_f,
1014 soffset_proc_f, MPI_DOUBLE,r_buf,
1015 rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
1018 MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
1019 r_buf, rcount_proc_f[0], MPI_DOUBLE,
1022 comm_time+=MPI_Wtime();
// Debug dump after the exchange.
1025 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1027 for(
int h=0;h<howmany;h++)
1028 for(
int id=0;
id<nprocs;++id){
1030 for(
int i=0;i<local_n1;i++){
1031 std::cout<<std::endl;
1032 for(
int j=0;j<N[0];j++){
1033 std::cout<<
'\t'<<send_recv[ptr];
1038 MPI_Barrier(T_plan->comm);
// ---- Final local transpose from the receive buffer into `data` ----
// (the guard selecting this step is not visible).
1044 reshuffle_time-=MPI_Wtime();
1047 local_transpose(N[0],local_n1,n_tuples,send_recv,data );
// Debug dump of the result.
1052 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1054 for(
int h=0;h<howmany;h++)
1055 for(
int id=0;
id<nprocs_1;++id){
1057 for(
int i=0;i<local_n1;i++){
1058 std::cout<<std::endl;
1059 for(
int j=0;j<N[0];j++){
1060 std::cout<<
'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
1064 MPI_Barrier(T_plan->comm);
1068 reshuffle_time+=MPI_Wtime();
1069 MPI_Barrier(T_plan->comm);
// Rank-0 timing report (its guard is not visible).
1073 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1074 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1075 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1076 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate into the caller's timing array:
// [0] wall clock, [1] shuffle, [2] communication, [3] reshuffle.
1078 timings[0]+=MPI_Wtime();
1079 timings[1]+=shuffle_time;
1080 timings[2]+=comm_time;
1081 timings[3]+=reshuffle_time;
// Distributed transpose whose exchange step is par::Mpi_Alltoallv_dense,
// instantiated with `true` or `false` according to T_plan->kway_async and
// called with the caller-supplied `kway`.
//
// Visible phases:
//   1. Flags bit1 set, bit0 clear, single rank: forwards to transpose_v7.
//   2. Single rank with bits (1,1) or (0,0): in-place local_transpose of each
//      of the `howmany` slabs; timings[0] and timings[1] are updated.
//   3. Otherwise: local_transpose_col into a send buffer, the dense
//      all-to-all with the `_f` counts/offsets, then a final local_transpose
//      from `send_recv` to `data`.
//   4. timings[0..3] accumulate total, shuffle, communication and reshuffle
//      time.
//
// `tag` is not referenced by any visible statement.
//
// NOTE(review): this listing carries fused line numbers and is missing lines
// (closing braces, early returns, guards, an `else`, the declarations of
// `nprocs`, `procid`, `ptr`).
1084 void fast_transpose_v3(T_Plan* T_plan,
double * data,
double *timings,
int kway,
unsigned flags,
int howmany,
int tag ){
1086 std::bitset<8> Flags(flags);
// Single-rank case with bit1 set and bit0 clear is delegated to transpose_v7.
1087 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1088 MPI_Barrier(T_plan->comm);
1092 transpose_v7(T_plan,(
double*)data,timings,kway,flags,howmany);
1093 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; MPI_Wtime is added back at the end.
1096 timings[0]-=MPI_Wtime();
// Local copies of the plan's layout and buffers.
1098 int nprocs_0, nprocs_1;
1099 nprocs=T_plan->nprocs;
1100 procid=T_plan->procid;
1101 nprocs_0=T_plan->nprocs_0;
1102 nprocs_1=T_plan->nprocs_1;
1103 ptrdiff_t *N=T_plan->N;
1104 double * send_recv = T_plan->buffer;
1105 double * buffer_2= T_plan->buffer_2;
1106 ptrdiff_t local_n0=T_plan->local_n0;
1107 ptrdiff_t local_n1=T_plan->local_n1;
1108 ptrdiff_t n_tuples=T_plan->n_tuples;
// Doubles per input slab and per output slab.
// NOTE(review): ptrdiff_t products are narrowed to int here.
1110 int idist=N[1]*local_n0*n_tuples;
1111 int odist=N[0]*local_n1*n_tuples;
// All accumulators start at zero (0 * MPI_Wtime()). `total_time` is not
// referenced again in the visible lines.
1113 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input slab(s); surrounding guards are not visible.
1115 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1117 for(
int h=0;h<howmany;h++)
1118 for(
int id=0;
id<nprocs;++id){
1120 for(
int i=0;i<local_n0;i++){
1121 std::cout<<std::endl;
1122 for(
int j=0;j<N[1];j++){
1123 std::cout<<
'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
1127 MPI_Barrier(T_plan->comm);
// ---- Local shuffle ----
1134 shuffle_time-=MPI_Wtime();
// Single rank: no communication needed, transpose each slab in place.
1137 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1138 #pragma omp parallel for
1139 for(
int h=0;h<howmany;h++)
1140 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
1142 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1143 #pragma omp parallel for
1144 for(
int h=0;h<howmany;h++)
1145 local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
// Single-rank bookkeeping.
// NOTE(review): timings[0] receives both the wall-clock delta and
// shuffle_time here, which looks like double counting - confirm.
1148 shuffle_time+=MPI_Wtime();
1149 timings[0]+=MPI_Wtime();
1150 timings[0]+=shuffle_time;
1151 timings[1]+=shuffle_time;
1154 MPI_Barrier(T_plan->comm);
// Multi-rank: regroup each row into per-destination blocks, writing to
// `send_recv` or `buffer_2` depending on the flags (the first branch's
// condition is not visible).
1163 local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,send_recv );
1165 else if(Flags[0]==0 && Flags[1]==0){
1166 local_transpose_col(local_n0,nprocs_1,n_tuples*T_plan->local_n1_proc[0], n_tuples*T_plan->last_local_n1,data,T_plan->buffer_2 );
1169 shuffle_time+=MPI_Wtime();
// Debug dump after the local shuffle.
1171 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1173 for(
int h=0;h<howmany;h++)
1174 for(
int id=0;
id<nprocs;++id){
1176 for(
int i=0;i<N[1];i++){
1177 std::cout<<std::endl;
1178 for(
int j=0;j<local_n0;j++){
1179 std::cout<<
'\t'<<T_plan->buffer_2[ptr];
1184 MPI_Barrier(T_plan->comm);
// ---- Communication ----
// The `_f` arrays hold the plain counts/offsets scaled by the plan's howmany.
1192 int* scount_proc_f= T_plan->scount_proc_f;
1193 int* rcount_proc_f= T_plan->rcount_proc_f;
1194 int* soffset_proc_f= T_plan->soffset_proc_f;
1195 int* roffset_proc_f= T_plan->roffset_proc_f;
// The derived datatypes are fetched but not used by any visible statement.
1197 MPI_Datatype *stype=T_plan->stype;
1198 MPI_Datatype *rtype=T_plan->rtype;
1199 MPI_Barrier(T_plan->comm);
// Pick send and receive buffers according to the flags (first branch's
// condition is not visible).
1201 double *s_buf, *r_buf;
1203 s_buf=send_recv; r_buf=data;
1205 else if(Flags[0]==0 && Flags[1]==0){
1206 s_buf=buffer_2; r_buf=send_recv;
1209 comm_time-=MPI_Wtime();
// Dense all-to-all with the second template argument chosen by the plan's
// kway_async flag (the `else` between the two calls is on a missing line).
1210 if(T_plan->kway_async)
1211 par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
1212 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1214 par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
1215 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1217 comm_time+=MPI_Wtime();
// Debug dump after the exchange.
1220 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1222 for(
int h=0;h<howmany;h++)
1223 for(
int id=0;
id<nprocs;++id){
1225 for(
int i=0;i<local_n1;i++){
1226 std::cout<<std::endl;
1227 for(
int j=0;j<N[0];j++){
1228 std::cout<<
'\t'<<send_recv[ptr];
1233 MPI_Barrier(T_plan->comm);
// ---- Final local transpose from the receive buffer into `data` ----
// (the guard selecting this step is not visible).
1239 reshuffle_time-=MPI_Wtime();
1242 local_transpose(N[0],local_n1,n_tuples,send_recv,data );
// Debug dump of the result.
1247 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1249 for(
int h=0;h<howmany;h++)
1250 for(
int id=0;
id<nprocs_1;++id){
1252 for(
int i=0;i<local_n1;i++){
1253 std::cout<<std::endl;
1254 for(
int j=0;j<N[0];j++){
1255 std::cout<<
'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
1259 MPI_Barrier(T_plan->comm);
1263 reshuffle_time+=MPI_Wtime();
1264 MPI_Barrier(T_plan->comm);
// Rank-0 timing report (its guard is not visible).
1268 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1269 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1270 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1271 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate into the caller's timing array:
// [0] wall clock, [1] shuffle, [2] communication, [3] reshuffle.
1273 timings[0]+=MPI_Wtime();
1274 timings[1]+=shuffle_time;
1275 timings[2]+=comm_time;
1276 timings[3]+=reshuffle_time;
// Batched ("howmany") point-to-point transpose: all `howmany` slabs destined
// for a rank are packed contiguously, so each rank pair exchanges a single
// message of scount*howmany / rcount*howmany doubles.
//
// Visible phases:
//   1. Flags bit1 set, bit0 clear, single rank: forwards to transpose_v5.
//   2. Single rank with bits (1,1) or (0,0): in-place local_transpose of each
//      slab; timings[0] and timings[1] are updated.
//   3. Pack: for each destination rank, each slab and each local row, copy
//      that rank's column range into `buffer_2`.
//   4. Exchange: MPI_Isend / MPI_Irecv per rank from `buffer_2` into
//      `send_recv`, a memcpy for this rank's own block, then MPI_Wait on all
//      requests.
//   5. Unpack: copy received rows into `data` per source rank and slab, then
//      transpose each output slab in place.
//   6. timings[0..3] accumulate total, shuffle, communication and reshuffle
//      time.
//
// NOTE(review): this listing carries fused line numbers and is missing lines
// (closing braces, early returns, guards, the declarations and resets of
// `nprocs`, `procid`, `ptr`, `ierr`, and the delete[] of `request`).
1280 void fast_transpose_v1_h(T_Plan* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
1282 std::bitset<8> Flags(flags);
// Single-rank case with bit1 set and bit0 clear is delegated to transpose_v5.
1283 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1284 MPI_Barrier(T_plan->comm);
1288 transpose_v5(T_plan,(
double*)data,timings,flags,howmany,tag);
1289 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; MPI_Wtime is added back at the end.
1292 timings[0]-=MPI_Wtime();
// Local copies of the plan's layout and buffers.
1294 int nprocs_0, nprocs_1;
1295 nprocs=T_plan->nprocs;
1296 procid=T_plan->procid;
1297 nprocs_0=T_plan->nprocs_0;
1298 nprocs_1=T_plan->nprocs_1;
1299 ptrdiff_t *N=T_plan->N;
1300 double * send_recv = T_plan->buffer;
1301 double * buffer_2= T_plan->buffer_2;
1302 ptrdiff_t local_n0=T_plan->local_n0;
1303 ptrdiff_t local_n1=T_plan->local_n1;
1304 ptrdiff_t n_tuples=T_plan->n_tuples;
// Doubles per input slab and per output slab.
// NOTE(review): ptrdiff_t products are narrowed to int here.
1306 int idist=N[1]*local_n0*n_tuples;
1307 int odist=N[0]*local_n1*n_tuples;
// All accumulators start at zero (0 * MPI_Wtime()). `total_time` is not
// referenced again in the visible lines.
1309 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input; prints the first two doubles of each element, so
// it reads data[ptr+1] regardless of n_tuples. Surrounding guards are not
// visible.
1312 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1314 for(
int h=0;h<howmany;h++)
1315 for(
int id=0;
id<nprocs;++id){
1317 for(
int i=0;i<local_n0;i++){
1318 std::cout<<std::endl;
1319 for(
int j=0;j<N[1];j++){
1320 ptr=h*idist+(i*N[1]+j)*n_tuples;
1321 std::cout<<
'\t'<<data[ptr]<<
","<<data[ptr+1];
1325 MPI_Barrier(T_plan->comm);
// Per-rank slab sizes and start indices from the plan.
1332 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1333 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1334 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1335 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
// ---- Local shuffle ----
1336 shuffle_time-=MPI_Wtime();
// Single rank: no communication needed, transpose each slab in place.
1337 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1338 #pragma omp parallel for
1339 for(
int h=0;h<howmany;h++)
1340 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
1342 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1343 #pragma omp parallel for
1344 for(
int h=0;h<howmany;h++)
1345 local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
// Single-rank bookkeeping.
// NOTE(review): timings[0] receives both the wall-clock delta and
// shuffle_time here, which looks like double counting - confirm.
1348 shuffle_time+=MPI_Wtime();
1349 timings[0]+=MPI_Wtime();
1350 timings[0]+=shuffle_time;
1351 timings[1]+=shuffle_time;
1354 MPI_Barrier(T_plan->comm);
// Pack: destination rank outermost, then slab, then local row. Each copy
// takes the destination's column range [local_1_start_proc[proc],
// +local_n1_proc[proc]) of row i and appends it at buffer_2[ptr].
1364 for (
int proc=0;proc<nprocs_1;++proc)
1365 for(
int h=0;h<howmany;++h){
1366 for(
int i=0;i<local_n0;++i){
1372 memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],
sizeof(
double)*n_tuples*local_n1_proc[proc]);
1373 ptr+=n_tuples*local_n1_proc[proc];
1377 shuffle_time+=MPI_Wtime();
// Debug dump of the packed buffer.
1379 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1381 for(
int id=0;
id<nprocs;++id){
1382 for(
int h=0;h<howmany;h++)
1384 for(
int i=0;i<N[1];i++){
1385 std::cout<<std::endl;
1386 for(
int j=0;j<local_n0;j++){
1387 std::cout<<
'\t'<<buffer_2[ptr]<<
","<<buffer_2[ptr+1];
1392 MPI_Barrier(T_plan->comm);
// ---- Communication ----
// Per-slab counts/offsets; they are scaled by `howmany` at each use below.
1401 int* scount_proc= T_plan->scount_proc;
1402 int* rcount_proc= T_plan->rcount_proc;
1403 int* soffset_proc= T_plan->soffset_proc;
1404 int* roffset_proc= T_plan->roffset_proc;
1406 MPI_Barrier(T_plan->comm);
1409 comm_time-=MPI_Wtime();
1411 int soffset=0,roffset=0;
// One send and one receive request per rank, initialised to
// MPI_REQUEST_NULL so waiting on an unused slot returns immediately.
1413 MPI_Request * s_request=
new MPI_Request[nprocs];
1414 MPI_Request * request=
new MPI_Request[nprocs];
1415 #pragma omp parallel for
1416 for (
int proc=0;proc<nprocs;++proc){
1417 request[proc]=MPI_REQUEST_NULL;
1418 s_request[proc]=MPI_REQUEST_NULL;
// Always send from the packed buffer and receive into `send_recv`.
1422 double *s_buf, *r_buf;
1423 s_buf=buffer_2; r_buf=send_recv;
// Post a non-blocking send and receive for each rank. A guard skipping this
// rank's own index is presumably on a missing line, since the own block is
// copied with memcpy below - confirm.
1425 for (
int proc=0;proc<nprocs;++proc){
1427 soffset=soffset_proc[proc];
1428 roffset=roffset_proc[proc];
1429 MPI_Isend(&s_buf[soffset*howmany],scount_proc[proc]*howmany,MPI_DOUBLE,proc, tag,
1430 T_plan->comm, &s_request[proc]);
1431 MPI_Irecv(&r_buf[roffset*howmany],rcount_proc[proc]*howmany,MPI_DOUBLE, proc,
1432 tag, T_plan->comm, &request[proc]);
// This rank's own block (all slabs at once) moves by a plain memcpy.
1435 soffset=soffset_proc[procid];
1436 roffset=roffset_proc[procid];
1437 memcpy(&r_buf[roffset*howmany],&s_buf[soffset*howmany],howmany*
sizeof(
double)*scount_proc[procid]);
// Wait for every receive and send to complete.
// NOTE(review): `ierr` is passed where MPI_Wait takes an MPI_Status*; its
// declaration is not visible - confirm its type.
1442 for (
int proc=0;proc<nprocs;++proc){
1443 MPI_Wait(&request[proc], &ierr);
1444 MPI_Wait(&s_request[proc], &ierr);
1446 comm_time+=MPI_Wtime();
// Debug dump after the exchange.
1449 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1451 for(
int id=0;
id<nprocs;++id){
1453 for(
int h=0;h<howmany;h++)
1454 for(
int i=0;i<local_n1;i++){
1455 std::cout<<std::endl;
1456 for(
int j=0;j<N[0];j++){
1457 std::cout<<
'\t'<<send_recv[ptr]<<
","<<send_recv[ptr+1];
1462 MPI_Barrier(T_plan->comm);
// ---- Unpack ----
1468 reshuffle_time-=MPI_Wtime();
// Source rank outermost, then slab, then that rank's dimension-0 rows: each
// received row of local_n1 elements is copied to row i of output slab h.
1471 for (
int proc=0;proc<nprocs_0;++proc)
1472 for(
int h=0;h<howmany;++h){
1473 for(
int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
1474 memcpy(&data[h*odist+(i*local_n1)*n_tuples],&send_recv[ptr],local_n1*
sizeof(
double)*n_tuples);
1476 ptr+=n_tuples*local_n1;
// Transpose each N[0] x local_n1 output slab in place (the guard selecting
// this step is not visible).
1488 #pragma omp parallel for
1489 for(
int h=0;h<howmany;h++)
1490 local_transpose(N[0],local_n1,n_tuples,&data[h*odist] );
// Debug dump of the result.
1493 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1495 for(
int id=0;
id<nprocs_1;++id){
1497 for(
int h=0;h<howmany;h++)
1498 for(
int i=0;i<N[0];i++){
1499 std::cout<<std::endl;
1500 for(
int j=0;j<local_n1;j++){
1501 ptr=h*odist+(i*local_n1+j)*n_tuples;
1502 std::cout<<
'\t'<<data[ptr]<<
","<<data[ptr+1];
1506 MPI_Barrier(T_plan->comm);
1510 reshuffle_time+=MPI_Wtime();
1511 MPI_Barrier(T_plan->comm);
// Release the send-request array.
// NOTE(review): the matching delete[] for `request` is not visible here.
1513 delete [] s_request;
// Rank-0 timing report (its guard is not visible).
1517 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1518 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1519 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1520 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Accumulate into the caller's timing array:
// [0] wall clock, [1] shuffle, [2] communication, [3] reshuffle.
1522 timings[0]+=MPI_Wtime();
1523 timings[1]+=shuffle_time;
1524 timings[2]+=comm_time;
1525 timings[3]+=reshuffle_time;
// fast_transpose_v2_h: batched distributed transpose that packs the input into
// T_plan->buffer_2, exchanges it with MPI_Alltoallv / MPI_Alltoall into
// T_plan->buffer, and unpacks the result back into `data`.
//   T_plan  - plan providing comm, nprocs/procid, nprocs_0/nprocs_1, N, local_n0,
//             local_n1, n_tuples, the two work buffers and the *_proc_f count/offset tables.
//   data    - input array; it is overwritten with the result. Batch h is read at
//             data[h*idist] and written at data[h*odist].
//   timings - accumulators updated with +=/-=: [0] wall time, [1] shuffle,
//             [2] communication, [3] reshuffle.
//   flags   - bit field read through std::bitset<8>; only bits 0 and 1 are tested here.
//   howmany - number of batches.
//   tag     - appears unused in this routine (the collectives take no tag) - TODO confirm.
1528 void fast_transpose_v2_h(T_Plan* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
1530 std::bitset<8> Flags(flags);
// Bit 1 set, bit 0 clear, single process: synchronize on the plan communicator.
1531 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1532 MPI_Barrier(T_plan->comm);
// Hand the work to transpose_v6 (note: `tag` is not forwarded).
1536 transpose_v6(T_plan,(
double*)data,timings,flags,howmany);
1537 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; the matching += is at the end of the routine.
1540 timings[0]-=MPI_Wtime();
1542 int nprocs_0, nprocs_1;
1543 nprocs=T_plan->nprocs;
1544 procid=T_plan->procid;
1545 nprocs_0=T_plan->nprocs_0;
1546 nprocs_1=T_plan->nprocs_1;
1547 ptrdiff_t *N=T_plan->N;
1548 double * send_recv = T_plan->buffer;
1549 double * buffer_2= T_plan->buffer_2;
1550 ptrdiff_t local_n0=T_plan->local_n0;
1551 ptrdiff_t local_n1=T_plan->local_n1;
1552 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides, in doubles, of the input and output layouts.
// NOTE(review): products of ptrdiff_t values are stored in int - confirm they cannot exceed INT_MAX.
1554 int idist=N[1]*local_n0*n_tuples;
1555 int odist=N[0]*local_n1*n_tuples;
// All timers start at zero (0*MPI_Wtime()); total_time is not read in the code shown here.
1557 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input: prints data[ptr] and data[ptr+1] for every element.
1560 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1562 for(
int h=0;h<howmany;h++)
1563 for(
int id=0;
id<nprocs;++id){
1565 for(
int i=0;i<local_n0;i++){
1566 std::cout<<std::endl;
1567 for(
int j=0;j<N[1];j++){
1568 ptr=h*idist+(i*N[1]+j)*n_tuples;
1569 std::cout<<
'\t'<<data[ptr]<<
","<<data[ptr+1];
1573 MPI_Barrier(T_plan->comm);
// Per-process row/column counts and starting indices taken from the plan.
1580 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1581 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1582 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1583 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1584 shuffle_time-=MPI_Wtime();
// Single process, bits 0 and 1 both set: one in-place local transpose per batch.
1585 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1586 #pragma omp parallel for
1587 for(
int h=0;h<howmany;h++)
1588 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
// Single process, bits 0 and 1 both clear: transpose the full N[0] x N[1] array per batch.
1590 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1591 #pragma omp parallel for
1592 for(
int h=0;h<howmany;h++)
1593 local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): timings[0] receives both the elapsed wall time and shuffle_time here,
// whereas the end of the routine adds only the wall time - confirm this is intended.
1596 shuffle_time+=MPI_Wtime();
1597 timings[0]+=MPI_Wtime();
1598 timings[0]+=shuffle_time;
1599 timings[1]+=shuffle_time;
1602 MPI_Barrier(T_plan->comm);
// Pack into buffer_2: grouped by proc (outermost), then batch, then local row.
// Each copy takes the local_n1_proc[proc] tuples of row i that start at column
// local_1_start_proc[proc]; ptr advances by the number of doubles copied.
1612 for (
int proc=0;proc<nprocs_1;++proc)
1613 for(
int h=0;h<howmany;++h){
1614 for(
int i=0;i<local_n0;++i){
1620 memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],
sizeof(
double)*n_tuples*local_n1_proc[proc]);
1621 ptr+=n_tuples*local_n1_proc[proc];
1625 shuffle_time+=MPI_Wtime();
// Debug dump of the packed buffer.
1627 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1629 for(
int id=0;
id<nprocs;++id){
1630 for(
int h=0;h<howmany;h++)
1632 for(
int i=0;i<N[1];i++){
1633 std::cout<<std::endl;
1634 for(
int j=0;j<local_n0;j++){
1635 std::cout<<
'\t'<<buffer_2[ptr]<<
","<<buffer_2[ptr+1];
1640 MPI_Barrier(T_plan->comm);
// Count/offset tables (the "_f" set of the plan) used by the collective below.
1649 int* scount_proc_f= T_plan->scount_proc_f;
1650 int* rcount_proc_f= T_plan->rcount_proc_f;
1651 int* soffset_proc_f= T_plan->soffset_proc_f;
1652 int* roffset_proc_f= T_plan->roffset_proc_f;
1654 MPI_Barrier(T_plan->comm);
1657 comm_time-=MPI_Wtime();
1659 int soffset=0,roffset=0;
// Send from the packed buffer, receive into the plan's primary buffer.
1663 double *s_buf, *r_buf;
1664 s_buf=buffer_2; r_buf=send_recv;
// Uneven distribution: per-process counts and offsets via MPI_Alltoallv.
// The MPI_Alltoall call after it uses entry 0 of the count tables for every process.
1666 if(T_plan->is_evenly_distributed==0)
1667 MPI_Alltoallv(s_buf,scount_proc_f,
1668 soffset_proc_f, MPI_DOUBLE,r_buf,
1669 rcount_proc_f,roffset_proc_f, MPI_DOUBLE,
1672 MPI_Alltoall(s_buf, scount_proc_f[0], MPI_DOUBLE,
1673 r_buf, rcount_proc_f[0], MPI_DOUBLE,
1676 comm_time+=MPI_Wtime();
// Debug dump of the received buffer.
1680 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1682 for(
int id=0;
id<nprocs;++id){
1684 for(
int h=0;h<howmany;h++)
1685 for(
int i=0;i<local_n1;i++){
1686 std::cout<<std::endl;
1687 for(
int j=0;j<N[0];j++){
1688 std::cout<<
'\t'<<send_recv[ptr]<<
","<<send_recv[ptr+1];
1693 MPI_Barrier(T_plan->comm);
1699 reshuffle_time-=MPI_Wtime();
// Unpack: walk send_recv sequentially (proc, then batch, then row) and place each
// run of local_n1 tuples at row i of batch h in data, for the rows
// [local_0_start_proc[proc], local_0_start_proc[proc]+local_n0_proc[proc]).
1701 for (
int proc=0;proc<nprocs_0;++proc)
1702 for(
int h=0;h<howmany;++h){
1703 for(
int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
1704 memcpy(&data[h*odist+(i*local_n1)*n_tuples],&send_recv[ptr],local_n1*
sizeof(
double)*n_tuples);
1705 ptr+=n_tuples*local_n1;
// Per-batch local transpose of the unpacked N[0] x local_n1 block.
1717 #pragma omp parallel for
1718 for(
int h=0;h<howmany;h++)
1719 local_transpose(N[0],local_n1,n_tuples,&data[h*odist] );
// Debug dump of the result.
1722 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1724 for(
int id=0;
id<nprocs_1;++id){
1726 for(
int h=0;h<howmany;h++)
1727 for(
int i=0;i<N[0];i++){
1728 std::cout<<std::endl;
1729 for(
int j=0;j<local_n1;j++){
1730 ptr=h*odist+(i*local_n1+j)*n_tuples;
1731 std::cout<<
'\t'<<data[ptr]<<
","<<data[ptr+1];
1735 MPI_Barrier(T_plan->comm);
1740 reshuffle_time+=MPI_Wtime();
1741 MPI_Barrier(T_plan->comm);
// Timing report (PCOUT prints on procid 0 only).
1745 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1746 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1747 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1748 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Close the wall-clock measurement and add the phase timers to the caller's accumulators.
1750 timings[0]+=MPI_Wtime();
1751 timings[1]+=shuffle_time;
1752 timings[2]+=comm_time;
1753 timings[3]+=reshuffle_time;
// fast_transpose_v3_h: batched distributed transpose with the same pack / exchange /
// unpack structure as fast_transpose_v2_h, but the exchange is done with
// par::Mpi_Alltoallv_dense, parameterised by `kway`.
//   T_plan  - plan providing comm, nprocs/procid, nprocs_0/nprocs_1, N, local_n0,
//             local_n1, n_tuples, the two work buffers, the *_proc_f tables and kway_async.
//   data    - input array; it is overwritten with the result. Batch h is read at
//             data[h*idist] and written at data[h*odist].
//   timings - accumulators updated with +=/-=: [0] wall time, [1] shuffle,
//             [2] communication, [3] reshuffle.
//   kway    - forwarded unchanged to par::Mpi_Alltoallv_dense and to transpose_v7.
//   flags   - bit field read through std::bitset<8>; only bits 0 and 1 are tested here.
//   howmany - number of batches.
//   tag     - appears unused in this routine - TODO confirm.
1756 void fast_transpose_v3_h(T_Plan* T_plan,
double * data,
double *timings,
int kway,
unsigned flags,
int howmany,
int tag ){
1758 std::bitset<8> Flags(flags);
// Bit 1 set, bit 0 clear, single process: synchronize on the plan communicator.
1759 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1760 MPI_Barrier(T_plan->comm);
// Hand the work to transpose_v7 (note: `tag` is not forwarded).
1764 transpose_v7(T_plan,(
double*)data,timings,kway,flags,howmany);
1765 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; the matching += is at the end of the routine.
1768 timings[0]-=MPI_Wtime();
1770 int nprocs_0, nprocs_1;
1771 nprocs=T_plan->nprocs;
1772 procid=T_plan->procid;
1773 nprocs_0=T_plan->nprocs_0;
1774 nprocs_1=T_plan->nprocs_1;
1775 ptrdiff_t *N=T_plan->N;
1776 double * send_recv = T_plan->buffer;
1777 double * buffer_2= T_plan->buffer_2;
1778 ptrdiff_t local_n0=T_plan->local_n0;
1779 ptrdiff_t local_n1=T_plan->local_n1;
1780 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides, in doubles, of the input and output layouts.
// NOTE(review): products of ptrdiff_t values are stored in int - confirm they cannot exceed INT_MAX.
1782 int idist=N[1]*local_n0*n_tuples;
1783 int odist=N[0]*local_n1*n_tuples;
// All timers start at zero (0*MPI_Wtime()); total_time is not read in the code shown here.
1785 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input: prints data[ptr] and data[ptr+1] for every element.
1788 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
1790 for(
int h=0;h<howmany;h++)
1791 for(
int id=0;
id<nprocs;++id){
1793 for(
int i=0;i<local_n0;i++){
1794 std::cout<<std::endl;
1795 for(
int j=0;j<N[1];j++){
1796 ptr=h*idist+(i*N[1]+j)*n_tuples;
1797 std::cout<<
'\t'<<data[ptr]<<
","<<data[ptr+1];
1801 MPI_Barrier(T_plan->comm);
// Per-process row/column counts and starting indices taken from the plan.
1808 ptrdiff_t *local_n1_proc=&T_plan->local_n1_proc[0];
1809 ptrdiff_t *local_n0_proc=&T_plan->local_n0_proc[0];
1810 ptrdiff_t *local_0_start_proc=T_plan->local_0_start_proc;
1811 ptrdiff_t *local_1_start_proc=T_plan->local_1_start_proc;
1812 shuffle_time-=MPI_Wtime();
// Single process, bits 0 and 1 both set: one in-place local transpose per batch.
1813 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
1814 #pragma omp parallel for
1815 for(
int h=0;h<howmany;h++)
1816 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
// Single process, bits 0 and 1 both clear: transpose the full N[0] x N[1] array per batch.
1818 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
1819 #pragma omp parallel for
1820 for(
int h=0;h<howmany;h++)
1821 local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): timings[0] receives both the elapsed wall time and shuffle_time here,
// whereas the end of the routine adds only the wall time - confirm this is intended.
1824 shuffle_time+=MPI_Wtime();
1825 timings[0]+=MPI_Wtime();
1826 timings[0]+=shuffle_time;
1827 timings[1]+=shuffle_time;
1830 MPI_Barrier(T_plan->comm);
// Pack into buffer_2: grouped by proc (outermost), then batch, then local row.
// Each copy takes the local_n1_proc[proc] tuples of row i that start at column
// local_1_start_proc[proc]; ptr advances by the number of doubles copied.
1840 for (
int proc=0;proc<nprocs_1;++proc)
1841 for(
int h=0;h<howmany;++h){
1842 for(
int i=0;i<local_n0;++i){
1848 memcpy(&buffer_2[ptr],&data[h*idist+(i*N[1]+local_1_start_proc[proc])*n_tuples],
sizeof(
double)*n_tuples*local_n1_proc[proc]);
1849 ptr+=n_tuples*local_n1_proc[proc];
1853 shuffle_time+=MPI_Wtime();
// Debug dump of the packed buffer.
1855 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
1857 for(
int id=0;
id<nprocs;++id){
1858 for(
int h=0;h<howmany;h++)
1860 for(
int i=0;i<N[1];i++){
1861 std::cout<<std::endl;
1862 for(
int j=0;j<local_n0;j++){
1863 std::cout<<
'\t'<<buffer_2[ptr]<<
","<<buffer_2[ptr+1];
1868 MPI_Barrier(T_plan->comm);
// Count/offset tables (the "_f" set of the plan) used by the exchange below.
1877 int* scount_proc_f= T_plan->scount_proc_f;
1878 int* rcount_proc_f= T_plan->rcount_proc_f;
1879 int* soffset_proc_f= T_plan->soffset_proc_f;
1880 int* roffset_proc_f= T_plan->roffset_proc_f;
1882 MPI_Barrier(T_plan->comm);
1885 comm_time-=MPI_Wtime();
1887 int soffset=0,roffset=0;
// Send from the packed buffer, receive into the plan's primary buffer.
1891 double *s_buf, *r_buf;
1892 s_buf=buffer_2; r_buf=send_recv;
// kway_async selects the <double,true> instantiation; the <double,false> call after it
// takes the same arguments.
1894 if(T_plan->kway_async)
1895 par::Mpi_Alltoallv_dense<double,true>(s_buf , scount_proc_f, soffset_proc_f,
1896 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1898 par::Mpi_Alltoallv_dense<double,false>(s_buf , scount_proc_f, soffset_proc_f,
1899 r_buf, rcount_proc_f, roffset_proc_f, T_plan->comm,kway);
1901 comm_time+=MPI_Wtime();
// Debug dump of the received buffer.
1905 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
1907 for(
int id=0;
id<nprocs;++id){
1909 for(
int h=0;h<howmany;h++)
1910 for(
int i=0;i<local_n1;i++){
1911 std::cout<<std::endl;
1912 for(
int j=0;j<N[0];j++){
1913 std::cout<<
'\t'<<send_recv[ptr]<<
","<<send_recv[ptr+1];
1918 MPI_Barrier(T_plan->comm);
1924 reshuffle_time-=MPI_Wtime();
// Unpack: walk send_recv sequentially (proc, then batch, then row) and place each
// run of local_n1 tuples at row i of batch h in data, for the rows
// [local_0_start_proc[proc], local_0_start_proc[proc]+local_n0_proc[proc]).
1926 for (
int proc=0;proc<nprocs_0;++proc)
1927 for(
int h=0;h<howmany;++h){
1928 for(
int i=local_0_start_proc[proc];i<local_0_start_proc[proc]+local_n0_proc[proc];++i){
1929 memcpy(&data[h*odist+(i*local_n1)*n_tuples],&send_recv[ptr],local_n1*
sizeof(
double)*n_tuples);
1930 ptr+=n_tuples*local_n1;
// Per-batch local transpose of the unpacked N[0] x local_n1 block.
1942 #pragma omp parallel for
1943 for(
int h=0;h<howmany;h++)
1944 local_transpose(N[0],local_n1,n_tuples,&data[h*odist] );
// Debug dump of the result.
1947 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
1949 for(
int id=0;
id<nprocs_1;++id){
1951 for(
int h=0;h<howmany;h++)
1952 for(
int i=0;i<N[0];i++){
1953 std::cout<<std::endl;
1954 for(
int j=0;j<local_n1;j++){
1955 ptr=h*odist+(i*local_n1+j)*n_tuples;
1956 std::cout<<
'\t'<<data[ptr]<<
","<<data[ptr+1];
1960 MPI_Barrier(T_plan->comm);
1965 reshuffle_time+=MPI_Wtime();
1966 MPI_Barrier(T_plan->comm);
// Timing report (PCOUT prints on procid 0 only).
1970 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
1971 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
1972 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
1973 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Close the wall-clock measurement and add the phase timers to the caller's accumulators.
1975 timings[0]+=MPI_Wtime();
1976 timings[1]+=shuffle_time;
1977 timings[2]+=comm_time;
1978 timings[3]+=reshuffle_time;
// transpose_v5: batched distributed transpose that transposes each batch locally,
// exchanges blocks with non-blocking MPI_Isend / MPI_Irecv using the plan's
// per-process datatypes (stype / rtype), then reorders the received data into `data`.
//   T_plan  - plan providing comm, nprocs/procid, nprocs_0/nprocs_1, N, local_n0,
//             local_n1, n_tuples, buffer, the scount/rcount/soffset/roffset tables,
//             stype/rtype, local_n0_proc, last_recv_count and alloc_local.
//   data    - input array; it is overwritten with the result. Batch h is read at
//             data[h*idist] and written at data[h*odist].
//   timings - accumulators updated with +=/-=: [0] wall time, [1] shuffle,
//             [2] communication, [3] reshuffle.
//   flags   - bit field read through std::bitset<8>; only bits 0 and 1 are tested here.
//   howmany - number of batches.
//   tag     - MPI message tag used for every Isend/Irecv posted here.
1982 void transpose_v5(T_Plan* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
1984 std::bitset<8> Flags(flags);
// Bit 1 set, bit 0 clear, single process: synchronize on the plan communicator.
1985 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
1986 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; the matching += is at the end of the routine.
1989 timings[0]-=MPI_Wtime();
1991 int nprocs_0, nprocs_1;
1992 nprocs=T_plan->nprocs;
1993 procid=T_plan->procid;
1994 nprocs_0=T_plan->nprocs_0;
1995 nprocs_1=T_plan->nprocs_1;
1996 ptrdiff_t *N=T_plan->N;
1997 double * send_recv = T_plan->buffer;
1998 ptrdiff_t local_n0=T_plan->local_n0;
1999 ptrdiff_t local_n1=T_plan->local_n1;
2000 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides, in doubles, of the input and output layouts.
// NOTE(review): products of ptrdiff_t values are stored in int - confirm they cannot exceed INT_MAX.
2002 int idist=N[1]*local_n0*n_tuples;
2003 int odist=N[0]*local_n1*n_tuples;
// All timers start at zero (0*MPI_Wtime()); total_time is not read in the code shown here.
2005 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (first double of each tuple).
2007 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2009 for(
int h=0;h<howmany;h++)
2010 for(
int id=0;
id<nprocs;++id){
2012 for(
int i=0;i<local_n0;i++){
2013 std::cout<<std::endl;
2014 for(
int j=0;j<N[1];j++){
2015 std::cout<<
'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
2019 MPI_Barrier(T_plan->comm);
2026 shuffle_time-=MPI_Wtime();
// First local transpose (local_n0 x N[1]): a single call on data, and a per-batch
// OpenMP loop; presumably selected by howmany - TODO confirm.
2030 local_transpose(local_n0,N[1],n_tuples,data );
2032 #pragma omp parallel for
2033 for(
int i=0;i<howmany;i++)
2034 local_transpose(local_n0,N[1],n_tuples,&data[i*idist] );
2037 shuffle_time+=MPI_Wtime();
// Debug dump after the local transpose.
2038 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2040 for(
int h=0;h<howmany;h++)
2041 for(
int id=0;
id<nprocs;++id){
2043 for(
int i=0;i<N[1];i++){
2044 std::cout<<std::endl;
2045 for(
int j=0;j<local_n0;j++){
2046 std::cout<<
'\t'<<data[h*idist+(i*local_n0+j)*n_tuples];
2050 MPI_Barrier(T_plan->comm);
// Single process, bits 0 and 1 both set: transpose each batch back in place.
2053 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2054 #pragma omp parallel for
2055 for(
int h=0;h<howmany;h++)
2056 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
// NOTE(review): timings[0] receives both the elapsed wall time and shuffle_time here,
// whereas the end of the routine adds only the wall time - confirm this is intended.
2059 timings[0]+=MPI_Wtime();
2060 timings[0]+=shuffle_time;
2061 timings[1]+=shuffle_time;
2064 MPI_Barrier(T_plan->comm);
// Count/offset tables and per-process MPI datatypes from the plan.
2073 int* scount_proc= T_plan->scount_proc;
2074 int* rcount_proc= T_plan->rcount_proc;
2075 int* soffset_proc= T_plan->soffset_proc;
2076 int* roffset_proc= T_plan->roffset_proc;
2078 MPI_Datatype *stype=T_plan->stype;
2079 MPI_Datatype *rtype=T_plan->rtype;
2080 MPI_Barrier(T_plan->comm);
2082 comm_time-=MPI_Wtime();
2084 int soffset=0,roffset=0;
// One send and one receive request slot per process, all initialised to
// MPI_REQUEST_NULL so that waiting on a slot that was never posted is harmless.
2086 MPI_Request * s_request=
new MPI_Request[nprocs];
2087 MPI_Request * request=
new MPI_Request[nprocs];
2088 #pragma omp parallel for
2089 for (
int proc=0;proc<nprocs;++proc){
2090 request[proc]=MPI_REQUEST_NULL;
2091 s_request[proc]=MPI_REQUEST_NULL;
// Post the sends: one element of stype[proc] starting at data[soffset_proc[proc]].
2095 for (
int proc=0;proc<nprocs;++proc){
2097 soffset=soffset_proc[proc];
2098 MPI_Isend(&data[soffset],1,stype[proc],proc, tag,
2099 T_plan->comm, &s_request[proc]);
// Post the receives: one element of rtype[proc] into send_recv[roffset_proc[proc]].
2103 for (
int proc=0;proc<nprocs;++proc){
2105 roffset=roffset_proc[proc];
2106 MPI_Irecv(&send_recv[roffset],1, rtype[proc], proc,
2107 tag, T_plan->comm, &request[proc]);
// Direct copy of scount_proc[proc] doubles per batch from data into send_recv;
// presumably the proc==procid case that bypasses MPI - TODO confirm.
2110 soffset=soffset_proc[proc];
2111 roffset=roffset_proc[proc];
2112 for(
int h=0;h<howmany;h++)
2113 memcpy(&send_recv[h*odist+roffset],&data[h*idist+soffset],
sizeof(
double)*scount_proc[proc]);
// Complete every receive and every send before reading send_recv.
2117 for (
int proc=0;proc<nprocs;++proc){
2118 MPI_Wait(&request[proc], &ierr);
2119 MPI_Wait(&s_request[proc], &ierr);
2121 comm_time+=MPI_Wtime();
// Debug dump of the received buffer.
2123 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2125 for(
int h=0;h<howmany;h++)
2126 for(
int id=0;
id<nprocs;++id){
2128 for(
int i=0;i<local_n1;i++){
2129 std::cout<<std::endl;
2130 for(
int j=0;j<N[0];j++){
2131 std::cout<<
'\t'<<send_recv[h*odist+(i*N[0]+j)*n_tuples];
2135 MPI_Barrier(T_plan->comm);
2141 reshuffle_time-=MPI_Wtime();
// Widths, in doubles, used to reorder the received blocks: first_ntuples comes from
// process 0's row count, last_ntuples from last_recv_count divided by local_n1.
2150 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
2152 last_ntuples=T_plan->last_recv_count/((int)local_n1);
// Per batch: either a plain copy of alloc_local/howmany bytes, or - when the two
// widths differ - an in-buffer transpose over nprocs_0-1 blocks followed by the
// 6-argument local_transpose writing send_recv -> data, or - when they match - a
// single out-of-place local_transpose. (Exact semantics of the local_transpose
// overloads are defined elsewhere.)
2153 #pragma omp parallel for
2154 for(
int i=0;i<howmany;i++){
2156 memcpy(&data[i*odist],&send_recv[i*odist],T_plan->alloc_local/howmany );
2157 else if(last_ntuples!=first_ntuples){
2158 local_transpose((nprocs_0-1),local_n1,first_ntuples,&send_recv[i*odist] );
2159 local_transpose(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[i*odist],&data[i*odist] );
2161 else if(last_ntuples==first_ntuples){
2163 local_transpose(nprocs_0,local_n1,first_ntuples,&send_recv[i*odist],&data[i*odist] );
// Debug dump after the reorder.
2169 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2171 for(
int h=0;h<howmany;h++)
2172 for(
int id=0;
id<nprocs_1;++id){
2174 for(
int i=0;i<local_n1;i++){
2175 std::cout<<std::endl;
2176 for(
int j=0;j<N[0];j++){
2177 std::cout<<
'\t'<<data[h*odist+(i*N[0]+j)*n_tuples];
2181 MPI_Barrier(T_plan->comm);
// Final local transpose (local_n1 x N[0]): single call on data, and a per-batch
// OpenMP loop; presumably selected by howmany - TODO confirm.
2187 local_transpose(local_n1,N[0],n_tuples,data );
2189 #pragma omp parallel for
2190 for(
int h=0;h<howmany;h++)
2191 local_transpose(local_n1,N[0],n_tuples,&data[h*odist] );
2194 reshuffle_time+=MPI_Wtime();
// Release the send-request array allocated above.
2196 delete [] s_request;
// Debug dump of the final output.
2198 if(VERBOSE>=2) PCOUT<<
"Transposed Out"<<std::endl;
2200 for(
int h=0;h<howmany;h++)
2201 for(
int id=0;
id<nprocs_1;++id){
2203 for(
int i=0;i<N[0];i++){
2204 std::cout<<std::endl;
2205 for(
int j=0;j<local_n1;j++){
2206 std::cout<<
'\t'<<data[h*odist+(i*local_n1+j)*n_tuples];
2210 MPI_Barrier(T_plan->comm);
2212 MPI_Barrier(T_plan->comm);
// Timing report (PCOUT prints on procid 0 only).
2215 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2216 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2217 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2218 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Close the wall-clock measurement and add the phase timers to the caller's accumulators.
2220 timings[0]+=MPI_Wtime();
2221 timings[1]+=shuffle_time;
2222 timings[2]+=comm_time;
2223 timings[3]+=reshuffle_time;
// transpose_v6: batched distributed transpose that transposes each batch locally,
// exchanges the data with a single MPI collective (MPI_Alltoallw, MPI_Alltoallv or
// MPI_Alltoall), then reorders the received data into `data`.
//   T_plan  - plan providing comm, nprocs/procid, nprocs_0/nprocs_1, N, local_n0,
//             local_n1, n_tuples, buffer, the count/offset tables (plain and "_w"),
//             stype/rtype, is_evenly_distributed, local_n0_proc, last_recv_count
//             and alloc_local.
//   data    - input array; it is overwritten with the result. Batch h is read at
//             data[h*idist] and written at data[h*odist].
//   timings - accumulators updated with +=/-=: [0] wall time, [1] shuffle,
//             [2] communication, [3] reshuffle.
//   flags   - bit field read through std::bitset<8>; only bits 0 and 1 are tested here.
//   howmany - number of batches.
2226 void transpose_v6(T_Plan* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany ){
2228 std::bitset<8> Flags(flags);
// Bit 1 set, bit 0 clear, single process: synchronize on the plan communicator.
2229 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2230 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; the matching += is at the end of the routine.
2233 timings[0]-=MPI_Wtime();
2235 int nprocs_0, nprocs_1;
2236 nprocs=T_plan->nprocs;
2237 procid=T_plan->procid;
2238 nprocs_0=T_plan->nprocs_0;
2239 nprocs_1=T_plan->nprocs_1;
2240 ptrdiff_t *N=T_plan->N;
2241 double * send_recv = T_plan->buffer;
2242 ptrdiff_t local_n0=T_plan->local_n0;
2243 ptrdiff_t local_n1=T_plan->local_n1;
2244 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides, in doubles, of the input and output layouts.
// NOTE(review): products of ptrdiff_t values are stored in int - confirm they cannot exceed INT_MAX.
2245 int idist=N[1]*local_n0*n_tuples;
2246 int odist=N[0]*local_n1*n_tuples;
// All timers start at zero (0*MPI_Wtime()); total_time is not read in the code shown here.
2248 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (first double of each tuple).
2250 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2252 for (
int h=0;h<howmany;h++)
2253 for(
int id=0;
id<nprocs;++id){
2255 for(
int i=0;i<local_n0;i++){
2256 std::cout<<std::endl;
2257 for(
int j=0;j<N[1];j++){
2258 std::cout<<
'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
2262 MPI_Barrier(T_plan->comm);
2269 shuffle_time-=MPI_Wtime();
// First local transpose (local_n0 x N[1]) of every batch; the offset expression
// equals i*idist written out in full.
2272 for(
int i=0;i<howmany;i++)
2273 local_transpose(local_n0,N[1],n_tuples,&data[i*local_n0*N[1]*n_tuples] );
2276 shuffle_time+=MPI_Wtime();
// Debug dump after the local transpose (two doubles per element).
2277 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2279 for (
int h=0;h<howmany;h++)
2280 for(
int id=0;
id<nprocs;++id){
2282 for(
int i=0;i<N[1];i++){
2283 std::cout<<std::endl;
2284 for(
int j=0;j<local_n0;j++){
2285 std::cout<<
'\t'<<data[idist*h+(i*local_n0+j)*n_tuples]<<
","<<data[idist*h+(i*local_n0+j)*n_tuples+1];
2289 MPI_Barrier(T_plan->comm);
// Single process, bits 0 and 1 both set: transpose back in place.
// NOTE(review): only `data` (the first batch) is passed here - confirm for howmany>1.
2292 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2293 local_transpose(local_n1,N[0],n_tuples,data );
// NOTE(review): timings[0] receives both the elapsed wall time and shuffle_time here,
// whereas the end of the routine adds only the wall time - confirm this is intended.
2296 timings[0]+=MPI_Wtime();
2297 timings[0]+=shuffle_time;
2298 timings[1]+=shuffle_time;
2301 MPI_Barrier(T_plan->comm);
// Count/offset tables and per-process MPI datatypes from the plan.
2310 int* scount_proc= T_plan->scount_proc;
2311 int* rcount_proc= T_plan->rcount_proc;
2312 int* soffset_proc= T_plan->soffset_proc;
2313 int* roffset_proc= T_plan->roffset_proc;
2314 MPI_Datatype *stype=T_plan->stype;
2315 MPI_Datatype *rtype=T_plan->rtype;
2317 MPI_Barrier(T_plan->comm);
2319 comm_time-=MPI_Wtime();
// Exchange data -> send_recv. Three collectives are available: MPI_Alltoallw with the
// "_w" tables and derived datatypes, MPI_Alltoallv with per-process counts when the
// distribution is uneven, and MPI_Alltoall using entry 0 of the count tables.
2324 MPI_Alltoallw(data, T_plan->scount_proc_w,
2325 T_plan->soffset_proc_w, stype,
2326 send_recv,T_plan->rcount_proc_w, T_plan->roffset_proc_w,
2327 rtype, T_plan->comm);
2329 else if(T_plan->is_evenly_distributed==0)
2330 MPI_Alltoallv(data,scount_proc,
2331 soffset_proc, MPI_DOUBLE,send_recv,
2332 rcount_proc,roffset_proc, MPI_DOUBLE,
2335 MPI_Alltoall(data, scount_proc[0], MPI_DOUBLE,
2336 send_recv, rcount_proc[0], MPI_DOUBLE,
2339 comm_time+=MPI_Wtime();
// Debug dump of the received buffer.
2341 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2343 for (
int h=0;h<howmany;h++)
2344 for(
int id=0;
id<nprocs;++id){
2346 for(
int i=0;i<local_n1;i++){
2347 std::cout<<std::endl;
2348 for(
int j=0;j<N[0];j++){
2349 std::cout<<
'\t'<<send_recv[odist*h+(i*N[0]+j)*n_tuples];
2353 MPI_Barrier(T_plan->comm);
2363 reshuffle_time-=MPI_Wtime();
// Widths, in doubles, used to reorder the received blocks: first_ntuples comes from
// process 0's row count, last_ntuples from last_recv_count divided by local_n1.
2372 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
2374 last_ntuples=T_plan->last_recv_count/((int)local_n1);
// Per batch: either a plain copy of alloc_local/howmany bytes, or - when the two
// widths differ - an in-buffer transpose over nprocs_0-1 blocks followed by the
// 6-argument local_transpose writing send_recv -> data, or - when they match - a
// single out-of-place local_transpose. (Exact semantics of the local_transpose
// overloads are defined elsewhere.)
2375 for(
int i=0;i<howmany;i++){
2377 memcpy(&data[i*odist],&send_recv[i*odist],T_plan->alloc_local/howmany );
2378 else if(last_ntuples!=first_ntuples){
2379 local_transpose((nprocs_0-1),local_n1,first_ntuples,&send_recv[i*odist] );
2380 local_transpose(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[i*odist],&data[i*odist] );
2382 else if(last_ntuples==first_ntuples){
2384 local_transpose(nprocs_0,local_n1,first_ntuples,&send_recv[i*odist],&data[i*odist] );
// Debug dump after the reorder.
2389 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2391 for (
int h=0;h<howmany;h++)
2392 for(
int id=0;
id<nprocs_1;++id){
2394 for(
int i=0;i<local_n1;i++){
2395 std::cout<<std::endl;
2396 for(
int j=0;j<N[0];j++){
2397 std::cout<<
'\t'<<data[odist*h+(i*N[0]+j)*n_tuples];
2401 MPI_Barrier(T_plan->comm);
// Final local transpose (local_n1 x N[0]) of every batch.
2405 for (
int h=0;h<howmany;h++)
2406 local_transpose(local_n1,N[0],n_tuples,&data[odist*h] );
// Debug dump of the final output.
2408 if(VERBOSE>=2) PCOUT<<
"Transposed Out"<<std::endl;
2410 for (
int h=0;h<howmany;h++)
2411 for(
int id=0;
id<nprocs_1;++id){
2413 for(
int i=0;i<N[0];i++){
2414 std::cout<<std::endl;
2415 for(
int j=0;j<local_n1;j++){
2416 std::cout<<
'\t'<<data[odist*h+(i*local_n1+j)*n_tuples];
2420 MPI_Barrier(T_plan->comm);
2425 reshuffle_time+=MPI_Wtime();
2427 MPI_Barrier(T_plan->comm);
// Timing report (PCOUT prints on procid 0 only).
2429 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2430 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2431 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2432 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Close the wall-clock measurement and add the phase timers to the caller's accumulators.
2434 timings[0]+=MPI_Wtime();
2435 timings[1]+=shuffle_time;
2436 timings[2]+=comm_time;
2437 timings[3]+=reshuffle_time;
// transpose_v7: distributed transpose that transposes each batch locally, exchanges
// the data with par::Mpi_Alltoallv_dense (parameterised by `kway`), then reorders the
// received data into `data`.
//   T_plan  - plan providing comm, nprocs/procid, nprocs_0/nprocs_1, N, local_n0,
//             local_n1, n_tuples, buffer, the scount/rcount/soffset/roffset tables,
//             kway_async, local_n0_proc, last_recv_count and alloc_local.
//   data    - input array; it is overwritten with the result.
//   timings - accumulators updated with +=/-=: [0] wall time, [1] shuffle,
//             [2] communication, [3] reshuffle.
//   kway    - forwarded unchanged to par::Mpi_Alltoallv_dense.
//   flags   - bit field read through std::bitset<8>; only bits 0 and 1 are tested here.
//   howmany - number of batches used by the first local transpose and the reorder loop.
2440 void transpose_v7(T_Plan* T_plan,
double * data,
double *timings,
int kway,
unsigned flags,
int howmany ){
2441 std::bitset<8> Flags(flags);
// Bit 1 set, bit 0 clear, single process: synchronize on the plan communicator.
2442 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2443 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; the matching += is at the end of the routine.
2447 timings[0]-=MPI_Wtime();
2449 int nprocs_0, nprocs_1;
2450 nprocs=T_plan->nprocs;
2451 procid=T_plan->procid;
2452 nprocs_0=T_plan->nprocs_0;
2453 nprocs_1=T_plan->nprocs_1;
2454 ptrdiff_t *N=T_plan->N;
2455 double * send_recv = T_plan->buffer;
2456 ptrdiff_t local_n0=T_plan->local_n0;
2457 ptrdiff_t local_n1=T_plan->local_n1;
2458 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides, in doubles, of the input and output layouts.
// NOTE(review): products of ptrdiff_t values are stored in int - confirm they cannot exceed INT_MAX.
2459 int idist=N[1]*local_n0*n_tuples;
2460 int odist=N[0]*local_n1*n_tuples;
// All timers start at zero (0*MPI_Wtime()); total_time is not read in the code shown here.
2462 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input; unlike the other variants it indexes the first batch only.
2464 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2466 for(
int id=0;
id<nprocs;++id){
2468 for(
int i=0;i<local_n0;i++){
2469 std::cout<<std::endl;
2470 for(
int j=0;j<N[1];j++){
2471 std::cout<<
'\t'<<data[(i*N[1]+j)*n_tuples];
2475 MPI_Barrier(T_plan->comm);
2482 shuffle_time-=MPI_Wtime();
// First local transpose (local_n0 x N[1]) of every batch; the offset expression
// equals i*idist written out in full.
2485 for(
int i=0;i<howmany;i++)
2486 local_transpose(local_n0,N[1],n_tuples,&data[i*local_n0*N[1]*n_tuples] );
2489 shuffle_time+=MPI_Wtime();
// Debug dump after the local transpose (two doubles per element, first batch only).
2490 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2492 for(
int id=0;
id<nprocs;++id){
2494 for(
int i=0;i<N[1];i++){
2495 std::cout<<std::endl;
2496 for(
int j=0;j<local_n0;j++){
2497 std::cout<<
'\t'<<data[(i*local_n0+j)*n_tuples]<<
","<<data[(i*local_n0+j)*n_tuples+1];
2501 MPI_Barrier(T_plan->comm);
// Single process, bits 0 and 1 both set: transpose back in place.
// NOTE(review): only `data` (the first batch) is passed here - confirm for howmany>1.
2503 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2504 local_transpose(local_n1,N[0],n_tuples,data );
// NOTE(review): timings[0] receives both the elapsed wall time and shuffle_time here,
// whereas the end of the routine adds only the wall time - confirm this is intended.
2508 timings[0]+=MPI_Wtime();
2509 timings[0]+=shuffle_time;
2510 timings[1]+=shuffle_time;
2513 MPI_Barrier(T_plan->comm);
// Count/offset tables from the plan.
2522 int* scount_proc= T_plan->scount_proc;
2523 int* rcount_proc= T_plan->rcount_proc;
2524 int* soffset_proc= T_plan->soffset_proc;
2525 int* roffset_proc= T_plan->roffset_proc;
2527 comm_time-=MPI_Wtime();
// Exchange data -> send_recv. kway_async selects the <double,true> instantiation;
// the <double,false> call after it takes the same arguments.
2529 if(T_plan->kway_async)
2530 par::Mpi_Alltoallv_dense<double,true>(data , scount_proc, soffset_proc,
2531 send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
2533 par::Mpi_Alltoallv_dense<double,false>(data , scount_proc, soffset_proc,
2534 send_recv, rcount_proc, roffset_proc, T_plan->comm,kway);
2535 comm_time+=MPI_Wtime();
// Debug dump of the received buffer (first batch only).
2537 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2539 for(
int id=0;
id<nprocs;++id){
2541 for(
int i=0;i<local_n1;i++){
2542 std::cout<<std::endl;
2543 for(
int j=0;j<N[0];j++){
2544 std::cout<<
'\t'<<send_recv[(i*N[0]+j)*n_tuples];
2548 MPI_Barrier(T_plan->comm);
2558 reshuffle_time-=MPI_Wtime();
// Widths, in doubles, used to reorder the received blocks: first_ntuples comes from
// process 0's row count, last_ntuples from last_recv_count divided by local_n1.
2567 int last_ntuples=0,first_ntuples=T_plan->local_n0_proc[0]*n_tuples;
2569 last_ntuples=T_plan->last_recv_count/((int)local_n1);
// Per batch: either a plain copy of alloc_local/howmany bytes, or - when the two
// widths differ - an in-buffer transpose over nprocs_0-1 blocks followed by the
// 6-argument local_transpose writing send_recv -> data, or - when they match - a
// single out-of-place local_transpose. (Exact semantics of the local_transpose
// overloads are defined elsewhere.)
2571 for(
int i=0;i<howmany;i++){
2573 memcpy(&data[i*odist],&send_recv[i*odist],T_plan->alloc_local/howmany );
2574 else if(last_ntuples!=first_ntuples){
2575 local_transpose((nprocs_0-1),local_n1,first_ntuples,&send_recv[i*odist] );
2576 local_transpose(2,local_n1,(nprocs_0-1)*first_ntuples,last_ntuples,&send_recv[i*odist],&data[i*odist] );
2578 else if(last_ntuples==first_ntuples){
2580 local_transpose(nprocs_0,local_n1,first_ntuples,&send_recv[i*odist],&data[i*odist] );
// Debug dump after the reorder (first batch only).
2585 if(VERBOSE>=2) PCOUT<<
"2nd Transpose"<<std::endl;
2587 for(
int id=0;
id<nprocs_1;++id){
2589 for(
int i=0;i<local_n1;i++){
2590 std::cout<<std::endl;
2591 for(
int j=0;j<N[0];j++){
2592 std::cout<<
'\t'<<data[(i*N[0]+j)*n_tuples];
2596 MPI_Barrier(T_plan->comm);
// Final local transpose (local_n1 x N[0]).
// NOTE(review): no per-batch loop is visible here, unlike transpose_v6 - confirm for howmany>1.
2600 local_transpose(local_n1,N[0],n_tuples,data );
2602 reshuffle_time+=MPI_Wtime();
2603 MPI_Barrier(T_plan->comm);
// Timing report (PCOUT prints on procid 0 only).
2605 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2606 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2607 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2608 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Close the wall-clock measurement and add the phase timers to the caller's accumulators.
2610 timings[0]+=MPI_Wtime();
2611 timings[1]+=shuffle_time;
2612 timings[2]+=comm_time;
2613 timings[3]+=reshuffle_time;
// transpose_v8: batched distributed transpose that transposes each batch locally,
// exchanges blocks with non-blocking MPI_Isend / MPI_Irecv using the plan's "_v8"
// count/offset tables and datatypes, copies the receive buffer back into `data`
// and finishes with a local transpose.
//   T_plan  - plan providing comm, nprocs/procid, nprocs_0/nprocs_1, N, local_n0,
//             local_n1, n_tuples, buffer, the *_proc_v8 tables, stype_v8/rtype_v8
//             and alloc_local.
//   data    - input array; it is overwritten with the result. Batch h is read at
//             data[h*idist] and written at data[h*odist].
//   timings - accumulators updated with +=/-=: [0] wall time, [1] shuffle,
//             [2] communication, [3] reshuffle.
//   flags   - bit field read through std::bitset<8>; bits 0 and 1 are tested, and the
//             raw value is printed.
//   howmany - number of batches.
//   tag     - MPI message tag used for every Isend/Irecv posted here.
2617 void transpose_v8(T_Plan* T_plan,
double * data,
double *timings,
unsigned flags,
int howmany,
int tag ){
2619 std::bitset<8> Flags(flags);
// Bit 1 set, bit 0 clear, single process: synchronize on the plan communicator.
2620 if(Flags[1]==1 && Flags[0]==0 && T_plan->nprocs==1){
2621 MPI_Barrier(T_plan->comm);
// Start the wall-clock measurement; the matching += is at the end of the routine.
2624 timings[0]-=MPI_Wtime();
2626 int nprocs_0, nprocs_1;
2627 nprocs=T_plan->nprocs;
2628 procid=T_plan->procid;
2630 nprocs_0=T_plan->nprocs_0;
2631 nprocs_1=T_plan->nprocs_1;
2632 ptrdiff_t *N=T_plan->N;
2633 double * send_recv = T_plan->buffer;
2634 ptrdiff_t local_n0=T_plan->local_n0;
2635 ptrdiff_t local_n1=T_plan->local_n1;
2636 ptrdiff_t n_tuples=T_plan->n_tuples;
// Per-batch strides, in doubles, of the input and output layouts.
// NOTE(review): products of ptrdiff_t values are stored in int - confirm they cannot exceed INT_MAX.
2638 int idist=N[1]*local_n0*n_tuples;
2639 int odist=N[0]*local_n1*n_tuples;
// All timers start at zero (0*MPI_Wtime()); total_time is not read in the code shown here.
2641 double comm_time=0*MPI_Wtime(), shuffle_time=0*MPI_Wtime(), reshuffle_time=0*MPI_Wtime(), total_time=0*MPI_Wtime();
// Debug dump of the input (first double of each tuple).
2643 if(VERBOSE>=2) PCOUT<<
"INPUT:"<<std::endl;
2645 for(
int h=0;h<howmany;h++)
2646 for(
int id=0;
id<nprocs;++id){
2648 for(
int i=0;i<local_n0;i++){
2649 std::cout<<std::endl;
2650 for(
int j=0;j<N[1];j++){
2651 std::cout<<
'\t'<<data[h*idist+(i*N[1]+j)*n_tuples];
2655 MPI_Barrier(T_plan->comm);
2662 shuffle_time-=MPI_Wtime();
// Single process, bits 0 and 1 both set: one in-place local transpose per batch.
2663 if(nprocs==1 && Flags[0]==1 && Flags[1]==1){
2664 #pragma omp parallel for
2665 for(
int h=0;h<howmany;h++)
2666 local_transpose(local_n1,N[0],n_tuples,&data[h*idist] );
// Single process, bits 0 and 1 both clear: transpose the full N[0] x N[1] array per batch.
2668 if(nprocs==1 && Flags[0]==0 && Flags[1]==0){
2669 #pragma omp parallel for
2670 for(
int h=0;h<howmany;h++)
2671 local_transpose(N[0],N[1],n_tuples,&data[h*idist] );
// NOTE(review): timings[0] receives both the elapsed wall time and shuffle_time here,
// whereas the end of the routine adds only the wall time - confirm this is intended.
2674 shuffle_time+=MPI_Wtime();
2675 timings[0]+=MPI_Wtime();
2676 timings[0]+=shuffle_time;
2677 timings[1]+=shuffle_time;
2680 MPI_Barrier(T_plan->comm);
// Debug print of the raw flags value.
2684 PCOUT<<
"\n flags="<<flags<<std::endl;
// First local transpose (local_n0 x N[1]) of every batch.
2687 #pragma omp parallel for
2688 for(
int i=0;i<howmany;i++)
2689 local_transpose(local_n0,N[1],n_tuples,&data[i*idist] );
// Debug dump after the local transpose.
2690 if(VERBOSE>=2) PCOUT<<
"Local Transpose:"<<std::endl;
2692 for(
int h=0;h<howmany;h++)
2693 for(
int id=0;
id<nprocs;++id){
2695 for(
int i=0;i<N[1];i++){
2696 std::cout<<std::endl;
2697 for(
int j=0;j<local_n0;j++){
2698 std::cout<<
'\t'<<data[h*idist+(i*local_n0+j)*n_tuples];
2702 MPI_Barrier(T_plan->comm);
2705 shuffle_time+=MPI_Wtime();
// The "_v8" count/offset tables and datatypes of the plan.
2714 int* scount_proc= T_plan->scount_proc_v8;
2715 int* rcount_proc= T_plan->rcount_proc_v8;
2716 int* soffset_proc= T_plan->soffset_proc_v8;
2717 int* roffset_proc= T_plan->roffset_proc_v8;
2719 MPI_Datatype* stype=T_plan->stype_v8;
2720 MPI_Datatype* rtype=T_plan->rtype_v8;
// Debug print of the local sizes and of every send count / send offset.
2722 for(
int proc=0;proc<nprocs;proc++){
2724 std::cout<<std::endl;
2725 std::cout<<
"proc= "<<proc<<
" l_n0= "<<local_n0<<
" l_n1= "<<local_n1<<std::endl;
2726 for (
int i=0;i<nprocs;i++){
2727 std::cout<<
" sc["<<i<<
"]= "<<scount_proc[i];
2728 std::cout<<
" so["<<i<<
"]= "<<soffset_proc[i]<<std::endl;
2730 std::cout<<std::endl;
2731 std::cout<<std::endl;
2733 MPI_Barrier(T_plan->comm);
2735 MPI_Barrier(T_plan->comm);
2736 comm_time-=MPI_Wtime();
2738 int soffset=0,roffset=0;
// One send and one receive request slot per process, all initialised to
// MPI_REQUEST_NULL so that waiting on a slot that was never posted is harmless.
2740 MPI_Request * s_request=
new MPI_Request[nprocs];
2741 MPI_Request * request=
new MPI_Request[nprocs];
2742 #pragma omp parallel for
2743 for (
int proc=0;proc<nprocs;++proc){
2744 request[proc]=MPI_REQUEST_NULL;
2745 s_request[proc]=MPI_REQUEST_NULL;
// Post the sends: one element of stype[proc] starting at data[soffset_proc[proc]].
2749 for (
int proc=0;proc<nprocs;++proc){
2750 soffset=soffset_proc[proc];
2751 MPI_Isend(&data[soffset],1,stype[proc],proc, tag,
2752 T_plan->comm, &s_request[proc]);
// Post the receives: one element of rtype[proc] into send_recv[roffset_proc[proc]].
2755 for (
int proc=0;proc<nprocs;++proc){
2756 roffset=roffset_proc[proc];
2757 MPI_Irecv(&send_recv[roffset],1, rtype[proc], proc,
2758 tag, T_plan->comm, &request[proc]);
// Complete every receive and every send before reading send_recv.
2761 for (
int proc=0;proc<nprocs;++proc){
2762 MPI_Wait(&request[proc], &ierr);
2763 MPI_Wait(&s_request[proc], &ierr);
2765 comm_time+=MPI_Wtime();
// Debug print of the first double of each received tuple.
2767 for(
int i=0;i<local_n1*N[0];i++)
2768 std::cout<<
" "<<send_recv[i*n_tuples]<<std::endl;
// Debug dump of the received buffer.
2770 if(VERBOSE>=2) PCOUT<<
"MPIAlltoAll:"<<std::endl;
2772 for(
int h=0;h<howmany;h++)
2773 for(
int id=0;
id<nprocs;++id){
2775 for(
int i=0;i<local_n1;i++){
2776 std::cout<<std::endl;
2777 for(
int j=0;j<N[0];j++){
2778 std::cout<<
'\t'<<send_recv[h*odist+(i*N[0]+j)*n_tuples];
2782 MPI_Barrier(T_plan->comm);
2788 reshuffle_time-=MPI_Wtime();
// Copy alloc_local bytes of the receive buffer back into data.
2796 memcpy(data,send_recv,T_plan->alloc_local);
// Final local transpose (local_n1 x N[0]): single call on data, and a per-batch
// OpenMP loop; presumably selected by howmany - TODO confirm.
2799 local_transpose(local_n1,N[0],n_tuples,data );
2801 #pragma omp parallel for
2802 for(
int h=0;h<howmany;h++)
2803 local_transpose(local_n1,N[0],n_tuples,&data[h*odist] );
// Debug dump of the final output.
2806 if(VERBOSE>=2) PCOUT<<
"Transposed Out"<<std::endl;
2808 for(
int h=0;h<howmany;h++)
2809 for(
int id=0;
id<nprocs_1;++id){
2811 for(
int i=0;i<N[0];i++){
2812 std::cout<<std::endl;
2813 for(
int j=0;j<local_n1;j++){
2814 std::cout<<
'\t'<<data[h*odist+(i*local_n1+j)*n_tuples];
2818 MPI_Barrier(T_plan->comm);
2821 reshuffle_time+=MPI_Wtime();
// Release the send-request array allocated above.
2823 delete [] s_request;
2824 MPI_Barrier(T_plan->comm);
// Timing report (PCOUT prints on procid 0 only).
2827 PCOUT<<
"Shuffle Time= "<<shuffle_time<<std::endl;
2828 PCOUT<<
"Alltoall Time= "<<comm_time<<std::endl;
2829 PCOUT<<
"Reshuffle Time= "<<reshuffle_time<<std::endl;
2830 PCOUT<<
"Total Time= "<<(shuffle_time+comm_time+reshuffle_time)<<std::endl;
// Close the wall-clock measurement and add the phase timers to the caller's accumulators.
2832 timings[0]+=MPI_Wtime();
2833 timings[1]+=shuffle_time;
2834 timings[2]+=comm_time;
2835 timings[3]+=reshuffle_time;