## Introduction

Step 3 is a short tutorial for computing complex-to-complex transforms.
Most of the code is the same as in the previous steps. The exception is that
you should call *_c2c* functions instead of *_r2c*. A good thing about C2C transforms
is that there is no special considerations for inplace transform as there is no
conjugate property. The CPU/GPU code use a flag to switch between inplace/outplace
transforms.

## Code Explanation

The code for this step is very similar to step1, so I would just
point out the important differences. There is a defined flag called
*INPLACE* on top of the code. If this is defined then the code would
execute an inplace transform. Otherwise an outplace transform would be
performed.

Note that for the inplace transform, one needs to allocate enough memory
to accomodate the transform result. Although, the output will not be bigger
than the input (like in the R2C transform), but still its parallel distribution
may necessitate additional memory. So one needs to use *alloc_max* to allocate
enough memory, which
is returned by calling *accfft_local_size_dft_c2c* function:

1
2
3
4

/* Get the local pencil size and the allocation size */
int alloc_max=0;
int isize[3],osize[3],istart[3],ostart[3];
alloc_max=accfft_local_size_dft_c2c(n,isize,istart,osize,ostart,c_comm);

However, the initialize function is the same, since there is no padding involved in inplace C2C.

## Results

You can make and run the code as follows:

On my machine, I get an output like this:

## Complete CPU Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241

/*
* File: step3.cpp
* License: Please see LICENSE file.
* AccFFT: Massively Parallel FFT Library
* Created by Amir Gholami on 12/23/2014
* Email: contact@accfft.org
*/
#include <stdlib.h>
#include <math.h> // M_PI
#include <mpi.h>
#include <accfft.h>
//#define INPLACE // comment this line for outplace transform
void initialize(Complex *a, int*n, MPI_Comm c_comm);
void check_err(Complex* a, int*n, MPI_Comm c_comm);
void step3(int *n, int nthreads);
inline double testcase(double X, double Y, double Z) {
double sigma = 4;
double pi = M_PI;
double analytic;
analytic = std::exp(-sigma * ((X - pi) * (X - pi) + (Y - pi) * (Y - pi)
+ (Z - pi) * (Z - pi)));
if (analytic != analytic)
analytic = 0; /* Do you think the condition will be false always? */
return analytic;
} // end testcase
void step3(int *n, int nthreads) {
int nprocs, procid;
MPI_Comm_rank(MPI_COMM_WORLD, &procid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* Create Cartesian Communicator */
int c_dims[2] = { 0 };
MPI_Comm c_comm;
accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);
#ifdef INPLACE
Complex *data;
#else
Complex *data;
Complex *data_hat;
#endif
double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
int alloc_max = 0;
int isize[3], osize[3], istart[3], ostart[3];
/* Get the local pencil size and the allocation size */
alloc_max = accfft_local_size_dft_c2c(n, isize, istart, osize, ostart,
c_comm);
#ifdef INPLACE
data=(Complex*)accfft_alloc(alloc_max);
#else
data = (Complex*) accfft_alloc(
isize[0] * isize[1] * isize[2] * 2 * sizeof(double));
data_hat = (Complex*) accfft_alloc(alloc_max);
#endif
accfft_init(nthreads);
/* Create FFT plan */
setup_time = -MPI_Wtime();
#ifdef INPLACE
accfft_plan * plan=accfft_plan_dft_3d_c2c(n,data,data,c_comm,ACCFFT_MEASURE);
#else
accfft_plan * plan = accfft_plan_dft_3d_c2c(n, data, data_hat, c_comm,
ACCFFT_MEASURE);
#endif
setup_time += MPI_Wtime();
/* Initialize data */
initialize(data, n, c_comm);
MPI_Barrier(c_comm);
/* Perform forward FFT */
f_time -= MPI_Wtime();
#ifdef INPLACE
accfft_execute_c2c(plan,ACCFFT_FORWARD,data,data);
#else
accfft_execute_c2c(plan, ACCFFT_FORWARD, data, data_hat);
#endif
f_time += MPI_Wtime();
MPI_Barrier(c_comm);
/* Perform backward FFT */
#ifdef INPLACE
i_time-=MPI_Wtime();
accfft_execute_c2c(plan,ACCFFT_BACKWARD,data,data);
i_time+=MPI_Wtime();
#else
Complex * data2 = (Complex*) accfft_alloc(
isize[0] * isize[1] * isize[2] * 2 * sizeof(double));
i_time -= MPI_Wtime();
accfft_execute_c2c(plan, ACCFFT_BACKWARD, data_hat, data2);
i_time += MPI_Wtime();
#endif
/* Check Error */
#ifdef INPLACE
check_err(data,n,c_comm);
#else
check_err(data2, n, c_comm);
#endif
/* Compute some timings statistics */
double g_f_time, g_i_time, g_setup_time;
MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0,
MPI_COMM_WORLD);
#ifdef INPLACE
PCOUT<<"Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#else
PCOUT << "Timing for Outplace FFT of size " << n[0] << "*" << n[1] << "*"
<< n[2] << std::endl;
#endif
PCOUT << "Setup \t" << g_setup_time << std::endl;
PCOUT << "FFT \t" << g_f_time << std::endl;
PCOUT << "IFFT \t" << g_i_time << std::endl;
accfft_free(data);
#ifndef INPLACE
accfft_free(data_hat);
accfft_free(data2);
#endif
accfft_destroy_plan(plan);
accfft_cleanup();
MPI_Comm_free(&c_comm);
return;
} // end step3
int main(int argc, char **argv) {
int NX, NY, NZ;
MPI_Init(&argc, &argv);
int nprocs, procid;
MPI_Comm_rank(MPI_COMM_WORLD, &procid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* Parsing Inputs */
if (argc == 1) {
NX = 128;
NY = 128;
NZ = 128;
} else {
NX = atoi(argv[1]);
NY = atoi(argv[2]);
NZ = atoi(argv[3]);
}
int N[3] = { NX, NY, NZ };
int nthreads = 1;
step3(N, nthreads);
MPI_Finalize();
return 0;
} // end main
void initialize(Complex *a, int *n, MPI_Comm c_comm) {
double pi = M_PI;
int istart[3], isize[3], osize[3], ostart[3];
accfft_local_size_dft_c2c(n, isize, istart, osize, ostart, c_comm);
#pragma omp parallel
{
double X, Y, Z;
long int ptr;
#pragma omp for
for (int i = 0; i < isize[0]; i++) {
for (int j = 0; j < isize[1]; j++) {
for (int k = 0; k < isize[2]; k++) {
X = 2 * pi / n[0] * (i + istart[0]);
Y = 2 * pi / n[1] * (j + istart[1]);
Z = 2 * pi / n[2] * k;
ptr = i * isize[1] * n[2] + j * n[2] + k;
a[ptr][0] = testcase(X, Y, Z); // Real Component
a[ptr][1] = testcase(X, Y, Z); // Imag Component
}
}
}
}
return;
} // end initialize
void check_err(Complex* a, int*n, MPI_Comm c_comm) {
int nprocs, procid;
MPI_Comm_rank(c_comm, &procid);
MPI_Comm_size(c_comm, &nprocs);
long long int size = n[0];
size *= n[1];
size *= n[2];
double pi = 4 * atan(1.0);
int istart[3], isize[3], osize[3], ostart[3];
accfft_local_size_dft_c2c(n, isize, istart, osize, ostart, c_comm);
double err = 0, norm = 0;
double X, Y, Z, numerical_r, numerical_c;
long int ptr;
int thid = omp_get_thread_num();
for (int i = 0; i < isize[0]; i++) {
for (int j = 0; j < isize[1]; j++) {
for (int k = 0; k < isize[2]; k++) {
X = 2 * pi / n[0] * (i + istart[0]);
Y = 2 * pi / n[1] * (j + istart[1]);
Z = 2 * pi / n[2] * k;
ptr = i * isize[1] * n[2] + j * n[2] + k;
numerical_r = a[ptr][0] / size;
if (numerical_r != numerical_r)
numerical_r = 0;
numerical_c = a[ptr][1] / size;
if (numerical_c != numerical_c)
numerical_c = 0;
err += std::abs(numerical_r - testcase(X, Y, Z))
+ std::abs(numerical_c - testcase(X, Y, Z));
norm += std::abs(testcase(X, Y, Z));
}
}
}
double g_err = 0, g_norm = 0;
MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
PCOUT << "\nL1 Error of iFF(a)-a: " << g_err << std::endl;
PCOUT << "Relative L1 Error of iFF(a)-a: " << g_err / g_norm << std::endl;
if (g_err / g_norm < 1e-10)
PCOUT << "\nResults are CORRECT!\n\n";
else
PCOUT << "\nResults are NOT CORRECT!\n\n";
return;
} // end check_err

## Complete GPU Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

/*
* File: step3_gpu.cpp
* License: Please see LICENSE file.
* AccFFT: Massively Parallel FFT Library
* Email: contact@accfft.org
*/
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
#include <cuda_runtime_api.h>
//#include <accfft.h>
#include <accfft_gpu.h>
#define INPLACE // comment this line for outplace transform
void initialize(Complex *a, int*n, MPI_Comm c_comm);
void check_err(Complex* a, int*n, MPI_Comm c_comm);
void step3_gpu(int *n) {
int nprocs, procid;
MPI_Comm_rank(MPI_COMM_WORLD, &procid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* Create Cartesian Communicator */
int c_dims[2] = { 0 };
MPI_Comm c_comm;
accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);
Complex *data, *data_cpu;
Complex *data_hat;
double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
int alloc_max = 0;
int isize[3], osize[3], istart[3], ostart[3];
/* Get the local pencil size and the allocation size */
alloc_max = accfft_local_size_dft_c2c_gpu(n, isize, istart, osize, ostart,
c_comm);
#ifdef INPLACE
data_cpu = (Complex*) malloc(alloc_max);
cudaMalloc((void**) &data, alloc_max);
#else
data_cpu=(Complex*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(double));
cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(double));
cudaMalloc((void**) &data_hat, alloc_max);
#endif
//accfft_init(nthreads);
setup_time = -MPI_Wtime();
/* Create FFT plan */
#ifdef INPLACE
accfft_plan_gpu * plan = accfft_plan_dft_3d_c2c_gpu(n, data, data, c_comm,
ACCFFT_MEASURE);
#else
accfft_plan_gpu * plan=accfft_plan_dft_3d_c2c_gpu(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
setup_time += MPI_Wtime();
/* Warmup Runs */
#ifdef INPLACE
accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data, data);
accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data, data);
#else
accfft_execute_c2c_gpu(plan,ACCFFT_FORWARD,data,data_hat);
accfft_execute_c2c_gpu(plan,ACCFFT_FORWARD,data,data_hat);
#endif
/* Initialize data */
initialize(data_cpu, n, c_comm);
#ifdef INPLACE
cudaMemcpy(data, data_cpu, alloc_max, cudaMemcpyHostToDevice);
#else
cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(double), cudaMemcpyHostToDevice);
#endif
MPI_Barrier(c_comm);
/* Perform forward FFT */
f_time -= MPI_Wtime();
#ifdef INPLACE
accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data, data);
#else
accfft_execute_c2c_gpu(plan,ACCFFT_FORWARD,data,data_hat);
#endif
f_time += MPI_Wtime();
MPI_Barrier(c_comm);
#ifndef INPLACE
Complex *data2_cpu, *data2;
cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(double));
data2_cpu=(Complex*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(double));
#endif
/* Perform backward FFT */
i_time -= MPI_Wtime();
#ifdef INPLACE
accfft_execute_c2c_gpu(plan, ACCFFT_BACKWARD, data, data);
#else
accfft_execute_c2c_gpu(plan,ACCFFT_BACKWARD,data_hat,data2);
#endif
i_time += MPI_Wtime();
/* copy back results on CPU and check error*/
#ifdef INPLACE
cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);
check_err(data_cpu, n, c_comm);
#else
cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(double), cudaMemcpyDeviceToHost);
check_err(data2_cpu,n,c_comm);
#endif
/* Compute some timings statistics */
double g_f_time, g_i_time, g_setup_time;
MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0,
MPI_COMM_WORLD);
#ifdef INPLACE
PCOUT << "GPU Timing for Inplace FFT of size " << n[0] << "*" << n[1] << "*"
<< n[2] << std::endl;
#else
PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#endif
PCOUT << "Setup \t" << g_setup_time << std::endl;
PCOUT << "FFT \t" << g_f_time << std::endl;
PCOUT << "IFFT \t" << g_i_time << std::endl;
MPI_Barrier(c_comm);
cudaDeviceSynchronize();
free(data_cpu);
cudaFree(data);
#ifndef INPLACE
cudaFree(data_hat);
free(data2_cpu);
cudaFree(data2);
#endif
accfft_destroy_plan_gpu(plan);
accfft_cleanup_gpu();
MPI_Comm_free(&c_comm);
return;
} // end step3_gpu
inline double testcase(double X, double Y, double Z) {
double sigma = 4;
double pi = M_PI;
double analytic;
analytic = std::exp(-sigma * ((X - pi) * (X - pi) + (Y - pi) * (Y - pi)
+ (Z - pi) * (Z - pi)));
if (analytic != analytic)
analytic = 0; /* Do you think the condition will be false always? */
return analytic;
}
void initialize(Complex *a, int*n, MPI_Comm c_comm) {
double pi = 4 * atan(1.0);
int n_tuples = (n[2]);
int istart[3], isize[3];
int ostart[3], osize[3];
accfft_local_size_dft_c2c_gpu(n, isize, istart, osize, ostart, c_comm);
#pragma omp parallel num_threads(16)
{
double X, Y, Z;
long int ptr;
#pragma omp for
for (int i = 0; i < isize[0]; i++) {
for (int j = 0; j < isize[1]; j++) {
for (int k = 0; k < isize[2]; k++) {
X = 2 * pi / n[0] * (i + istart[0]);
Y = 2 * pi / n[1] * (j + istart[1]);
Z = 2 * pi / n[2] * k;
ptr = i * isize[1] * n_tuples + j * n_tuples + k;
a[ptr][0] = testcase(X, Y, Z); //(istart[0]+i)*n_tuples*n[1]+(istart[1]+j)*n_tuples+k+istart[2];//(istart[0]+i)*n[1]+istart[1]+j;//testcase(X,Y,Z);
a[ptr][1] = testcase(X, Y, Z); //(istart[0]+i)*n_tuples*n[1]+(istart[1]+j)*n_tuples+k+istart[2];//(istart[0]+i)*n[1]+istart[1]+j;//testcase(X,Y,Z);
//std::cout<<"("<<i<<","<<j<<","<<k<<") "<<a[k+j*NZ+i*NY*NZ]<<std::endl;
}
}
}
}
return;
}
void check_err(Complex* a, int*n, MPI_Comm c_comm) {
int nprocs, procid;
MPI_Comm_rank(c_comm, &procid);
MPI_Comm_size(c_comm, &nprocs);
long long int size = n[0];
size *= n[1];
size *= n[2];
double pi = 4 * atan(1.0);
int istart[3], isize[3], osize[3], ostart[3];
accfft_local_size_dft_c2c_gpu(n, isize, istart, osize, ostart, c_comm);
double err = 0, norm = 0;
double X, Y, Z, numerical_r, numerical_c;
long int ptr;
for (int i = 0; i < isize[0]; i++) {
for (int j = 0; j < isize[1]; j++) {
for (int k = 0; k < isize[2]; k++) {
X = 2 * pi / n[0] * (i + istart[0]);
Y = 2 * pi / n[1] * (j + istart[1]);
Z = 2 * pi / n[2] * k;
ptr = i * isize[1] * n[2] + j * n[2] + k;
numerical_r = a[ptr][0] / size;
if (numerical_r != numerical_r)
numerical_r = 0;
numerical_c = a[ptr][1] / size;
if (numerical_c != numerical_c)
numerical_c = 0;
err += std::abs(numerical_r - testcase(X, Y, Z))
+ std::abs(numerical_c - testcase(X, Y, Z));
norm += std::abs(testcase(X, Y, Z));
}
}
}
double g_err = 0, g_norm = 0;
MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
PCOUT << "\nL1 Error of iFF(a)-a: " << g_err << std::endl;
PCOUT << "Relative L1 Error of iFF(a)-a: " << g_err / g_norm << std::endl;
if (g_err / g_norm < 1e-10)
PCOUT << "\nResults are CORRECT!\n\n";
else
PCOUT << "\nResults are NOT CORRECT!\n\n";
return;
} // end check_err
int main(int argc, char **argv) {
int NX, NY, NZ;
MPI_Init(&argc, &argv);
int nprocs, procid;
MPI_Comm_rank(MPI_COMM_WORLD, &procid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* Parsing Inputs */
if (argc == 1) {
NX = 128;
NY = 128;
NZ = 128;
} else {
NX = atoi(argv[1]);
NY = atoi(argv[2]);
NZ = atoi(argv[3]);
}
int N[3] = { NX, NY, NZ };
step3_gpu(N);
MPI_Finalize();
return 0;
} // end main