Step 3

By Amir Gholami

Introduction

Step 3 is a short tutorial on computing complex-to-complex (C2C) transforms. Most of the code is the same as in the previous steps; the main difference is that you call the _c2c functions instead of _r2c. One nice property of C2C transforms is that in-place transforms need no special treatment, since there is no conjugate symmetry (and hence no padding) as in the R2C case. Both the CPU and GPU codes use a compile-time flag to switch between in-place and out-of-place transforms.
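
As a quick preview, here is the core out-of-place call sequence, condensed from the complete CPU code below. As in the previous steps, the backward transform is unnormalized, so check_err divides the result by the total number of grid points before comparing against the input:

  /* Condensed from the complete CPU code below (out-of-place variant) */
  accfft_plan * plan = accfft_plan_dft_3d_c2c(n, data, data_hat, c_comm, ACCFFT_MEASURE);
  accfft_execute_c2c(plan, ACCFFT_FORWARD, data, data_hat);   /* forward FFT */
  accfft_execute_c2c(plan, ACCFFT_BACKWARD, data_hat, data2); /* inverse FFT (unnormalized) */
  /* check_err divides each entry by n[0]*n[1]*n[2] to recover the input */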

Code Explanation

The code for this step is very similar to Step 1, so I will only point out the important differences. A flag called INPLACE is defined at the top of the code. If it is defined, the code performs an in-place transform; otherwise an out-of-place transform is performed.
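
For reference, the flag simply selects which output buffer is handed to the planner; condensed from the complete CPU code below, the plan creation looks like this:

  #ifdef INPLACE
    /* in-place: the transform is written back into data */
    accfft_plan * plan = accfft_plan_dft_3d_c2c(n, data, data, c_comm, ACCFFT_MEASURE);
  #else
    /* out-of-place: the transform is written to a separate buffer data_hat */
    accfft_plan * plan = accfft_plan_dft_3d_c2c(n, data, data_hat, c_comm, ACCFFT_MEASURE);
  #endif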

Note that for the in-place transform, one needs to allocate enough memory to accommodate the transform result. Unlike the R2C transform, the output is not larger than the input, but its parallel distribution may still require additional memory. So one should allocate alloc_max bytes, the value returned by accfft_local_size_dft_c2c:

  /* Get the local pencil size and the allocation size */
  int alloc_max=0;
  int isize[3],osize[3],istart[3],ostart[3];
  alloc_max=accfft_local_size_dft_c2c(n,isize,istart,osize,ostart,c_comm);

The initialize function, however, is the same as before, since in-place C2C involves no padding.
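
The allocation itself is where the two modes differ (again condensed from the complete CPU code below): the in-place buffer must be alloc_max bytes, while in the out-of-place case only the output buffer needs that size.

  #ifdef INPLACE
    data = (Complex*) accfft_alloc(alloc_max); /* one buffer, sized for both input and output layouts */
  #else
    data     = (Complex*) accfft_alloc(isize[0] * isize[1] * isize[2] * 2 * sizeof(double)); /* input */
    data_hat = (Complex*) accfft_alloc(alloc_max);                                           /* output */
  #endif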


Results

You can make and run the code as follows:

make
ibrun/mpirun -np 4 step3 128 128 128

On my machine, I get an output like this:

ERROR c_dims!=nprocs --> 0*0 !=4
Switching to c_dims_0=2 , c_dims_1=2

L1 Error of iFF(a)-a: 1.33786e-11
Relative L1 Error of iFF(a)-a: 2.27345e-15

Results are CORRECT!

Timing for Outplace FFT of size 128*128*128
Setup   0.161251
FFT     0.0186059
IFFT    0.019779

Complete CPU Code

/*
 * File: step3.cpp
 * License: Please see LICENSE file.
 * AccFFT: Massively Parallel FFT Library
 * Created by Amir Gholami on 12/23/2014
 * Email: contact@accfft.org
 */

#include <stdlib.h>
#include <math.h> // M_PI
#include <mpi.h>
#include <accfft.h>
//#define INPLACE // uncomment this line for an inplace transform

void initialize(Complex *a, int*n, MPI_Comm c_comm);
void check_err(Complex* a, int*n, MPI_Comm c_comm);
void step3(int *n, int nthreads);

inline double testcase(double X, double Y, double Z) {

	double sigma = 4;
	double pi = M_PI;
	double analytic;
	analytic = std::exp(-sigma * ((X - pi) * (X - pi) + (Y - pi) * (Y - pi)
							+ (Z - pi) * (Z - pi)));
	if (analytic != analytic)
		analytic = 0; /* NaN check: analytic != analytic is true only for NaN */
	return analytic;
} // end testcase

void step3(int *n, int nthreads) {
	int nprocs, procid;
	MPI_Comm_rank(MPI_COMM_WORLD, &procid);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

	/* Create Cartesian Communicator */
	int c_dims[2] = { 0 };
	MPI_Comm c_comm;
	accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

#ifdef INPLACE
	Complex *data;
#else
	Complex *data;
	Complex *data_hat;
#endif
	double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
	int alloc_max = 0;

	int isize[3], osize[3], istart[3], ostart[3];
	/* Get the local pencil size and the allocation size */
	alloc_max = accfft_local_size_dft_c2c(n, isize, istart, osize, ostart,
			c_comm);

#ifdef INPLACE
	data=(Complex*)accfft_alloc(alloc_max);
#else
	data = (Complex*) accfft_alloc(
			isize[0] * isize[1] * isize[2] * 2 * sizeof(double));
	data_hat = (Complex*) accfft_alloc(alloc_max);
#endif

	accfft_init(nthreads);

	/* Create FFT plan */
	setup_time = -MPI_Wtime();
#ifdef INPLACE
	accfft_plan * plan=accfft_plan_dft_3d_c2c(n,data,data,c_comm,ACCFFT_MEASURE);
#else
	accfft_plan * plan = accfft_plan_dft_3d_c2c(n, data, data_hat, c_comm,
			ACCFFT_MEASURE);
#endif
	setup_time += MPI_Wtime();

	/*  Initialize data */
	initialize(data, n, c_comm);
	MPI_Barrier(c_comm);

	/* Perform forward FFT */
	f_time -= MPI_Wtime();
#ifdef INPLACE
	accfft_execute_c2c(plan,ACCFFT_FORWARD,data,data);
#else
	accfft_execute_c2c(plan, ACCFFT_FORWARD, data, data_hat);
#endif
	f_time += MPI_Wtime();

	MPI_Barrier(c_comm);

	/* Perform backward FFT */
#ifdef INPLACE
	i_time-=MPI_Wtime();
	accfft_execute_c2c(plan,ACCFFT_BACKWARD,data,data);
	i_time+=MPI_Wtime();
#else
	Complex * data2 = (Complex*) accfft_alloc(
			isize[0] * isize[1] * isize[2] * 2 * sizeof(double));
	i_time -= MPI_Wtime();
	accfft_execute_c2c(plan, ACCFFT_BACKWARD, data_hat, data2);
	i_time += MPI_Wtime();
#endif

	/* Check Error */
#ifdef INPLACE
	check_err(data,n,c_comm);
#else
	check_err(data2, n, c_comm);
#endif

	/* Compute some timings statistics */
	double g_f_time, g_i_time, g_setup_time;
	MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0,
			MPI_COMM_WORLD);

#ifdef INPLACE
	PCOUT<<"Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#else
	PCOUT << "Timing for Outplace FFT of size " << n[0] << "*" << n[1] << "*"
			<< n[2] << std::endl;
#endif
	PCOUT << "Setup \t" << g_setup_time << std::endl;
	PCOUT << "FFT \t" << g_f_time << std::endl;
	PCOUT << "IFFT \t" << g_i_time << std::endl;

	accfft_free(data);
#ifndef INPLACE
	accfft_free(data_hat);
	accfft_free(data2);
#endif
	accfft_destroy_plan(plan);
	accfft_cleanup();
	MPI_Comm_free(&c_comm);
	return;

} // end step3

int main(int argc, char **argv) {

	int NX, NY, NZ;
	MPI_Init(&argc, &argv);
	int nprocs, procid;
	MPI_Comm_rank(MPI_COMM_WORLD, &procid);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

	/* Parsing Inputs  */
	if (argc == 1) {
		NX = 128;
		NY = 128;
		NZ = 128;
	} else {
		NX = atoi(argv[1]);
		NY = atoi(argv[2]);
		NZ = atoi(argv[3]);
	}
	int N[3] = { NX, NY, NZ };

	int nthreads = 1;
	step3(N, nthreads);

	MPI_Finalize();
	return 0;
} // end main

void initialize(Complex *a, int *n, MPI_Comm c_comm) {
	double pi = M_PI;
	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_c2c(n, isize, istart, osize, ostart, c_comm);

#pragma omp parallel
	{
		double X, Y, Z;
		long int ptr;
#pragma omp for
		for (int i = 0; i < isize[0]; i++) {
			for (int j = 0; j < isize[1]; j++) {
				for (int k = 0; k < isize[2]; k++) {
					X = 2 * pi / n[0] * (i + istart[0]);
					Y = 2 * pi / n[1] * (j + istart[1]);
					Z = 2 * pi / n[2] * k;
					ptr = i * isize[1] * n[2] + j * n[2] + k;
					a[ptr][0] = testcase(X, Y, Z); // Real Component
					a[ptr][1] = testcase(X, Y, Z); // Imag Component
				}
			}
		}
	}
	return;
} // end initialize

void check_err(Complex* a, int*n, MPI_Comm c_comm) {
	int nprocs, procid;
	MPI_Comm_rank(c_comm, &procid);
	MPI_Comm_size(c_comm, &nprocs);
	long long int size = n[0];
	size *= n[1];
	size *= n[2];
	double pi = 4 * atan(1.0);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_c2c(n, isize, istart, osize, ostart, c_comm);

	double err = 0, norm = 0;

	double X, Y, Z, numerical_r, numerical_c;
	long int ptr;
	for (int i = 0; i < isize[0]; i++) {
		for (int j = 0; j < isize[1]; j++) {
			for (int k = 0; k < isize[2]; k++) {
				X = 2 * pi / n[0] * (i + istart[0]);
				Y = 2 * pi / n[1] * (j + istart[1]);
				Z = 2 * pi / n[2] * k;
				ptr = i * isize[1] * n[2] + j * n[2] + k;
				numerical_r = a[ptr][0] / size;
				if (numerical_r != numerical_r)
					numerical_r = 0;
				numerical_c = a[ptr][1] / size;
				if (numerical_c != numerical_c)
					numerical_c = 0;
				err += std::abs(numerical_r - testcase(X, Y, Z))
						+ std::abs(numerical_c - testcase(X, Y, Z));
				norm += std::abs(testcase(X, Y, Z));

			}
		}
	}

	double g_err = 0, g_norm = 0;
	MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	PCOUT << "\nL1 Error of iFF(a)-a: " << g_err << std::endl;
	PCOUT << "Relative L1 Error of iFF(a)-a: " << g_err / g_norm << std::endl;
	if (g_err / g_norm < 1e-10)
		PCOUT << "\nResults are CORRECT!\n\n";
	else
		PCOUT << "\nResults are NOT CORRECT!\n\n";

	return;
} // end check_err

Complete GPU Code

/*
 * File: step3_gpu.cpp
 * License: Please see LICENSE file.
 * AccFFT: Massively Parallel FFT Library
 * Email: contact@accfft.org
 */
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
#include <cuda_runtime_api.h>
//#include <accfft.h>
#include <accfft_gpu.h>
#define INPLACE // comment this line for outplace transform

void initialize(Complex *a, int*n, MPI_Comm c_comm);
void check_err(Complex* a, int*n, MPI_Comm c_comm);

void step3_gpu(int *n) {

	int nprocs, procid;
	MPI_Comm_rank(MPI_COMM_WORLD, &procid);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

	/* Create Cartesian Communicator */
	int c_dims[2] = { 0 };
	MPI_Comm c_comm;
	accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

	Complex *data, *data_cpu;
	Complex *data_hat;
	double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
	int alloc_max = 0;

	int isize[3], osize[3], istart[3], ostart[3];
	/* Get the local pencil size and the allocation size */
	alloc_max = accfft_local_size_dft_c2c_gpu(n, isize, istart, osize, ostart,
			c_comm);

#ifdef INPLACE
	data_cpu = (Complex*) malloc(alloc_max);
	cudaMalloc((void**) &data, alloc_max);
#else
	data_cpu=(Complex*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(double));
	cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(double));
	cudaMalloc((void**) &data_hat, alloc_max);
#endif

	//accfft_init(nthreads);
	setup_time = -MPI_Wtime();

	/* Create FFT plan */
#ifdef INPLACE
	accfft_plan_gpu * plan = accfft_plan_dft_3d_c2c_gpu(n, data, data, c_comm,
			ACCFFT_MEASURE);
#else
	accfft_plan_gpu * plan=accfft_plan_dft_3d_c2c_gpu(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
	setup_time += MPI_Wtime();

	/* Warmup Runs */
#ifdef INPLACE
	accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data, data);
	accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data, data);
#else
	accfft_execute_c2c_gpu(plan,ACCFFT_FORWARD,data,data_hat);
	accfft_execute_c2c_gpu(plan,ACCFFT_FORWARD,data,data_hat);
#endif

	/*  Initialize data */
	initialize(data_cpu, n, c_comm);
#ifdef INPLACE
	cudaMemcpy(data, data_cpu, alloc_max, cudaMemcpyHostToDevice);
#else
	cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(double), cudaMemcpyHostToDevice);
#endif

	MPI_Barrier(c_comm);

	/* Perform forward FFT */
	f_time -= MPI_Wtime();
#ifdef INPLACE
	accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data, data);
#else
	accfft_execute_c2c_gpu(plan,ACCFFT_FORWARD,data,data_hat);
#endif
	f_time += MPI_Wtime();

	MPI_Barrier(c_comm);

#ifndef INPLACE
	Complex *data2_cpu, *data2;
	cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(double));
	data2_cpu=(Complex*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(double));
#endif

	/* Perform backward FFT */
	i_time -= MPI_Wtime();
#ifdef INPLACE
	accfft_execute_c2c_gpu(plan, ACCFFT_BACKWARD, data, data);
#else
	accfft_execute_c2c_gpu(plan,ACCFFT_BACKWARD,data_hat,data2);
#endif
	i_time += MPI_Wtime();

	/* copy back results on CPU and check error*/
#ifdef INPLACE
	cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);
	check_err(data_cpu, n, c_comm);
#else
	cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(double), cudaMemcpyDeviceToHost);
	check_err(data2_cpu,n,c_comm);
#endif

	/* Compute some timings statistics */
	double g_f_time, g_i_time, g_setup_time;
	MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0,
			MPI_COMM_WORLD);

#ifdef INPLACE
	PCOUT << "GPU Timing for Inplace FFT of size " << n[0] << "*" << n[1] << "*"
			<< n[2] << std::endl;
#else
	PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#endif
	PCOUT << "Setup \t" << g_setup_time << std::endl;
	PCOUT << "FFT \t" << g_f_time << std::endl;
	PCOUT << "IFFT \t" << g_i_time << std::endl;

	MPI_Barrier(c_comm);
	cudaDeviceSynchronize();
	free(data_cpu);
	cudaFree(data);
#ifndef INPLACE
	cudaFree(data_hat);
	free(data2_cpu);
	cudaFree(data2);
#endif
	accfft_destroy_plan_gpu(plan);
	accfft_cleanup_gpu();
	MPI_Comm_free(&c_comm);
	return;

} // end step3_gpu

inline double testcase(double X, double Y, double Z) {

	double sigma = 4;
	double pi = M_PI;
	double analytic;
	analytic = std::exp(-sigma * ((X - pi) * (X - pi) + (Y - pi) * (Y - pi)
							+ (Z - pi) * (Z - pi)));
	if (analytic != analytic)
		analytic = 0; /* NaN check: analytic != analytic is true only for NaN */
	return analytic;
}

void initialize(Complex *a, int*n, MPI_Comm c_comm) {
	double pi = 4 * atan(1.0);
	int n_tuples = (n[2]);
	int istart[3], isize[3];
	int ostart[3], osize[3];
	accfft_local_size_dft_c2c_gpu(n, isize, istart, osize, ostart, c_comm);
#pragma omp parallel num_threads(16)
	{
		double X, Y, Z;
		long int ptr;
#pragma omp for
		for (int i = 0; i < isize[0]; i++) {
			for (int j = 0; j < isize[1]; j++) {
				for (int k = 0; k < isize[2]; k++) {
					X = 2 * pi / n[0] * (i + istart[0]);
					Y = 2 * pi / n[1] * (j + istart[1]);
					Z = 2 * pi / n[2] * k;
					ptr = i * isize[1] * n_tuples + j * n_tuples + k;
					a[ptr][0] = testcase(X, Y, Z); //(istart[0]+i)*n_tuples*n[1]+(istart[1]+j)*n_tuples+k+istart[2];//(istart[0]+i)*n[1]+istart[1]+j;//testcase(X,Y,Z);
					a[ptr][1] = testcase(X, Y, Z); //(istart[0]+i)*n_tuples*n[1]+(istart[1]+j)*n_tuples+k+istart[2];//(istart[0]+i)*n[1]+istart[1]+j;//testcase(X,Y,Z);
					//std::cout<<"("<<i<<","<<j<<","<<k<<")  "<<a[k+j*NZ+i*NY*NZ]<<std::endl;
				}
			}
		}
	}
	return;
}

void check_err(Complex* a, int*n, MPI_Comm c_comm) {
	int nprocs, procid;
	MPI_Comm_rank(c_comm, &procid);
	MPI_Comm_size(c_comm, &nprocs);
	long long int size = n[0];
	size *= n[1];
	size *= n[2];
	double pi = 4 * atan(1.0);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_c2c_gpu(n, isize, istart, osize, ostart, c_comm);

	double err = 0, norm = 0;

	double X, Y, Z, numerical_r, numerical_c;
	long int ptr;
	for (int i = 0; i < isize[0]; i++) {
		for (int j = 0; j < isize[1]; j++) {
			for (int k = 0; k < isize[2]; k++) {
				X = 2 * pi / n[0] * (i + istart[0]);
				Y = 2 * pi / n[1] * (j + istart[1]);
				Z = 2 * pi / n[2] * k;
				ptr = i * isize[1] * n[2] + j * n[2] + k;
				numerical_r = a[ptr][0] / size;
				if (numerical_r != numerical_r)
					numerical_r = 0;
				numerical_c = a[ptr][1] / size;
				if (numerical_c != numerical_c)
					numerical_c = 0;
				err += std::abs(numerical_r - testcase(X, Y, Z))
						+ std::abs(numerical_c - testcase(X, Y, Z));
				norm += std::abs(testcase(X, Y, Z));
			}
		}
	}

	double g_err = 0, g_norm = 0;
	MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	PCOUT << "\nL1 Error of iFF(a)-a: " << g_err << std::endl;
	PCOUT << "Relative L1 Error of iFF(a)-a: " << g_err / g_norm << std::endl;
	if (g_err / g_norm < 1e-10)
		PCOUT << "\nResults are CORRECT!\n\n";
	else
		PCOUT << "\nResults are NOT CORRECT!\n\n";

	return;
} // end check_err
int main(int argc, char **argv) {

	int NX, NY, NZ;
	MPI_Init(&argc, &argv);
	int nprocs, procid;
	MPI_Comm_rank(MPI_COMM_WORLD, &procid);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

	/* Parsing Inputs  */
	if (argc == 1) {
		NX = 128;
		NY = 128;
		NZ = 128;
	} else {
		NX = atoi(argv[1]);
		NY = atoi(argv[2]);
		NZ = atoi(argv[3]);
	}
	int N[3] = { NX, NY, NZ };

	step3_gpu(N);

	MPI_Finalize();
	return 0;
} // end main