Exemplo n.º 1
0
void step1_gpu(int *n) {

	int nprocs, procid;
	MPI_Comm_rank(MPI_COMM_WORLD, &procid);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

	/* Create Cartesian Communicator */
	int c_dims[2] = { 0 };
	MPI_Comm c_comm;
	accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

	double *data, *data_cpu;
	Complex *data_hat;
	double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
	int alloc_max = 0;

	int isize[3], osize[3], istart[3], ostart[3];
	/* Get the local pencil size and the allocation size */
	alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart,
			c_comm);

	data_cpu = (double*) malloc(
			isize[0] * isize[1] * isize[2] * sizeof(double));
	//data_hat=(Complex*)accfft_alloc(alloc_max);
	cudaMalloc((void**) &data, isize[0] * isize[1] * isize[2] * sizeof(double));
	cudaMalloc((void**) &data_hat, alloc_max);

	//accfft_init(nthreads);

	/* Create FFT plan */
	setup_time = -MPI_Wtime();
	accfft_plan_gpu * plan = accfft_plan_dft_3d_r2c_gpu(n, data,
			(double*) data_hat, c_comm, ACCFFT_MEASURE);
	setup_time += MPI_Wtime();

	/* Warm Up */
	accfft_execute_r2c_gpu(plan, data, data_hat);
	accfft_execute_r2c_gpu(plan, data, data_hat);

	/*  Initialize data */
	initialize(data_cpu, n, c_comm);
	cudaMemcpy(data, data_cpu, isize[0] * isize[1] * isize[2] * sizeof(double),
			cudaMemcpyHostToDevice);
	// initialize_gpu(data,n,isize,istart); // GPU version of initialize function

	MPI_Barrier(c_comm);

	/* Perform forward FFT */
	f_time -= MPI_Wtime();
	accfft_execute_r2c_gpu(plan, data, data_hat);
	f_time += MPI_Wtime();

	MPI_Barrier(c_comm);

	double *data2_cpu, *data2;
	cudaMalloc((void**) &data2,
			isize[0] * isize[1] * isize[2] * sizeof(double));
	data2_cpu = (double*) malloc(
			isize[0] * isize[1] * isize[2] * sizeof(double));

	/* Perform backward FFT */
	i_time -= MPI_Wtime();
	accfft_execute_c2r_gpu(plan, data_hat, data2);
	i_time += MPI_Wtime();

	/* copy back results on CPU */
	cudaMemcpy(data2_cpu, data2,
			isize[0] * isize[1] * isize[2] * sizeof(double),
			cudaMemcpyDeviceToHost);

	/* Check Error */
	double err = 0, g_err = 0;
	double norm = 0, g_norm = 0;
	for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i) {
		err += std::abs(data2_cpu[i] / n[0] / n[1] / n[2] - data_cpu[i]);
		norm += std::abs(data_cpu[i]);
	}
	MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

	PCOUT << "\nL1 Error is " << g_err << std::endl;
	PCOUT << "Relative L1 Error is " << g_err / g_norm << std::endl;
	if (g_err / g_norm < 1e-10)
		PCOUT << "\nResults are CORRECT!\n\n";
	else
		PCOUT << "\nResults are NOT CORRECT!\n\n";

	check_err(data2_cpu, n, c_comm);

	/* Compute some timings statistics */
	double g_f_time, g_i_time, g_setup_time;
	MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0,
			MPI_COMM_WORLD);

	PCOUT << "GPU Timing for FFT of size " << n[0] << "*" << n[1] << "*" << n[2]
			<< std::endl;
	PCOUT << "Setup \t" << g_setup_time << std::endl;
	PCOUT << "FFT \t" << g_f_time << std::endl;
	PCOUT << "IFFT \t" << g_i_time << std::endl;

	free(data_cpu);
	free(data2_cpu);
	cudaFree(data);
	cudaFree(data_hat);
	cudaFree(data2);
	accfft_destroy_plan_gpu(plan);
	accfft_cleanup_gpu();
	MPI_Comm_free(&c_comm);
	return;

} // end step1_gpu
Exemplo n.º 2
0
void accfft_execute_r2c_gpu_t(accfft_plan_gpu* plan, T* data,Tc* data_out, double * timer,std::bitset<3> XYZ){
  accfft_execute_r2c_gpu(plan,data,data_out,timer,XYZ);
  return;
}