int dfft_cuda_create_plan(dfft_plan *p,
    int ndim, int *gdim, int *inembed, int *oembed,
    int *pdim, int *pidx, int row_m,
    int input_cyclic, int output_cyclic,
    MPI_Comm comm, int *proc_map)
    {
    int res = dfft_create_plan_common(p, ndim, gdim, inembed, oembed,
        pdim, pidx, row_m, input_cyclic, output_cyclic, comm, proc_map, 1);

#ifndef ENABLE_MPI_CUDA
    /* allocate staging buffers */
    /* we need to use posix_memalign/cudaHostRegister instead
     * of cudaHostAlloc, because cudaHostAlloc doesn't have hooks
     * in the MPI library, and using it would lead to data corruption */
    int size = p->scratch_size*sizeof(cuda_cpx_t);
    int page_size = getpagesize();
    /* round the allocation up to a whole number of pages */
    size = ((size + page_size - 1) / page_size) * page_size;
    posix_memalign((void **)&(p->h_stage_in), page_size, size);
    posix_memalign((void **)&(p->h_stage_out), page_size, size);
    cudaHostRegister(p->h_stage_in, size, cudaHostRegisterDefault);
    CHECK_CUDA();
    cudaHostRegister(p->h_stage_out, size, cudaHostRegisterDefault);
    CHECK_CUDA();
#endif

    /* allocate device memory for passing variables */
    cudaMalloc((void **)&(p->d_pidx), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_pdim), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_iembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_oembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_length), sizeof(int)*ndim);
    CHECK_CUDA();

    /* initialize cuda buffers */
    int *h_length = (int *)malloc(sizeof(int)*ndim);
    int i;
    for (i = 0; i < ndim; ++i)
        h_length[i] = gdim[i]/pdim[i];   /* local grid extent per dimension */

    cudaMemcpy(p->d_pidx, pidx, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_pdim, pdim, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_iembed, p->inembed, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_oembed, p->oembed, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_length, h_length, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    free(h_length);

    int dmax = p->max_depth + 2;
    p->d_rev_j1 = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_global = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_partial = (int **) malloc(sizeof(int *)*dmax);
    p->d_c0 = (int **) malloc(sizeof(int *)*dmax);
    p->d_c1 = (int **) malloc(sizeof(int *)*dmax);

    if (p->max_depth)
        {
        p->h_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        p->d_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        }

    int d;
    for (d = 0; d < dmax; ++d)
        {
        cudaMalloc((void **)&(p->d_rev_j1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_partial[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_global[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c0[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        }

    for (d = 0; d < p->max_depth; ++d)
        {
        cudaMalloc((void **)&(p->d_alpha[d]), sizeof(cuda_scalar_t)*ndim);
        CHECK_CUDA();
        p->h_alpha[d] = (cuda_scalar_t *) malloc(sizeof(cuda_scalar_t)*ndim);
        }

    /* perform initialization run */
    dfft_cuda_execute(NULL, NULL, 0, p);

    /* initialization finished */
    p->init = 0;

    return res;
    }
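/* A minimal usage sketch of the plan API above (illustration only, not part
 * of the library): create a plan, run a forward and an inverse transform on
 * device buffers, and destroy the plan. It assumes a single-rank 1D setup
 * and a caller-provided proc_map; error handling is omitted for brevity. */
static void example_plan_usage(int n, int *proc_map)
    {
    int dim[1] = {n};    /* global grid dimension */
    int pdim[1] = {1};   /* trivial 1-rank processor grid */
    int pidx[1] = {0};   /* this rank's grid coordinate */

    dfft_plan plan;
    dfft_cuda_create_plan(&plan, 1, dim, NULL, NULL, pdim, pidx,
        0, 0, 0, MPI_COMM_WORLD, proc_map);

    cuda_cpx_t *d_in, *d_out;
    cudaMalloc((void **)&d_in, sizeof(cuda_cpx_t)*n);
    cudaMalloc((void **)&d_out, sizeof(cuda_cpx_t)*n);

    /* dir = 0 is the forward transform, dir = 1 the inverse */
    dfft_cuda_execute(d_in, d_out, 0, &plan);
    dfft_cuda_execute(d_out, d_in, 1, &plan);

    cudaFree(d_out);
    cudaFree(d_in);
    dfft_cuda_destroy_plan(plan);
    }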
/* Compare a 3d FFT against a reference FFT */
void test_distributed_fft_3d_compare()
    {
    int s,p;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &s);

    int nd = 3;
    int *pdim;
    pdim = (int *) malloc(sizeof(int)*nd);

    /* choose a decomposition */
    int r = powf(p,1.0/(double)nd)+0.5;
    int root = 1;
    while (root < r) root *= 2;
    int ptot = 1;
    if (!s) printf("Processor grid: ");
    int i;
    for (i = 0; i < nd-1; ++i)
        {
        pdim[i] = (((ptot*root) > p) ? 1 : root);
        ptot *= pdim[i];
        if (!s) printf("%d x ",pdim[i]);
        }
    pdim[nd-1] = p/ptot;
    if (!s) printf("%d\n", pdim[nd-1]);

    /* determine processor index */
    int *pidx;
    pidx = (int*)malloc(nd*sizeof(int));
    int idx = s;
    for (i = nd-1; i >= 0; --i)
        {
        pidx[i] = idx % pdim[i];
        idx /= pdim[i];
        }

    int *dim_glob;
    dim_glob = (int *) malloc(sizeof(int)*nd);

    /* do a pdim[0]*4 x pdim[1]*8 x pdim[2]*16 FFT (powers of two) */
    int local_nx = 4;
    int local_ny = 8;
    int local_nz = 16;
    dim_glob[0] = pdim[0]*local_nx;
    dim_glob[1] = pdim[1]*local_ny;
    dim_glob[2] = pdim[2]*local_nz;

    for (i = 0; i < nd-1; ++i)
        if (!s) printf("%d x ",dim_glob[i]);
    if (!s) printf("%d matrix\n", dim_glob[nd-1]);

    float scale = dim_glob[0]*dim_glob[1]*dim_glob[2];
    /* assume 0.5 sig digits lost per addition/twiddle (empirical) */
    float sig_digits = 7.0-0.5*logf(scale)/logf(2.0);
    double tol = powf(10.0,-sig_digits);
    double abs_tol = 1.0*tol;
    printf("Testing with %f sig digits, rel precision %f, abs precision %f\n",
        sig_digits, tol, abs_tol);

    kiss_fft_cpx *in_kiss;
    in_kiss = (kiss_fft_cpx *)malloc(sizeof(kiss_fft_cpx)*dim_glob[0]*dim_glob[1]*dim_glob[2]);
    srand(12345);

    /* fill the table with complex random numbers in row-major order */
    int x,y,z;
    int nx = dim_glob[0];
    int ny = dim_glob[1];
    int nz = dim_glob[2];
    for (x = 0; x < dim_glob[0]; ++x)
        for (y = 0; y < dim_glob[1]; ++y)
            for (z = 0; z < dim_glob[2]; ++z)
                {
                /* kiss_fftnd expects row-major storage (z varies fastest) */
                in_kiss[z+nz*(y+ny*x)].r = (float)rand()/(float)RAND_MAX;
                in_kiss[z+nz*(y+ny*x)].i = (float)rand()/(float)RAND_MAX;
                }

    kiss_fft_cpx *out_kiss;
    out_kiss = (kiss_fft_cpx *)malloc(sizeof(kiss_fft_cpx)*dim_glob[0]*dim_glob[1]*dim_glob[2]);

    /* construct the forward transform */
    kiss_fftnd_cfg cfg = kiss_fftnd_alloc(dim_glob,3,0,NULL,NULL);

    /* carry out the conventional FFT */
    kiss_fftnd(cfg, in_kiss, out_kiss);

    /* compare to the distributed FFT */
    cuda_cpx_t *in_d, *in_h;
    cudaMalloc((void **)&in_d,sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz);
    in_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz);

    /* scatter the global input into this rank's local block */
    int x_local, y_local, z_local;
    for (x = 0; x < nx; ++x)
        for (y = 0; y < ny; ++y)
            for (z = 0; z < nz; ++z)
                {
                if (x >= pidx[0]*local_nx && x < (pidx[0]+1)*local_nx &&
                    y >= pidx[1]*local_ny && y < (pidx[1]+1)*local_ny &&
                    z >= pidx[2]*local_nz && z < (pidx[2]+1)*local_nz)
                    {
                    x_local = x - pidx[0]*local_nx;
                    y_local = y - pidx[1]*local_ny;
                    z_local = z - pidx[2]*local_nz;
                    CUDA_RE(in_h[z_local+local_nz*(y_local+local_ny*x_local)]) = in_kiss[z+nz*(y+ny*x)].r;
                    CUDA_IM(in_h[z_local+local_nz*(y_local+local_ny*x_local)]) = in_kiss[z+nz*(y+ny*x)].i;
                    }
                }

    cuda_cpx_t *out_d, *out_h;
    cudaMalloc((void **)&out_d,sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz);
    out_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz);

    dfft_plan plan;
    dfft_cuda_create_plan(&plan, 3, dim_glob, NULL, NULL, pdim, pidx,
        0, 0, 0, MPI_COMM_WORLD, proc_map);
    dfft_cuda_check_errors(&plan,1);

    /* copy data to device */
    cudaMemcpy(in_d, in_h, sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz, cudaMemcpyDefault);

    /* forward transform */
    dfft_cuda_execute(in_d, out_d, 0, &plan);

    /* copy data back to host */
    cudaMemcpy(out_h, out_d, sizeof(cuda_cpx_t)*local_nx*local_ny*local_nz, cudaMemcpyDefault);

    /* do comparison */
    int n_wave_local = local_nx * local_ny * local_nz;
    for (i = 0; i < n_wave_local; ++i)
        {
        /* decompose the row-major local index into coordinates */
        x_local = i / local_ny / local_nz;
        y_local = (i - x_local*local_ny*local_nz)/local_nz;
        z_local = i % local_nz;
        x = pidx[0]*local_nx + x_local;
        y = pidx[1]*local_ny + y_local;
        z = pidx[2]*local_nz + z_local;

        double re = CUDA_RE(out_h[i]);
        double im = CUDA_IM(out_h[i]);
        double re_kiss = out_kiss[z+nz*(y+ny*x)].r;
        double im_kiss = out_kiss[z+nz*(y+ny*x)].i;

        if (fabs(re_kiss) < abs_tol)
            {
            CHECK_SMALL(re, 2*abs_tol);
            }
        else
            {
            CHECK_CLOSE(re, re_kiss, tol);
            }
        if (fabs(im_kiss) < abs_tol)
            {
            CHECK_SMALL(im, 2*abs_tol);
            }
        else
            {
            CHECK_CLOSE(im, im_kiss, tol);
            }
        }

    free(cfg);
    free(in_kiss);
    free(out_kiss);
    cudaFree(out_d);
    cudaFree(in_d);
    free(in_h);
    free(out_h);
    free(pidx);
    free(pdim);
    free(dim_glob);
    dfft_cuda_destroy_plan(plan);
    }
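/* The tolerance pattern used in the comparisons above, factored into a
 * helper for clarity. This is an illustrative sketch (the tests in this file
 * inline the pattern): when the reference value is indistinguishable from
 * zero at the absolute tolerance, only an absolute check is meaningful;
 * otherwise a relative comparison is used. CHECK_SMALL and CHECK_CLOSE are
 * the harness macros used throughout this file. */
static void check_component(double val, double ref, double tol, double abs_tol)
    {
    if (fabs(ref) < abs_tol)
        CHECK_SMALL(val, 2*abs_tol);   /* absolute check near zero */
    else
        CHECK_CLOSE(val, ref, tol);    /* relative check otherwise */
    }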
/* Basic functionality test for a distributed n-dimensional FFT.
   This tests the integrity of a forward transform, followed by an
   inverse transform. */
void test_distributed_fft_nd(int nd)
    {
    int p,s;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &s);

    int *pdim;
    pdim = (int *) malloc(sizeof(int)*nd);

    /* choose a decomposition */
    int r = powf(p,1.0/(double)nd)+0.5;
    int root = 1;
    while (root < r) root *= 2;
    int ptot = 1;
    if (!s) printf("Processor grid: ");
    int i;
    for (i = 0; i < nd-1; ++i)
        {
        pdim[i] = (((ptot*root) > p) ? 1 : root);
        ptot *= pdim[i];
        if (!s) printf("%d x ",pdim[i]);
        }
    pdim[nd-1] = p/ptot;
    if (!s) printf("%d\n", pdim[nd-1]);

    /* determine processor index */
    int *pidx;
    pidx = (int*)malloc(nd*sizeof(int));
    int idx = s;
    for (i = nd-1; i >= 0; --i)
        {
        pidx[i] = idx % pdim[i];
        idx /= pdim[i];
        }

    double tol = 0.001;
    double abs_tol = 0.2;

    int *dim_glob;
    dim_glob = (int *) malloc(sizeof(int)*nd);

    /* test with a 1 x ... x 1 matrix, but only if we are executing on a
       single rank */
    if (p == 1)
        {
        cuda_cpx_t *in_1_d;
        cuda_cpx_t *in_1_h;
        in_1_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*1);
        cudaMalloc((void **)&in_1_d,sizeof(cuda_cpx_t)*1);

        for (i = 0; i < nd; ++i)
            dim_glob[i] = 1*pdim[i];

        CUDA_RE(in_1_h[0]) = (double) s;
        CUDA_IM(in_1_h[0]) = 0.0f;

        /* copy to device */
        cudaMemcpy(in_1_d, in_1_h, sizeof(cuda_cpx_t)*1, cudaMemcpyDefault);

        dfft_plan plan_1;
        /* the pidx computed above (all zeros on a single rank) supplies the
           nd entries the plan needs; a local 1-element array would be read
           out of bounds for nd > 1 */
        dfft_cuda_create_plan(&plan_1, nd, dim_glob, NULL, NULL, pdim, pidx,
            0, 0, 0, MPI_COMM_WORLD, proc_map);
        dfft_cuda_check_errors(&plan_1,1);

        /* forward transform */
        dfft_cuda_execute(in_1_d, in_1_d, 0, &plan_1);
        /* backward transform */
        dfft_cuda_execute(in_1_d, in_1_d, 1, &plan_1);

        /* copy back to host */
        cudaMemcpy(in_1_h, in_1_d, sizeof(cuda_cpx_t)*1, cudaMemcpyDefault);

        if (s == 0)
            {
            CHECK_SMALL(CUDA_RE(in_1_h[0]),abs_tol);
            CHECK_SMALL(CUDA_IM(in_1_h[0]),abs_tol);
            }
        else
            {
            CHECK_CLOSE(CUDA_RE(in_1_h[0]),(double)s/(double)p,tol);
            CHECK_SMALL(CUDA_IM(in_1_h[0]),abs_tol);
            }

        cudaFree(in_1_d);
        free(in_1_h);
        dfft_cuda_destroy_plan(plan_1);
        }

    /* now do a test with a 2^n local matrix */
    cuda_cpx_t *in_2_d;
    cuda_cpx_t *in_2_h;
    int size_2n = 1;
    for (i = 0; i < nd; ++i) size_2n *= 2;
    cudaMalloc((void **)&in_2_d,sizeof(cuda_cpx_t)*size_2n);
    in_2_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*size_2n);

    for (i = 0; i < size_2n; ++i)
        {
        CUDA_RE(in_2_h[i]) = (i+s*(double)size_2n);
        CUDA_IM(in_2_h[i]) = 0.0f;
        }

    /* copy data to device */
    cudaMemcpy(in_2_d, in_2_h, sizeof(cuda_cpx_t)*size_2n, cudaMemcpyDefault);

    for (i = 0; i < nd; ++i)
        dim_glob[i] = 2*pdim[i];
    for (i = 0; i < nd-1; ++i)
        if (!s) printf("%d x ",dim_glob[i]);
    if (!s) printf("%d matrix\n", dim_glob[nd-1]);

    dfft_plan plan_2;
    dfft_cuda_create_plan(&plan_2, nd, dim_glob, NULL, NULL, pdim, pidx,
        0, 0, 0, MPI_COMM_WORLD, proc_map);
    dfft_cuda_check_errors(&plan_2,1);

    /* forward transform */
    dfft_cuda_execute(in_2_d, in_2_d, 0, &plan_2);
    /* inverse transform */
    dfft_cuda_execute(in_2_d, in_2_d, 1, &plan_2);

    /* copy back to host */
    cudaMemcpy(in_2_h, in_2_d, sizeof(cuda_cpx_t)*size_2n, cudaMemcpyDefault);

    /* after forward + inverse the result is scaled by the total grid size
       N = size_2n*p */
    for (i = 0; i < size_2n; ++i)
        {
        if (s == 0 && i == 0)
            {
            CHECK_SMALL(CUDA_RE(in_2_h[i]), abs_tol);
            CHECK_SMALL(CUDA_IM(in_2_h[i]), abs_tol);
            }
        else
            {
            CHECK_CLOSE(CUDA_RE(in_2_h[i]), size_2n*p*(i+(double)size_2n*(double)s), tol);
            CHECK_SMALL(CUDA_IM(in_2_h[i]), abs_tol);
            }
        }

    cudaFree(in_2_d);
    free(in_2_h);
    dfft_cuda_destroy_plan(plan_2);

    /* test with a padded 4^n local matrix (two ghost cells per axis) */
    cuda_cpx_t *in_3_d;
    cuda_cpx_t *out_3_d;
    cuda_cpx_t *in_3_h;
    int *inembed;
    inembed = (int *) malloc(sizeof(int)*nd);

    int size_4n_embed = 1;
    int size_4n = 1;
    for (i = 0; i < nd; ++i)
        {
        inembed[i] = 4;      /* embedded (padded) extent per axis */
        size_4n_embed *= 4;
        size_4n *= 2;        /* actual data extent is 2 per axis */
        dim_glob[i] = 2*pdim[i];
        }
    for (i = 0; i < nd-1; ++i)
        if (!s) printf("%d x ",dim_glob[i]);
    if (!s) printf("%d matrix with padding\n", dim_glob[nd-1]);

    cudaMalloc((void **)&in_3_d,sizeof(cuda_cpx_t)*size_4n_embed);
    cudaMalloc((void **)&out_3_d,sizeof(cuda_cpx_t)*size_4n);
    in_3_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*size_4n_embed);

    int *lidx;
    int *gidx;
    lidx = (int *) malloc(nd*sizeof(int));
    gidx = (int *) malloc(nd*sizeof(int));

    for (i = 0; i < size_4n_embed; ++i)
        {
        /* find the local coordinate tuple (row-major) */
        int idx = i;
        int j;
        for (j = nd-1; j >= 0; --j)
            {
            int length = inembed[j];
            lidx[j] = idx % length;
            idx /= length;
            }

        /* global coordinates in block distribution */
        for (j = 0; j < nd; ++j)
            gidx[j] = lidx[j] + inembed[j]*pidx[j];

        /* a unique value per global cell */
        int val = 0;
        for (j = 0; j < nd; ++j)
            {
            val *= inembed[j];
            val += gidx[j];
            }

        /* the outermost layer of the 4^n embedding is the ghost layer */
        int ghost_cell = 0;
        for (j = 0; j < nd; ++j)
            if (lidx[j] == 0 || lidx[j] == 3) ghost_cell = 1;

        if (ghost_cell)
            {
            CUDA_RE(in_3_h[i]) = 99.0;
            CUDA_IM(in_3_h[i]) = 123.0;
            }
        else
            {
            CUDA_RE(in_3_h[i]) = (double) val;
            CUDA_IM(in_3_h[i]) = 0.0;
            }
        }

    /* copy data to device */
    cudaMemcpy(in_3_d, in_3_h, sizeof(cuda_cpx_t)*size_4n_embed, cudaMemcpyDefault);

    dfft_plan plan_3_fw, plan_3_bw;
    dfft_cuda_create_plan(&plan_3_fw, nd, dim_glob, inembed, NULL, pdim, pidx,
        0, 0, 0, MPI_COMM_WORLD, proc_map);
    dfft_cuda_check_errors(&plan_3_fw,1);
    dfft_cuda_create_plan(&plan_3_bw, nd, dim_glob, NULL, inembed, pdim, pidx,
        0, 0, 0, MPI_COMM_WORLD, proc_map);
    dfft_cuda_check_errors(&plan_3_bw,1);

    /* row-major offset of the first interior (non-ghost) element */
    int offset = 0;
    int n_ghost = 2;
    for (i = 0; i < nd; i++)
        {
        offset *= 4;
        offset += n_ghost/2;
        }

    /* forward transform */
    dfft_cuda_execute(in_3_d+offset, out_3_d, 0, &plan_3_fw);
    /* inverse transform */
    dfft_cuda_execute(out_3_d, in_3_d+offset, 1, &plan_3_bw);

    /* copy data back to host */
    cudaMemcpy(in_3_h, in_3_d, sizeof(cuda_cpx_t)*size_4n_embed, cudaMemcpyDefault);

    /* check results */
    for (i = 0; i < size_4n_embed; ++i)
        {
        /* find the local coordinate tuple (row-major) */
        int idx = i;
        int j;
        for (j = nd-1; j >= 0; --j)
            {
            int length = inembed[j];
            lidx[j] = idx % length;
            idx /= length;
            }

        /* global coordinates in block distribution */
        for (j = 0; j < nd; ++j)
            gidx[j] = lidx[j] + inembed[j]*pidx[j];

        int val = 0;
        for (j = 0; j < nd; ++j)
            {
            val *= inembed[j];
            val += gidx[j];
            }

        int ghost_cell = 0;
        for (j = 0; j < nd; ++j)
            if (lidx[j] == 0 || lidx[j] == 3) ghost_cell = 1;

        if (ghost_cell)
            {
            /* ghost cells are not checked (they may be overwritten) */
            //CHECK_CLOSE(CUDA_RE(in_3[i]),99.0,tol);
            //CHECK_CLOSE(CUDA_IM(in_3[i]), 123.0,tol);
            }
        else
            {
            CHECK_CLOSE(CUDA_RE(in_3_h[i]), (double)val*(double)p*(double)size_4n, tol);
            CHECK_SMALL(CUDA_IM(in_3_h[i]), abs_tol);
            }
        }

    cudaFree(in_3_d);
    cudaFree(out_3_d);
    free(in_3_h);
    free(lidx);
    free(gidx);
    free(inembed);
    dfft_cuda_destroy_plan(plan_3_fw);
    dfft_cuda_destroy_plan(plan_3_bw);

    free(pidx);
    free(pdim);
    free(dim_glob);
    }
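/* The ghost-cell offset computed in the padded test above, written out as a
 * sketch for a general embedding (an illustration; the test inlines the
 * inembed[j] == 4 case): it is the row-major linear index of the element at
 * coordinate (n_ghost/2, ..., n_ghost/2), i.e. the first interior element. */
static int first_interior_offset(int nd, int *inembed, int n_ghost)
    {
    int offset = 0;
    int i;
    for (i = 0; i < nd; ++i)
        {
        offset *= inembed[i];   /* the test above uses the literal 4 here */
        offset += n_ghost/2;    /* skip one ghost cell on the low side */
        }
    return offset;
    }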
/* Compare a 1d FFT against a reference FFT */
void test_distributed_fft_1d_compare(int n)
    {
    int s,p;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &s);

    int pdim[1];
    pdim[0] = p;
    /* determine processor index */
    int pidx[1];
    pidx[0] = s;

    float scale = n;
    /* assume 0.5 sig digit loss per addition/twiddling (empirical) */
    float sig_digits = 7.0-0.5*logf(scale)/logf(2.0);
    double tol = powf(10.0,-sig_digits);
    double abs_tol = 1.0*tol;
    printf("Testing with %f sig digits, rel precision %f, abs precision %f\n",
        sig_digits, tol, abs_tol);

    int dim_glob[1];
    dim_glob[0] = n;

    /* do a size-n FFT (n = power of two) */
    kiss_fft_cpx *in_kiss;
    in_kiss = (kiss_fft_cpx *)malloc(sizeof(kiss_fft_cpx)*n);
    srand(45678);

    /* fill the vector with complex random numbers */
    int i;
    for (i = 0; i < n; ++i)
        {
        in_kiss[i].r = (float)rand()/(float)RAND_MAX;
        in_kiss[i].i = (float)rand()/(float)RAND_MAX;
        }

    kiss_fft_cpx *out_kiss;
    out_kiss = (kiss_fft_cpx *)malloc(sizeof(kiss_fft_cpx)*n);

    /* construct the forward transform */
    kiss_fft_cfg cfg = kiss_fft_alloc(n,0,NULL,NULL);

    /* carry out the conventional FFT */
    kiss_fft(cfg, in_kiss, out_kiss);

    /* compare to the distributed FFT */
    cuda_cpx_t *in_d, *in_h;
    cudaMalloc((void **)&in_d,sizeof(cuda_cpx_t)*n/p);
    in_h = (cuda_cpx_t *) malloc(sizeof(cuda_cpx_t)*n/p);

    /* this rank holds the contiguous block starting at s*n/p */
    for (i = 0; i < n/p; ++i)
        {
        CUDA_RE(in_h[i]) = in_kiss[s*n/p+i].r;
        CUDA_IM(in_h[i]) = in_kiss[s*n/p+i].i;
        }

    cuda_cpx_t *out_d, *out_h;
    cudaMalloc((void **)&out_d,sizeof(cuda_cpx_t)*n/p);
    out_h = (cuda_cpx_t *)malloc(sizeof(cuda_cpx_t)*n/p);

    dfft_plan plan;
    dfft_cuda_create_plan(&plan, 1, dim_glob, NULL, NULL, pdim, pidx,
        0, 0, 0, MPI_COMM_WORLD, proc_map);
    dfft_cuda_check_errors(&plan,1);

    /* copy data to device */
    cudaMemcpy(in_d, in_h, sizeof(cuda_cpx_t)*n/p, cudaMemcpyDefault);

    /* forward transform */
    dfft_cuda_execute(in_d, out_d, 0, &plan);

    /* copy data back to host */
    cudaMemcpy(out_h, out_d, sizeof(cuda_cpx_t)*n/p, cudaMemcpyDefault);

    /* do comparison */
    for (i = 0; i < n/p; ++i)
        {
        int j = s*n/p + i;
        double re = CUDA_RE(out_h[i]);
        double im = CUDA_IM(out_h[i]);
        double re_kiss = out_kiss[j].r;
        double im_kiss = out_kiss[j].i;

        if (fabs(re_kiss) < abs_tol)
            {
            CHECK_SMALL(re, 2*abs_tol);
            }
        else
            {
            CHECK_CLOSE(re, re_kiss, tol);
            }
        if (fabs(im_kiss) < abs_tol)
            {
            CHECK_SMALL(im, 2*abs_tol);
            }
        else
            {
            CHECK_CLOSE(im, im_kiss, tol);
            }
        }

    free(cfg);
    free(in_kiss);
    free(out_kiss);
    cudaFree(out_d);
    cudaFree(in_d);
    free(in_h);
    free(out_h);
    dfft_cuda_destroy_plan(plan);
    }
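/* A sketch of a test driver (an assumption; the harness that defines
 * proc_map and registers these tests is not part of this file). proc_map is
 * assumed here to be the identity rank-to-processor mapping, and the guard
 * keeps the sketch from clashing with the real harness's globals and main(). */
#ifdef DFFT_EXAMPLE_DRIVER
int *proc_map;

int main(int argc, char **argv)
    {
    MPI_Init(&argc, &argv);

    int p, i;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    proc_map = (int *) malloc(sizeof(int)*p);
    for (i = 0; i < p; ++i)
        proc_map[i] = i;   /* assumed identity mapping */

    test_distributed_fft_nd(3);
    test_distributed_fft_1d_compare(64);
    test_distributed_fft_3d_compare();

    free(proc_map);
    MPI_Finalize();
    return 0;
    }
#endif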