void fft_1D(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init, cl_kernel knl,cl_command_queue queue,int direction,int offset_line) { //handle complex-to-complex fft, accutal size = 2 * N //size_t ldim[] = { 128 }; //size_t gdim[] = { (N /ldim[0])/2}; int Ns = 1; int y =0; SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset_line,y); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); for(Ns=4; Ns<N; Ns<<=2) { SET_6_KERNEL_ARGS(knl, b, c, N, Ns,direction,offset_line); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, knl, 1, NULL, gdim, ldim, 0, NULL, NULL)); clEnqueueCopyBuffer(queue,c,b, offset_line*N*2*sizeof(float), offset_line*N*2*sizeof(float), sizeof(float)*N*2,0,NULL,NULL); //VecCopy(c,b,N,offset_line,vec_copy,queue); } }
void mgv(ftype f[], ftype u[], ftype dx, unsigned n1,unsigned n2,unsigned n3, size_t field_size, unsigned points, int use_alignment, unsigned dim_x, cl_context ctx, cl_command_queue queue, cl_kernel poisson_knl, int wg_dims , int wg_x, int wg_y, int wg_z, int z_div, int fetch_per_pt, int flops_per_pt){ // mgv does one v-cycle for the Poisson problem on a grid with mesh size dx // Inputs: f is right hand side, u is current approx dx is mesh size, n1 number of sweeps // on downward branch, n2 number of sweeps on upwardbranch, n3 number of sweeps on // coarsest grid. // Output: It just returns an updated version of u cl_ulong start_big; #ifdef DO_TIMING cl_event evt; cl_event *evt_ptr = &evt; #else cl_event *evt_ptr = NULL; #endif size_t i, isweep; item * ugrid, * head, * curr; int l = 0; ftype dxval[POINTS/2] = {0}; // this is huge and unnecessary. Try to cut downif time!! ftype h; unsigned nx[POINTS/2] = {0}; // --- Allocate common gpu memory---- cl_int status; cl_mem dev_buf_u = clCreateBuffer(ctx, CL_MEM_READ_WRITE, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem dev_buf_f = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem dev_buf_hist_u = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); //cl_mem read_buf = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // ----------------------------------- dxval[0] = dx; nx[0] = points; //const size_t max_size = POINTS * POINTS * ((POINTS + 15)/16) * 16; // --------------- Allocatig the finest grid -------------------- ugrid = (item *)malloc(sizeof(item)); ugrid->uvec = malloc(field_size * sizeof(ftype)); ugrid->fvec = malloc(field_size * sizeof(ftype)); ugrid->rvec = malloc(field_size * sizeof(ftype)); ugrid->dim_other = nx[0]; ugrid->dim_x = dim_x; for(i = 0; i < field_size; i++){ ugrid->uvec[i] = u[i]; ugrid->fvec[i] = f[i]; ugrid->rvec[i] = 0; } head = ugrid; // head will always be the first one // ---------------- Set up the coarse grids ---------------------- while((nx[l] - 1) % 2 == 0 && nx[l] > 3){ l = l+1; nx[l] = (nx[l - 1] - 1) / 2 + 1; dxval[l] = 2 * dxval[l-1]; curr = (item *)malloc(sizeof(item)); curr->uvec = malloc(field_size * sizeof(ftype)); curr->fvec = malloc(field_size * sizeof(ftype)); curr->rvec = malloc(field_size * sizeof(ftype)); curr->dim_other = nx[l]; curr->field_start = 0; curr->dim_x = curr->dim_other; if(use_alignment) curr->dim_x = ((nx[l] + 15)/16) * 16; // initialize vectors for(i = 0; i < field_size; i++){ curr->uvec[i] = 0; curr->fvec[i] = 0; curr->rvec[i] = 0; } ugrid->next = curr; // curr gets attached to ugrid curr->prev = ugrid; ugrid = curr; } int nl = l; // this is the maximum number of grids that were created // --- at this point head contains the finest grid and ugrid contains the coarsest ----- curr = head; head->prev = NULL; ugrid->next = NULL; // ---------------- Now relax each of the different grids descending-------- for(l = 0; l < nl; l++){ // I stop right before nl (will be treated different) // ---------------------------------------------------------------------- // -------------------- GPU DESCENDING V-CYCLE -------------------------- // ---------------------------------------------------------------------- { if(curr->dim_other < CUTOFF){ for(isweep = 0; isweep < n1; isweep++){ gsrelax(curr, dxval[l]); } } else{ // ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- // // fill in the buffers inside the GPU with the current data CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); h = dxval[l] * dxval[l]; size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div }; size_t ldim[] = { wg_x, wg_y, wg_z }; for(i = 0; i < n1; i++){ // ---------------------------------------------------------------------- // invoke poisson kernel // ---------------------------------------------------------------------- //size_t u_size; //CALL_CL_GUARDED(clGetMemObjectInfo, (dev_buf_u, CL_MEM_SIZE, sizeof(u_size), &u_size, 0)); //int u_size_i = u_size; //printf("u_size=%d fstart=%d dim_x=%d dim_other=%d\n" , u_size_i, curr->field_start, curr->dim_x, curr->dim_other); curr->field_start = 0; SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h); // run the kernel CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr)); #ifdef DO_TIMING // If timing is enabled, this wait can mean a significant performance hit. CALL_CL_GUARDED(clWaitForEvents, (1, &evt)); cl_ulong start, end; CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL)); gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt); start_big = start; seconds_taken += 1e-9*(end-start); mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6; gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt; CALL_CL_GUARDED(clReleaseEvent, (evt)); #endif CALL_CL_GUARDED(clFinish, (queue)); //ira adentro?? cl_mem tmp = dev_buf_u; dev_buf_u = dev_buf_hist_u; dev_buf_hist_u = tmp; } //when I'm done, read from buffer CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); } } resid2(curr, dxval[l]); injf2c(curr, curr->next); //this function updates f_{i+1} curr = curr->next; } // ---------------------------------------------------------------------- // --------------- GPU ON THE COARSEST GRID ----------------------------- // ---------------------------------------------------------------------- { if(curr->dim_other < CUTOFF){ for(i = 0; i < n3; i++){ gsrelax(curr, dxval[nl]); } } else{ // ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- // // fill in the buffers inside the GPU with the current data CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); h = dxval[nl] * dxval[nl]; size_t gdim[] = { curr->dim_x - 16, curr->dim_x - 16, curr->dim_x/z_div }; size_t ldim[] = { wg_x, wg_y, wg_z }; for(i = 0; i < n3; i++){ // ---------------------------------------------------------------------- // invoke poisson kernel // ---------------------------------------------------------------------- curr->field_start = 0; SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u,curr->field_start, curr->dim_x, curr->dim_other, h); // run the kernel curr->field_start = 0; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr)); #ifdef DO_TIMING // If timing is enabled, this wait can mean a significant performance hit. CALL_CL_GUARDED(clWaitForEvents, (1, &evt)); cl_ulong start, end; CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL)); gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt); seconds_taken += 1e-9*(end-start); mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6; gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt; CALL_CL_GUARDED(clReleaseEvent, (evt)); #endif CALL_CL_GUARDED(clFinish, (queue)); //ira adentro?? cl_mem tmp = dev_buf_u; dev_buf_u = dev_buf_hist_u; dev_buf_hist_u = tmp; } //when I'm done, read from buffer CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); } } // ---------------------------------------------------------------------- // -----------Upward branch of the V-cycle ------------------------------ // ---------------------------------------------------------------------- for(l = nl-1; l >= 0; --l){ ctof(curr->prev, curr, field_size); //curr->prev is the finer of the two free(curr->uvec); //curr won't be needed anymore free(curr->fvec); free(curr->rvec); curr = curr->prev; curr->next = NULL; for(isweep = 0; isweep < n2; isweep++){ gsrelax(curr, dxval[l]); } // Update the grids n1 times using the GPU when necessary { if(curr->dim_other < CUTOFF){ for(isweep = 0; isweep < n2; isweep++){ gsrelax(curr, dxval[l]); } } else{ // ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- // // fill in the buffers inside the GPU with the current data CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); h = dxval[l] * dxval[l]; size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div }; size_t ldim[] = { wg_x, wg_y, wg_z }; for(i = 0; i < n1; i++){ // ---------------------------------------------------------------------- // invoke poisson kernel // ---------------------------------------------------------------------- curr->field_start = 0; SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h); // run the kernel CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr)); #ifdef DO_TIMING // If timing is enabled, this wait can mean a significant performance hit. CALL_CL_GUARDED(clWaitForEvents, (1, &evt)); cl_ulong start, end; CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL)); gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt); seconds_taken += 1e-9*(end-start); tot_secs += 1e-9*(end-start_big); mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6; gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt; CALL_CL_GUARDED(clReleaseEvent, (evt)); #endif CALL_CL_GUARDED(clFinish, (queue)); //ira adentro?? cl_mem tmp = dev_buf_u; dev_buf_u = dev_buf_hist_u; dev_buf_hist_u = tmp; } //when I'm done, read from buffer CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); } } } // ---------- and the solution is right there in the last curr curr->uvec for(i = 0; i < field_size; i++) u[i] = curr->uvec[i]; free(curr->uvec); //free(curr->fvec); free(curr->rvec); //free(ugrid->uvec); //free(ugrid->fvec); //free(ugrid->rvec); free(curr); CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_u)); CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_f)); CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_hist_u)); }
void fft2D_big_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init_big, cl_kernel clean,cl_kernel mat_trans, cl_kernel mat_trans_3D, cl_command_queue queue,int direction) { int offset_line = 0; int Ns = 1; int y =0; SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y); size_t ldim[] = { 16 }; size_t gdim[] = { N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init_big, 1, NULL, gdim, ldim, 0, NULL, NULL)); if(N!=64) if(N == 1024) { int Ns =1; int y =0; //cl_long offset = offset_line * N; SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y); size_t ldim[]={ 4 }; size_t gdim[] ={ N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); int option =0; float k =0; int n = 16; SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N); size_t ldim2[] = { 16, 16 ,1}; size_t gdim2[] = { 16, 64 ,N}; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } else if(N ==256) { int Ns =1; int y =0; offset_line =0; SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y); size_t ldim[] ={4}; size_t gdim[] ={N*N/4}; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); int option =0; float k =0; int n = 4; SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N); size_t ldim2[] = { 4, 4 ,1}; size_t gdim2[] = { 4, 64, N }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } else { printf("FFT not implemented for this size!!!\n"); return; } //CALL_CL_GUARDED(clFinish, (queue)); //printf("1D fine \n"); mat__trans(b,c,N,mat_trans,queue,0,1,1,1); //CALL_CL_GUARDED(clFinish, (queue)); /* for(int j= 0;j<N;j++) { //fft_1D(c,b,d,N,fft_init,fft1D,queue,direction,j); fft_1D_big(c, b,d,N, init_big, clean,mat_trans,queue,direction,j); } */ Ns =1; SET_7_KERNEL_ARGS(init_big, c, b, N, Ns,direction,offset_line,y); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init_big, 1, NULL, gdim, ldim, 0, NULL, NULL)); if (N !=64 ) if( N == 256 || N == 1024) { int Ns =1; int y = 0; int offset_line = 0; SET_7_KERNEL_ARGS(clean, b, d, N, Ns,direction,offset_line,y); size_t ldim[] = { 4 }; size_t gdim[] = { N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); if(N == 1024) { int option =0; float k =0; int n = 16; SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N); size_t ldim2[] = { 16, 16 ,1}; size_t gdim2[] = { 16, 64 ,N}; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } else if(N ==256) { int option =0; float k =0; int n = 4; SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N); size_t ldim2[] = { 4, 4 ,1}; size_t gdim2[] = { 4, 64, N }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } } else { printf("FFT not implemented for this size!!!\n"); return; } //CALL_CL_GUARDED(clFinish, (queue)); if(direction == 1) mat__trans(b,c,N,mat_trans,queue,0,1,1,1); else mat__trans(b,c,N,mat_trans,queue,-1,1,1,1); }
void fft2D_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init,cl_kernel interm, cl_kernel fft1D,cl_kernel mat_trans, cl_command_queue queue,int direction) { #if 0 int Ns = 1; int y =0; int x =N*N; SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,y,y); size_t ldim[] = { 1 }; size_t gdim[] = { N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); #endif #if 1 int Ns = 1; int stride = 64; for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; int y =0; SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset,y); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); } #if 1 for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N >= 4) { Ns = 4; SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,c,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N>=16) { Ns = 16; SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,c,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); if(N >=64) #endif for(Ns=64; Ns<N; Ns<<=2) { for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; SET_6_KERNEL_ARGS(fft1D, b, c, N, Ns,direction,offset); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, fft1D, 1, NULL, gdim, ldim, 0, NULL, NULL)); //VecCopy(c,b,N,offset_line,vec_copy,queue); } clEnqueueCopyBuffer(queue,c,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); } #endif //CALL_CL_GUARDED(clFinish, (queue)); //printf("1D fine \n"); mat__trans(b,c,N,mat_trans,queue,0,1,1,1); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, c, /*blocking*/ CL_TRUE, /*offset*/ 2*sizeof(float)*N, sizeof(float), &test, 0, NULL, NULL)); printf("test = %f\n",test); #endif //CALL_CL_GUARDED(clFinish, (queue)); #if 0 for(int j= 0;j<N;j++) { fft_1D_new(c,b,d,N,init,interm,fft1D,queue,direction,j); } #endif #if 1 Ns = 1; for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; int y =0; SET_7_KERNEL_ARGS(init, c, b, N, Ns,direction,offset,y); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); } #if 1 for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N >= 4) { Ns = 4; SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,d,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N>=16) { Ns = 16; SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,d,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); if(N >=64) #endif for(Ns=64; Ns<N; Ns<<=2) { for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; SET_6_KERNEL_ARGS(fft1D, b, d, N, Ns,direction,offset); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, fft1D, 1, NULL, gdim, ldim, 0, NULL, NULL)); //VecCopy(c,b,N,offset_line,vec_copy,queue); } clEnqueueCopyBuffer(queue,d,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); } #endif //CALL_CL_GUARDED(clFinish, (queue)); if(direction == 1) mat__trans(b,c,N,mat_trans,queue,0,1,1,1); else mat__trans(b,c,N,mat_trans,queue,-1,1,1,1); }
void fft_1D_big(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init_big, cl_kernel clean,cl_kernel mat_trans,cl_command_queue queue,int direction,int offset_line) { //handle complex-to-complex fft, accutal size = 2 * N //size_t ldim[] = { 128 }; //size_t gdim[] = { (N /ldim[0])/2}; int Ns = 1; int y =0; SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init_big, 1, NULL, gdim, ldim, 0, NULL, NULL)); if (N ==64 ) return; else if( N == 256 || N == 1024) { cl_long offset = offset_line * N; SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y); ldim[0] =4; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); if(N == 1024) { int option =0; float k =0; int n = 16; SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset); size_t ldim[] = { 16, 16 }; size_t gdim[] = { 16, 64 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans, 2, NULL, gdim, ldim, 0, NULL, NULL)); } else if(N ==256) { int option =0; float k =0; int n = 4; SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset); size_t ldim[] = { 4, 4 }; size_t gdim[] = { 4, 64 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans, 2, NULL, gdim, ldim, 0, NULL, NULL)); } } else { printf("FFT not implemented for this size!!!\n"); return; } }