void fft_1D(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init, cl_kernel knl,cl_command_queue queue,int direction,int offset_line)
{
	//handle complex-to-complex fft, accutal size = 2 * N

	//size_t ldim[] = { 128 };
	//size_t gdim[] = { (N /ldim[0])/2};
	int Ns = 1;
	int y =0;
	SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset_line,y);


	size_t ldim[] = { 1 };
	size_t gdim[] = { N/4 };

	CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	
	for(Ns=4; Ns<N; Ns<<=2)
	{



			SET_6_KERNEL_ARGS(knl, b, c, N, Ns,direction,offset_line);
			size_t ldim[] = { 1 };
			size_t gdim[] = { N/4 };
			
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
					(queue, knl,
					 1, NULL, gdim, ldim,
					0, NULL, NULL));
			clEnqueueCopyBuffer(queue,c,b,
					offset_line*N*2*sizeof(float),
					offset_line*N*2*sizeof(float),
					sizeof(float)*N*2,0,NULL,NULL);
			//VecCopy(c,b,N,offset_line,vec_copy,queue);
			
			
			
		  
	}
	
}
Exemple #2
0
void mgv(ftype f[], ftype u[], ftype dx, unsigned n1,unsigned n2,unsigned n3, size_t field_size, unsigned points, int use_alignment, unsigned dim_x, cl_context ctx, cl_command_queue queue, cl_kernel poisson_knl, int wg_dims , int wg_x, int wg_y, int wg_z, int z_div, int fetch_per_pt, int flops_per_pt){
  // mgv does one v-cycle for the Poisson problem on a grid with mesh size dx
  // Inputs: f is right hand side, u is current approx dx is mesh size, n1 number of sweeps
  // on downward branch, n2 number of sweeps on upwardbranch, n3 number of sweeps on
  // coarsest grid.
  // Output:  It just returns an updated version of u
  cl_ulong start_big;
  #ifdef DO_TIMING
  cl_event evt;
  cl_event *evt_ptr = &evt;
  #else
  cl_event *evt_ptr = NULL;
  #endif
  size_t i, isweep;
  item * ugrid, * head, * curr;
  int l = 0;
  ftype dxval[POINTS/2] = {0};  // this is huge and unnecessary.  Try to cut downif time!!  
  ftype h;
  unsigned nx[POINTS/2] = {0};
  // --- Allocate common gpu memory----
  cl_int status;
  cl_mem dev_buf_u = clCreateBuffer(ctx, CL_MEM_READ_WRITE, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  cl_mem dev_buf_f = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  cl_mem dev_buf_hist_u = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  //cl_mem read_buf = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  // -----------------------------------
  dxval[0] = dx;
  nx[0] = points;
  //const size_t max_size  = POINTS * POINTS * ((POINTS + 15)/16) * 16;
  // --------------- Allocatig the finest grid --------------------
  ugrid = (item *)malloc(sizeof(item));
  ugrid->uvec = malloc(field_size * sizeof(ftype));
  ugrid->fvec = malloc(field_size * sizeof(ftype));
  ugrid->rvec = malloc(field_size * sizeof(ftype));
  ugrid->dim_other = nx[0];
  ugrid->dim_x = dim_x;
  for(i = 0; i < field_size; i++){
     ugrid->uvec[i] = u[i];
     ugrid->fvec[i] = f[i];
     ugrid->rvec[i] = 0;
  }
  head = ugrid;  // head will always be the first one

  // ---------------- Set up the coarse grids ----------------------
  while((nx[l] - 1) % 2 == 0 && nx[l] > 3){
    l = l+1;
    nx[l] = (nx[l - 1] - 1) / 2 + 1;
    dxval[l] = 2 * dxval[l-1]; 
    curr = (item *)malloc(sizeof(item));
    curr->uvec = malloc(field_size * sizeof(ftype));
    curr->fvec = malloc(field_size * sizeof(ftype));
    curr->rvec = malloc(field_size * sizeof(ftype));

    curr->dim_other = nx[l];
    curr->field_start = 0;
    curr->dim_x = curr->dim_other;
    if(use_alignment)
    	curr->dim_x = ((nx[l] + 15)/16) * 16;

    // initialize vectors
    for(i = 0; i < field_size; i++){
 	curr->uvec[i] = 0;
	curr->fvec[i] = 0;
     	curr->rvec[i] = 0;
    }
    ugrid->next = curr; // curr gets attached to ugrid
    curr->prev = ugrid;
    ugrid = curr;
  }
  int nl = l; // this is the maximum number of grids that were created
  // --- at this point head contains the finest grid and ugrid contains the coarsest -----
  curr = head;
  head->prev = NULL;
  ugrid->next = NULL;

  // ---------------- Now relax each of the different grids descending--------
  for(l = 0; l < nl; l++){  // I stop right before nl (will be treated different)
     // ----------------------------------------------------------------------
     // -------------------- GPU DESCENDING V-CYCLE --------------------------
     // ----------------------------------------------------------------------
     {
     if(curr->dim_other < CUTOFF){
	for(isweep = 0; isweep < n1; isweep++){
	     gsrelax(curr, dxval[l]);
	}
     }

     else{
  	// ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- //
  	// fill in the buffers inside the GPU with the current data
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL));
	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	h = dxval[l] * dxval[l];
        size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div };
        size_t ldim[] = { wg_x, wg_y, wg_z };

  	for(i = 0; i < n1; i++){
     	   // ----------------------------------------------------------------------
     	   // invoke poisson kernel
     	   // ----------------------------------------------------------------------
	   //size_t u_size;
	   //CALL_CL_GUARDED(clGetMemObjectInfo, (dev_buf_u, CL_MEM_SIZE, sizeof(u_size), &u_size, 0));
	   //int u_size_i = u_size;
	   //printf("u_size=%d fstart=%d dim_x=%d dim_other=%d\n" , u_size_i, curr->field_start, curr->dim_x, curr->dim_other);
	   curr->field_start = 0;
     	   SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h);
     	   // run the kernel
     	   CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr));
     	   #ifdef DO_TIMING
      	   // If timing is enabled, this wait can mean a significant performance hit.
      	   CALL_CL_GUARDED(clWaitForEvents, (1, &evt));
 
      	   cl_ulong start, end;
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL));
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL));
 
      	   gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt);
	   start_big = start;
      	   seconds_taken += 1e-9*(end-start);
      	   mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6;
      	   gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt;

      	   CALL_CL_GUARDED(clReleaseEvent, (evt));
     	   #endif
      	   CALL_CL_GUARDED(clFinish, (queue)); //ira adentro??
	   cl_mem tmp = dev_buf_u;
	   dev_buf_u = dev_buf_hist_u;
	   dev_buf_hist_u = tmp;
     	}
        //when I'm done, read from buffer
        CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
      }
     }
     resid2(curr, dxval[l]);
     injf2c(curr, curr->next); //this function updates f_{i+1}
     curr = curr->next;
  }
  // ----------------------------------------------------------------------
  // --------------- GPU ON THE COARSEST GRID -----------------------------
  // ----------------------------------------------------------------------
  {
    if(curr->dim_other < CUTOFF){
	for(i = 0; i < n3; i++){
	     gsrelax(curr, dxval[nl]);
	}
    }

    else{
  	// ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- //
  	// fill in the buffers inside the GPU with the current data
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	h = dxval[nl] * dxval[nl];
        size_t gdim[] = { curr->dim_x - 16, curr->dim_x - 16, curr->dim_x/z_div };
        size_t ldim[] = { wg_x, wg_y, wg_z };

  	for(i = 0; i < n3; i++){
     	   // ----------------------------------------------------------------------
     	   // invoke poisson kernel
     	   // ----------------------------------------------------------------------
	   curr->field_start = 0;
     	   SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u,curr->field_start, curr->dim_x, curr->dim_other, h);
     	   // run the kernel
	   curr->field_start = 0;
     	   CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr));
     	   #ifdef DO_TIMING
      	   // If timing is enabled, this wait can mean a significant performance hit.
      	   CALL_CL_GUARDED(clWaitForEvents, (1, &evt));
 
      	   cl_ulong start, end;
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL));
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL));
 
      	   gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt);
      	   seconds_taken += 1e-9*(end-start);
      	   mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6;
      	   gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt;

      	   CALL_CL_GUARDED(clReleaseEvent, (evt));
     	   #endif
      	   CALL_CL_GUARDED(clFinish, (queue)); //ira adentro??
	   cl_mem tmp = dev_buf_u;
	   dev_buf_u = dev_buf_hist_u;
	   dev_buf_hist_u = tmp;
     	}
        //when I'm done, read from buffer
        CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
     }
  }
  // ----------------------------------------------------------------------
  // -----------Upward branch of the V-cycle ------------------------------
  // ----------------------------------------------------------------------
  for(l = nl-1; l >= 0; --l){
     ctof(curr->prev, curr, field_size); //curr->prev is the finer of the two
     free(curr->uvec);  //curr won't be needed anymore
     free(curr->fvec);
     free(curr->rvec);
     curr = curr->prev;
     curr->next = NULL;
     for(isweep = 0; isweep < n2; isweep++){
	   gsrelax(curr, dxval[l]);
     }
     // Update the grids n1 times using the GPU when necessary
     {
     if(curr->dim_other < CUTOFF){
	for(isweep = 0; isweep < n2; isweep++){
	     gsrelax(curr, dxval[l]);
	}
     }

     else{
  	// ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- //
  	// fill in the buffers inside the GPU with the current data
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	h = dxval[l] * dxval[l];
        size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div };
        size_t ldim[] = { wg_x, wg_y, wg_z };

  	for(i = 0; i < n1; i++){
     	   // ----------------------------------------------------------------------
     	   // invoke poisson kernel
     	   // ----------------------------------------------------------------------
	   curr->field_start = 0;
     	   SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h);
     	   // run the kernel
     	   CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr));
     	   #ifdef DO_TIMING
      	   // If timing is enabled, this wait can mean a significant performance hit.
      	   CALL_CL_GUARDED(clWaitForEvents, (1, &evt));
 
      	   cl_ulong start, end;
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL));
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL));
 
      	   gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt);
      	   seconds_taken += 1e-9*(end-start);
	   tot_secs += 1e-9*(end-start_big);
      	   mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6;
      	   gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt;

      	   CALL_CL_GUARDED(clReleaseEvent, (evt));
     	   #endif
      	   CALL_CL_GUARDED(clFinish, (queue)); //ira adentro??
	   cl_mem tmp = dev_buf_u;
	   dev_buf_u = dev_buf_hist_u;
	   dev_buf_hist_u = tmp;
     	}
        //when I'm done, read from buffer
        CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
      }
     }
  }
  // ---------- and the solution is right there in the last curr curr->uvec
  for(i = 0; i < field_size; i++)
     u[i] = curr->uvec[i];
  free(curr->uvec);
  //free(curr->fvec);
  free(curr->rvec);
  //free(ugrid->uvec);
  //free(ugrid->fvec);
  //free(ugrid->rvec);
  free(curr);
  CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_u));
  CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_f));
  CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_hist_u));
}
void fft2D_big_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init_big,
		cl_kernel clean,cl_kernel mat_trans, cl_kernel mat_trans_3D, cl_command_queue queue,int direction)
{
	


		int offset_line = 0;
			int Ns = 1;
			int y =0;
		SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y);


		size_t ldim[] = { 16 };
		size_t gdim[] = { N*N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, init_big,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));
		
	if(N!=64)
		if(N == 1024)
		{
		
			int Ns =1;
			int y =0;			
			//cl_long offset = offset_line * N;
			SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y);
			size_t ldim[]={ 4 };
			size_t gdim[] ={ N*N/4 };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, clean,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		

			
			int option =0;
			float k =0;
			int n = 16;			
						
			SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N);

				size_t ldim2[] = { 16, 16 ,1};
				size_t gdim2[] = { 16, 64 ,N};

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));

		}
		else if(N ==256)
		{

			int Ns =1;
			int y =0;			
			offset_line =0;
			SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y);
			size_t ldim[] ={4};
			size_t gdim[] ={N*N/4};

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, clean,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));


							
			int option =0;
			float k =0;
			int n = 4;
						
			SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N);

				size_t ldim2[] = { 4, 4 ,1};
				size_t gdim2[] = { 4, 64, N };

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));

			
		}
		
		else
		{
			printf("FFT not implemented for this size!!!\n");

			return;
		}	
	
	//CALL_CL_GUARDED(clFinish, (queue));
	//printf("1D fine \n");

	mat__trans(b,c,N,mat_trans,queue,0,1,1,1);

	//CALL_CL_GUARDED(clFinish, (queue));
/*	for(int j= 0;j<N;j++)
	{
		//fft_1D(c,b,d,N,fft_init,fft1D,queue,direction,j);
		fft_1D_big(c, b,d,N, init_big, clean,mat_trans,queue,direction,j);
	}
*/

		Ns =1;
		SET_7_KERNEL_ARGS(init_big, c, b, N, Ns,direction,offset_line,y);



		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, init_big,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		
if (N !=64 )
		

		if( N == 256 || N == 1024)
		{
			int Ns =1;
			int y = 0;			
			int offset_line = 0;
			SET_7_KERNEL_ARGS(clean, b, d, N, Ns,direction,offset_line,y);
			size_t ldim[] = { 4 };
			size_t gdim[] = { N*N/4 };

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, clean,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

			if(N == 1024)
			{
			
			int option =0;
			float k =0;
			int n = 16;			
						
			SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N);

				size_t ldim2[] = { 16, 16 ,1};
				size_t gdim2[] = { 16, 64 ,N};

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));
			
			}
			else if(N ==256)
			{
		
			int option =0;
			float k =0;
			int n = 4;
						
			SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N);

				size_t ldim2[] = { 4, 4 ,1};
				size_t gdim2[] = { 4, 64, N };

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));

			
			}	
		
		}

		else
		{
			printf("FFT not implemented for this size!!!\n");

			return;
		}	
	

	//CALL_CL_GUARDED(clFinish, (queue));
	if(direction == 1)
		mat__trans(b,c,N,mat_trans,queue,0,1,1,1);
	else 
		mat__trans(b,c,N,mat_trans,queue,-1,1,1,1);
	
}
void fft2D_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init,cl_kernel interm,
		cl_kernel fft1D,cl_kernel mat_trans, cl_command_queue queue,int direction)
{
#if 0	
		int Ns = 1;
		int y =0;
		int x =N*N;
		SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,y,y);


		size_t ldim[] = { 1 };
		size_t gdim[] = { N*N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
#endif

#if 1
	int Ns = 1;
	int stride = 64;
	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		
		int y =0;
		SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset,y);


		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	}
#if 1
	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N >= 4)
		{
		Ns = 4;

		SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };
		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		 1, NULL, gdim, ldim,
		0, NULL, NULL));

		}
}
		
	
		clEnqueueCopyBuffer(queue,c,b,
			0,
			0,
			sizeof(float)*N*N*2,0,NULL,NULL);


	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N>=16)
		{
		Ns = 16;

		SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		1, NULL, gdim, ldim,
		0, NULL, NULL));

		}

}

		clEnqueueCopyBuffer(queue,c,b,
		0,
		0,
		sizeof(float)*N*N*2,0,NULL,NULL);

	if(N >=64) 

		
#endif
		for(Ns=64; Ns<N; Ns<<=2)
		{

	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;

		SET_6_KERNEL_ARGS(fft1D, b, c, N, Ns,direction,offset);
		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, fft1D,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		//VecCopy(c,b,N,offset_line,vec_copy,queue);




		}

		clEnqueueCopyBuffer(queue,c,b,
				0,
				0,
				sizeof(float)*N*N*2,0,NULL,NULL);
		}

#endif
	//CALL_CL_GUARDED(clFinish, (queue));
	//printf("1D fine \n");

	mat__trans(b,c,N,mat_trans,queue,0,1,1,1);

	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, c, /*blocking*/ CL_TRUE, /*offset*/ 2*sizeof(float)*N,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

	printf("test = %f\n",test);
	#endif



	//CALL_CL_GUARDED(clFinish, (queue));

#if 0
	for(int j= 0;j<N;j++)
	{
		fft_1D_new(c,b,d,N,init,interm,fft1D,queue,direction,j);
	}
#endif
#if 1
	Ns = 1;	
	

	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		
		int y =0;
		SET_7_KERNEL_ARGS(init, c, b, N, Ns,direction,offset,y);


		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	}
#if 1
	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N >= 4)
		{
		Ns = 4;

		SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };
		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		 1, NULL, gdim, ldim,
		0, NULL, NULL));

		}
}
		
	
		clEnqueueCopyBuffer(queue,d,b,
			0,
			0,
			sizeof(float)*N*N*2,0,NULL,NULL);


	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N>=16)
		{
		Ns = 16;

		SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		1, NULL, gdim, ldim,
		0, NULL, NULL));

		}

}

		clEnqueueCopyBuffer(queue,d,b,
		0,
		0,
		sizeof(float)*N*N*2,0,NULL,NULL);

	if(N >=64) 

#endif		

		for(Ns=64; Ns<N; Ns<<=2)
		{

	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;

		SET_6_KERNEL_ARGS(fft1D, b, d, N, Ns,direction,offset);
		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, fft1D,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		//VecCopy(c,b,N,offset_line,vec_copy,queue);




		}

		clEnqueueCopyBuffer(queue,d,b,
				0,
				0,
				sizeof(float)*N*N*2,0,NULL,NULL);
		}
#endif

	//CALL_CL_GUARDED(clFinish, (queue));
	if(direction == 1)
		mat__trans(b,c,N,mat_trans,queue,0,1,1,1);
	else 
		mat__trans(b,c,N,mat_trans,queue,-1,1,1,1);
	
}
void fft_1D_big(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init_big, cl_kernel clean,cl_kernel mat_trans,cl_command_queue queue,int direction,int offset_line)
{
	//handle complex-to-complex fft, accutal size = 2 * N

	//size_t ldim[] = { 128 };
	//size_t gdim[] = { (N /ldim[0])/2};
	int Ns = 1;
	int y =0;
	SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y);


	size_t ldim[] = { 16 };
	size_t gdim[] = { N/4 };

	CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init_big,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	if (N ==64 )
		return;
	else
	if( N == 256 || N == 1024)
	{
		cl_long offset = offset_line * N;
		SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y);
		ldim[0] =4;

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, clean,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
		if(N == 1024)
		{
			int option =0;
			float k =0;
			int n = 16;			
			SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset);

			size_t ldim[] = { 16, 16 };
			size_t gdim[] = { 16, 64 };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans,
				2, NULL, gdim, ldim,
				0, NULL, NULL));

		}
		else if(N ==256)
		{
			int option =0;
			float k =0;
			int n = 4;			
			SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset);

			size_t ldim[] = { 4, 4 };
			size_t gdim[] = { 4, 64 };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans,
				2, NULL, gdim, ldim,
				0, NULL, NULL));

		}	
		
	}

	else
	{
		printf("FFT not implemented for this size!!!\n");

		return;
	}	
}