Example #1
0
void recursive_reduce(cl_command_queue &queue,
        cl_context &context,
        cl_kernel &reduce_kern,
        cl_mem &in, 
        cl_mem &out, 
        int len){
  /*
    this recursion function calls the kernel multiple times. at each 
    recursion, the total work size is shrinked by a factor
    of the work size. at that recursion level, the total work size is that
    shrinked  size. it does so until the local work size is equivalent
    to the global work size and the first value of the output array is
    the value of the reduced sum.
  */

  //the maximum local work size is 512.
  size_t global_work_size[1] = {len};
  size_t local_work_size[1] = {512};

  // set local work size to global work size if the global 
  // size is lesser than 512
  if(len < 512){
    local_work_size[0] = len;
  }
  
  int left_over = 0;
  cl_int err;
  
  //determine the reduced global work size
  left_over = global_work_size[0] / local_work_size[0];

  err = clSetKernelArg(reduce_kern, 0, sizeof(cl_mem), &in);
  CHK_ERR(err);
  err = clSetKernelArg(reduce_kern, 1, sizeof(cl_mem), &out);
  CHK_ERR(err);
  err = clSetKernelArg(reduce_kern, 2, sizeof(int)*local_work_size[0], NULL);
  CHK_ERR(err);
  err = clSetKernelArg(reduce_kern, 3, sizeof(int), &len);
  CHK_ERR(err);

  // call kernel
  err = clEnqueueNDRangeKernel(queue,
                  reduce_kern,
                  1,
                  0,
                  global_work_size,
                  local_work_size,
                  0,
                  NULL,
                  NULL
                  );
  CHK_ERR(err);

  // call recursion is still needs to be reduced
  if(left_over > 1){
    recursive_reduce(queue,context,reduce_kern,out,out,left_over);
  }
}
void divide_and_reduce(int iterations, int num_threads, void*(*thread_func)(void*), void* data, void*(*reduce_func)(void*,void*), int result_element_size, void* result_out)
{
    int step = iterations / num_threads;
    int over = iterations % num_threads;
    pthread_t threads[num_threads];
    thread_arg thread_args[num_threads];
    char results[result_element_size*num_threads];
    int n=0;
    for(int i=0; i<iterations; i+=step, n++)
    {
        thread_args[n].tid = n;
        thread_args[n].data = data;
        thread_args[n].result = (void*)(results+n*result_element_size);
        thread_args[n].start = i;
        if(over-->0) i++;
        thread_args[n].end = i + step > iterations ? iterations : i + step;
        int error = pthread_create(&threads[n],0,thread_func,(void*)&thread_args[n]);
        if(error)
        {
            printf("[ERROR %d] Couldn't create thread with tid=%d,start=%d,end=%d\n",error,thread_args[n].tid,thread_args[n].start,thread_args[n].end);
            exit(EXIT_FAILURE);
        }
    }
    for(int i=0; i<n; i++)
    {
        int error = pthread_join(threads[i],0);
        if(error)
        {
            printf("[ERROR %d] Couldn't join thread with tid=%d\n",error,i);
            exit(EXIT_FAILURE);
        }
    }
    if(result_out)
    {
        memcpy(result_out,recursive_reduce(reduce_func,(void*)results,0,n-1,result_element_size),result_element_size);
    }
}
Example #3
0
int main(int argc, char *argv[])
{
  std::string reduce_kernel_str;
  
  std::string reduce_name_str = 
    std::string("reduce");
  std::string reduce_kernel_file = 
    std::string("reduce.cl");

  cl_vars_t cv; 
  cl_kernel reduce;
  
  readFile(reduce_kernel_file,
	   reduce_kernel_str);
  
  initialize_ocl(cv);
  
  compile_ocl_program(reduce, cv, reduce_kernel_str.c_str(),
		      reduce_name_str.c_str());

  int *h_A, *h_Y;
  cl_mem g_Out, g_In;
  int n = (1<<24);

  int c;
  /* how long do you want your arrays? */
  while((c = getopt(argc, argv, "n:"))!=-1){
    switch(c){
      case 'n':
        n = atoi(optarg);
        break;
    }
  }
  
  if(n==0)
    return 0;

  // pad the array is not power of 2
  int padded_size = 1;
  
  while(padded_size < n){
    padded_size <<= 1;
  } 

  h_A = new int[padded_size];
  h_Y = new int[padded_size];

  for(int i = 0; i < n; i++){
    h_A[i] = 1;
    h_Y[i] = 0;
  }

  for (int i = n; i < padded_size; ++i)
  {
    h_A[i] = 0;
    h_Y[i] = 0;
  }

  cl_int err = CL_SUCCESS;
  g_Out = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
			 sizeof(int)*n,NULL,&err);
  CHK_ERR(err);  
  g_In = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,
			sizeof(int)*n,NULL,&err);
  CHK_ERR(err);

  //copy data from host CPU to GPU
  err = clEnqueueWriteBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n,
			     h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  err = clEnqueueWriteBuffer(cv.commands, g_In, true, 0, sizeof(int)*n,
			     h_A, 0, NULL, NULL);
  CHK_ERR(err);
 
  size_t local_work_size[1] = {512};
  size_t global_work_size[1] = {padded_size};

  err = clSetKernelArg(reduce, 0, sizeof(cl_mem), &g_In);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 1, sizeof(cl_mem), &g_Out);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 2, sizeof(int)*512, NULL);
  CHK_ERR(err);
  err = clSetKernelArg(reduce, 3, sizeof(int), &padded_size);
  CHK_ERR(err);
  
  double t0 = timestamp();

  // calls the recursion function
  recursive_reduce(cv.commands, cv.context, reduce, g_In, g_Out, padded_size);
  t0 = timestamp()-t0;
  
  //read result of GPU on host CPU
  err = clEnqueueReadBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);
  
  int sum=0.0f;
  for(int i = 0; i < n; i++)
  {
    sum += h_A[i];
  }

  if(sum!=h_Y[0])
  {
    printf("WRONG: CPU sum = %d, GPU sum = %d\n", sum, h_Y[0]);
    printf("WRONG: difference = %d\n", sum-h_Y[0]);
    printf("Other parts = %d, %d, %d, %d\n", h_Y[1], h_Y[2], h_Y[3], h_Y[4]);
    int z=0;
    while(h_Y[z] == h_Y[z+1]){
	z++;
    }
    printf("red: %d\n", z);
  }
  else
  {
    printf("CORRECT: %d,%g\n",n,t0);
  }

  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_Y;
  
  clReleaseMemObject(g_Out); 
  clReleaseMemObject(g_In);
  
  return 0;
}
void* recursive_reduce(void*(*reduce_func)(void*,void*),void* data, int n1, int n2, int element_size)
{
    if(n2 <= n1) return (void*)((char*)data+n1*element_size);
    else return reduce_func(recursive_reduce(reduce_func,data,n1,n1+(n2-n1)/2,element_size),recursive_reduce(reduce_func,data,1+n1+(n2-n1)/2,n2,element_size));
}