void recursive_reduce(cl_command_queue &queue, cl_context &context, cl_kernel &reduce_kern, cl_mem &in, cl_mem &out, int len){ /* this recursion function calls the kernel multiple times. at each recursion, the total work size is shrinked by a factor of the work size. at that recursion level, the total work size is that shrinked size. it does so until the local work size is equivalent to the global work size and the first value of the output array is the value of the reduced sum. */ //the maximum local work size is 512. size_t global_work_size[1] = {len}; size_t local_work_size[1] = {512}; // set local work size to global work size if the global // size is lesser than 512 if(len < 512){ local_work_size[0] = len; } int left_over = 0; cl_int err; //determine the reduced global work size left_over = global_work_size[0] / local_work_size[0]; err = clSetKernelArg(reduce_kern, 0, sizeof(cl_mem), &in); CHK_ERR(err); err = clSetKernelArg(reduce_kern, 1, sizeof(cl_mem), &out); CHK_ERR(err); err = clSetKernelArg(reduce_kern, 2, sizeof(int)*local_work_size[0], NULL); CHK_ERR(err); err = clSetKernelArg(reduce_kern, 3, sizeof(int), &len); CHK_ERR(err); // call kernel err = clEnqueueNDRangeKernel(queue, reduce_kern, 1, 0, global_work_size, local_work_size, 0, NULL, NULL ); CHK_ERR(err); // call recursion is still needs to be reduced if(left_over > 1){ recursive_reduce(queue,context,reduce_kern,out,out,left_over); } }
void divide_and_reduce(int iterations, int num_threads, void*(*thread_func)(void*), void* data, void*(*reduce_func)(void*,void*), int result_element_size, void* result_out) { int step = iterations / num_threads; int over = iterations % num_threads; pthread_t threads[num_threads]; thread_arg thread_args[num_threads]; char results[result_element_size*num_threads]; int n=0; for(int i=0; i<iterations; i+=step, n++) { thread_args[n].tid = n; thread_args[n].data = data; thread_args[n].result = (void*)(results+n*result_element_size); thread_args[n].start = i; if(over-->0) i++; thread_args[n].end = i + step > iterations ? iterations : i + step; int error = pthread_create(&threads[n],0,thread_func,(void*)&thread_args[n]); if(error) { printf("[ERROR %d] Couldn't create thread with tid=%d,start=%d,end=%d\n",error,thread_args[n].tid,thread_args[n].start,thread_args[n].end); exit(EXIT_FAILURE); } } for(int i=0; i<n; i++) { int error = pthread_join(threads[i],0); if(error) { printf("[ERROR %d] Couldn't join thread with tid=%d\n",error,i); exit(EXIT_FAILURE); } } if(result_out) { memcpy(result_out,recursive_reduce(reduce_func,(void*)results,0,n-1,result_element_size),result_element_size); } }
int main(int argc, char *argv[]) { std::string reduce_kernel_str; std::string reduce_name_str = std::string("reduce"); std::string reduce_kernel_file = std::string("reduce.cl"); cl_vars_t cv; cl_kernel reduce; readFile(reduce_kernel_file, reduce_kernel_str); initialize_ocl(cv); compile_ocl_program(reduce, cv, reduce_kernel_str.c_str(), reduce_name_str.c_str()); int *h_A, *h_Y; cl_mem g_Out, g_In; int n = (1<<24); int c; /* how long do you want your arrays? */ while((c = getopt(argc, argv, "n:"))!=-1){ switch(c){ case 'n': n = atoi(optarg); break; } } if(n==0) return 0; // pad the array is not power of 2 int padded_size = 1; while(padded_size < n){ padded_size <<= 1; } h_A = new int[padded_size]; h_Y = new int[padded_size]; for(int i = 0; i < n; i++){ h_A[i] = 1; h_Y[i] = 0; } for (int i = n; i < padded_size; ++i) { h_A[i] = 0; h_Y[i] = 0; } cl_int err = CL_SUCCESS; g_Out = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(int)*n,NULL,&err); CHK_ERR(err); g_In = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(int)*n,NULL,&err); CHK_ERR(err); //copy data from host CPU to GPU err = clEnqueueWriteBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_In, true, 0, sizeof(int)*n, h_A, 0, NULL, NULL); CHK_ERR(err); size_t local_work_size[1] = {512}; size_t global_work_size[1] = {padded_size}; err = clSetKernelArg(reduce, 0, sizeof(cl_mem), &g_In); CHK_ERR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), &g_Out); CHK_ERR(err); err = clSetKernelArg(reduce, 2, sizeof(int)*512, NULL); CHK_ERR(err); err = clSetKernelArg(reduce, 3, sizeof(int), &padded_size); CHK_ERR(err); double t0 = timestamp(); // calls the recursion function recursive_reduce(cv.commands, cv.context, reduce, g_In, g_Out, padded_size); t0 = timestamp()-t0; //read result of GPU on host CPU err = clEnqueueReadBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); int sum=0.0f; for(int i = 0; i < n; i++) { sum += h_A[i]; } if(sum!=h_Y[0]) { printf("WRONG: CPU sum = %d, GPU sum = %d\n", sum, h_Y[0]); printf("WRONG: difference = %d\n", sum-h_Y[0]); printf("Other parts = %d, %d, %d, %d\n", h_Y[1], h_Y[2], h_Y[3], h_Y[4]); int z=0; while(h_Y[z] == h_Y[z+1]){ z++; } printf("red: %d\n", z); } else { printf("CORRECT: %d,%g\n",n,t0); } uninitialize_ocl(cv); delete [] h_A; delete [] h_Y; clReleaseMemObject(g_Out); clReleaseMemObject(g_In); return 0; }
void* recursive_reduce(void*(*reduce_func)(void*,void*),void* data, int n1, int n2, int element_size) { if(n2 <= n1) return (void*)((char*)data+n1*element_size); else return reduce_func(recursive_reduce(reduce_func,data,n1,n1+(n2-n1)/2,element_size),recursive_reduce(reduce_func,data,1+n1+(n2-n1)/2,n2,element_size)); }