// Main part of the below code is originated from Lode Vandevenne's code. // Please refer to http://lodev.org/cgtutor/juliamandelbrot.html void julia(int w, int h) { // each iteration, it calculates: new = old*old + c, // where c is a constant and old starts at current pixel // real and imaginary part of the constant c // determinate shape of the Julia Set double cRe, cIm; // you can change these to zoom and change position double zoom = 1, moveX = 0, moveY = 0; // after how much iterations the function should stop int maxIterations = COUNT_MAX; char *output_filename = "julia.ppm"; FILE *output_unit; double wtime; // pick some values for the constant c // this determines the shape of the Julia Set cRe = -0.7; cIm = 0.27015; /* Host data structures */ cl_platform_id *platforms; cl_uint num_platforms; cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT; cl_device_id dev; cl_context context; // NOTE : You might have multiple cmd_queue but whatever cl_command_queue cmd_queue; cl_program program; cl_kernel kernel; // TODO : define your variables cl_mem R,G,B; cl_int err; cl_uint num_dev = 0; // cl_event ev_bp; int i; // TODO : /* // loop through every pixel for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) */ size_t lws[2]={16,16}; size_t gws[2]={h/16,w/16}; printf( " Parallel OpenCL version\n" ); printf( "\n" ); printf( " Create an ASCII PPM image of the Julia set.\n" ); printf( "\n" ); printf( " An ASCII PPM image of the set is created using\n" ); printf( " W = %d pixels in the X direction and\n", w ); printf( " H = %d pixels in the Y direction.\n", h ); timer_init(); timer_start(0); // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if(num_platforms == 0) { ERROR("No OpenCl platform"); } printf("Number of platforms: %u\n",num_platforms); platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms,platforms,NULL); CHECK_ERROR(err); //Device for(i=0;i<num_platforms;i++) { err = clGetDeviceIDs(platforms[i],dev_type,1,&dev,&num_dev); if(err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); if(num_dev == 1) break; } if(num_dev<1) { ERROR("No device"); } // Context context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK_ERROR(err); printf("-4"); // Command queue cmd_queue = clCreateCommandQueue(context, dev, 0, &err); CHECK_ERROR(err); printf("-3"); // Create a program // TODO : Get source code in your favor char * source_code=get_source_code("julia.cl"); printf("-2"); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, NULL, &err); CHECK_ERROR(err); // Callback data for clBuildProgram /* ev_bp=clCreateUserEvent(context,&err); CHECK_ERROR(err); bp_data_t bp_data; bp_data.dev=dev; bp_data.event=&ev_bp; */ printf("-1"); // Build the program. err = clBuildProgram(program, 1, &dev, NULL, NULL, NULL); if (err != CL_SUCCESS) { // Print the build log. size_t log_size; clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char *log = (char *)malloc(log_size + 1); clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); fprintf(stderr,"\n"); fprintf(stderr,"---------- BUILD LOG ----------\n"); fprintf(stderr,"%s\n",log); fprintf(stderr,"-------------------------------\n"); free(log); CHECK_ERROR(err); } printf("0"); // Buffers // TODO: make and buffers int (*r) = (int (*))calloc(w * h, sizeof(int)); int (*g) = (int (*))calloc(w * h, sizeof(int)); int (*b) = (int (*))calloc(w * h, sizeof(int)); printf("1"); R=clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * w*h, NULL, &err); CHECK_ERROR(err); err=clEnqueueWriteBuffer(cmd_queue, R,CL_FALSE,0,w*h*sizeof(int),r,0,NULL,NULL); CHECK_ERROR(err); G=clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * w*h, NULL, &err); CHECK_ERROR(err); err=clEnqueueWriteBuffer(cmd_queue, G,CL_FALSE,0,w*h*sizeof(int),g,0,NULL,NULL); CHECK_ERROR(err); B=clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * w*h, NULL, &err); CHECK_ERROR(err); err=clEnqueueWriteBuffer(cmd_queue, B,CL_FALSE,0,w*h*sizeof(int),b,0,NULL,NULL); CHECK_ERROR(err); printf("2"); kernel=clCreateKernel(program,"julia",&err); CHECK_ERROR(err); printf("3"); err=clSetKernelArg(kernel,0,sizeof(int),&w); CHECK_ERROR(err); err=clSetKernelArg(kernel,1,sizeof(int),&h); CHECK_ERROR(err); err=clSetKernelArg(kernel,2,sizeof(int),&cRe); CHECK_ERROR(err); err=clSetKernelArg(kernel,3,sizeof(int),&cIm); CHECK_ERROR(err); err=clSetKernelArg(kernel,4,sizeof(cl_mem),&R); CHECK_ERROR(err); err=clSetKernelArg(kernel,5,sizeof(cl_mem),&G); CHECK_ERROR(err); err=clSetKernelArg(kernel,6,sizeof(cl_mem),&B); CHECK_ERROR(err); err=clSetKernelArg(kernel,7,sizeof(int),&zoom); CHECK_ERROR(err); err=clSetKernelArg(kernel,8,sizeof(int),&moveX); CHECK_ERROR(err); err=clSetKernelArg(kernel,9,sizeof(int),&moveY); CHECK_ERROR(err); err=clSetKernelArg(kernel,10,sizeof(int),&maxIterations); CHECK_ERROR(err); printf("4"); // Enqueue the kernel. err=clEnqueueNDRangeKernel(cmd_queue,kernel,1,NULL,gws,lws,0,NULL,NULL); CHECK_ERROR(err); printf("5"); // Read the result. err = clEnqueueReadBuffer(cmd_queue, R, CL_TRUE, 0, sizeof(int) * w*h, r, 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueReadBuffer(cmd_queue, G, CL_TRUE, 0, sizeof(int) * w*h, g, 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueReadBuffer(cmd_queue, B, CL_TRUE, 0, sizeof(int) * w*h, b, 0, NULL, NULL); CHECK_ERROR(err); timer_stop(0); wtime = timer_read(0); printf( "\n" ); printf( " Time = %lf seconds.\n", wtime ); // Write data to an ASCII PPM file. output_unit = fopen( output_filename, "wt" ); fprintf( output_unit, "P3\n" ); fprintf( output_unit, "%d %d\n", h, w ); fprintf( output_unit, "%d\n", 255 ); for ( int i = 0; i < h; i++ ) { for ( int jlo = 0; jlo < w; jlo = jlo + 4 ) { int jhi = MIN( jlo + 4, w ); for ( int j = jlo; j < jhi; j++ ) { fprintf( output_unit, " %d %d %d", r[i*w+j], g[i*w+j], b[i*w+j] ); } fprintf( output_unit, "\n" ); } } fclose( output_unit ); printf( "\n" ); printf( " Graphics data written to \"%s\".\n\n", output_filename ); // Release //clReleaseEvent(ev_bp); clReleaseMemObject(R); clReleaseMemObject(G); clReleaseMemObject(B); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(platforms); // Terminate. free(r); free(g); free(b); }
void mat_mul_opencl_printf(float *M_A, float *M_B, float *M_C, size_t ROW_A, size_t COL_A, size_t COL_B) { cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT; cl_device_id dev; cl_context context; cl_command_queue cmd_queue; cl_program program; cl_kernel kernel; cl_mem mem_A, mem_B, mem_C; cl_int err; cl_uint num_platforms; cl_uint num_dev = 0; int i; // Get the device type to use from the environmental variable. char *dtype = getenv("CL_DEV_TYPE"); if (dtype) { if (strcasecmp(dtype, "cpu") == 0) { dev_type = CL_DEVICE_TYPE_CPU; } else if (strcasecmp(dtype, "gpu") == 0) { dev_type = CL_DEVICE_TYPE_GPU; } } // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); if (num_dev == 1) break; } if (num_dev < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } print_device_name(dev); // Context context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queue = clCreateCommandQueue(context, dev, 0, &err); CHECK_ERROR(err); // Create a program. size_t source_len; char *source_code = get_source_code("./kernel_printf.cl", &source_len); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_len, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu", ROW_A, COL_A, COL_B); err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, dev); CHECK_ERROR(err); } // Kernel kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); // Buffers mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_A, M_A, &err); CHECK_ERROR(err); mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * COL_A * COL_B, M_B, &err); CHECK_ERROR(err); mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_B, M_C, &err); CHECK_ERROR(err); // Set the arguments. err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C); CHECK_ERROR(err); // Enqueue the kernel. size_t lws[2] = {16, 16}; size_t gws[2]; gws[1] = (size_t)ceil((double)ROW_A / lws[1]) * lws[1]; gws[0] = (size_t)ceil((double)COL_B / lws[0]) * lws[0]; timer_start(1); err = clEnqueueNDRangeKernel(cmd_queue, kernel, 2, NULL, gws, lws, 0, NULL, NULL); CHECK_ERROR(err); // Read the result. err = clEnqueueReadBuffer(cmd_queue, mem_C, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B, M_C, 0, NULL, NULL); CHECK_ERROR(err); timer_stop(1); printf("Kernel time : %f sec\n", timer_read(1)); // Release clReleaseMemObject(mem_A); clReleaseMemObject(mem_B); clReleaseMemObject(mem_C); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(platform); }
void kmeans(int iteration_n, int class_n, int data_n, Point* centroids, Point* data, int* partitioned) { cl_int err; cl_platform_id platform; err = clGetPlatformIDs(1, &platform, NULL); CHECK_ERROR(err); cl_device_id device; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); CHECK_ERROR(err); cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); CHECK_ERROR(err); cl_command_queue queueIO; queueIO = clCreateCommandQueue(context, device, 0, &err); CHECK_ERROR(err); cl_command_queue queueSM; queueSM = clCreateCommandQueue(context, device, 0, &err); CHECK_ERROR(err); const char *source_code; size_t source_size; source_code = get_source_code("kernel.cl", &source_size); cl_program program; program = clCreateProgramWithSource(context, 1, &source_code, &source_size, &err); CHECK_ERROR(err); err = clBuildProgram(program, 1, &device, "", NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t log_size; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); log = (char*)malloc(log_size + 1); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); log[log_size] = 0; printf("Compile error:\n%s\n", log); free(log); } CHECK_ERROR(err); cl_kernel kernel; kernel = clCreateKernel(program, "classify", &err); CHECK_ERROR(err); size_t global_size, local_size = 256; global_size = (data_n + local_size - 1) / local_size * local_size; int n = (data_n + global_size - 1) / global_size * global_size; float *D = (float*)malloc(sizeof(float) * 2 * data_n); float *C = (float*)malloc(sizeof(float) * 2 * class_n); cl_uchar *E = (cl_uchar*)malloc(sizeof(cl_uchar) * n); int *F = (int*)malloc(sizeof(int) * class_n); for (int i = 0; i < data_n; ++i) { D[i * 2] = data[i].x; D[i * 2 + 1] = data[i].y; } for (int i = 0; i < class_n; ++i) { C[i * 2] = centroids[i].x; C[i * 2 + 1] = centroids[i].y; } cl_mem memD; memD = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float2) * n, NULL, &err); CHECK_ERROR(err); cl_mem memC; memC = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float2) * class_n, NULL, &err); CHECK_ERROR(err); cl_mem memE; memE = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * n, NULL, &err); CHECK_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memD); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memC); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memE); CHECK_ERROR(err); err = clSetKernelArg(kernel, 3, sizeof(cl_uchar), &class_n); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queueIO, memD, CL_TRUE, 0, sizeof(cl_float2) * data_n, D, 0, NULL, NULL); CHECK_ERROR(err); for (int iter = 0; iter < iteration_n; ++iter) { err = clEnqueueWriteBuffer(queueIO, memC, CL_TRUE, 0, sizeof(cl_float2) * class_n, C, 0, NULL, NULL); CHECK_ERROR(err); memset(C, 0, sizeof(cl_float2) * class_n); memset(F, 0, sizeof(int) * class_n); int xSM = -1, xHost = -1; cl_uint num_events = 0; cl_event event[2]; for (int i = 0; i < n; i += global_size) { if (num_events > 0) { err = clWaitForEvents(num_events, event); CHECK_ERROR(err); } if (xSM != -1) { err = clEnqueueReadBuffer(queueIO, memE, CL_FALSE, sizeof(cl_uchar) * xSM, sizeof(cl_uchar) * global_size, &E[xSM], 0, NULL, &event[1]); CHECK_ERROR(err); xHost = xSM; } num_events = 0; size_t global_offset = i; err = clEnqueueNDRangeKernel(queueSM, kernel, 1, &global_offset, &global_size, &local_size, 0, NULL, &event[num_events++]); CHECK_ERROR(err); xSM = i; if (xHost != -1) { err = clWaitForEvents(1, &event[1]); CHECK_ERROR(err); for (size_t x = 0; x < global_size; ++x) { int idx = xHost + x; C[E[idx] * 2] += D[idx * 2]; C[E[idx] * 2 + 1] += D[idx * 2 + 1]; ++F[E[idx]]; } } } if (num_events > 0) { err = clWaitForEvents(num_events, event); CHECK_ERROR(err); } if (xSM != -1) { err = clEnqueueReadBuffer(queueIO, memE, CL_FALSE, sizeof(cl_uchar) * xSM, sizeof(cl_uchar) * global_size, &E[xSM], 0, NULL, &event[1]); CHECK_ERROR(err); xHost = xSM; } if (xHost != -1) { err = clWaitForEvents(1, &event[1]); CHECK_ERROR(err); for (size_t x = 0; x < global_size; ++x) { int idx = xHost + x; if (idx >= n) break; C[E[idx] * 2] += D[idx * 2]; C[E[idx] * 2 + 1] += D[idx * 2 + 1]; ++F[E[idx]]; } } for (int x = 0; x < class_n; ++x) { if (F[x] > 0) { C[x * 2] /= F[x]; C[x * 2 + 1] /= F[x]; } } } for (int i = 0; i < class_n; ++i) { centroids[i].x = C[i * 2]; centroids[i].y = C[i * 2 + 1]; } for (int i = 0; i < data_n; ++i) { partitioned[i] = E[i]; } free(D); free(C); free(E); free(F); clReleaseMemObject(memD); clReleaseMemObject(memC); clReleaseMemObject(memE); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queueIO); clReleaseCommandQueue(queueSM); clReleaseContext(context); }
void mat_mul_opencl_binary(float *M_A, float *M_B, float *M_C, size_t ROW_A, size_t COL_A, size_t COL_B) { cl_platform_id *platform; cl_device_type dev_type; cl_device_id dev; cl_context context; cl_command_queue cmd_queue; cl_program program; cl_kernel kernel; cl_mem mem_A, mem_B, mem_C; cl_event ev_kernel, ev_bp; cl_int err; cl_uint num_platforms; cl_uint num_dev = 0; int i; // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device dev_type = get_device_type(); for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); if (num_dev == 1) break; } if (num_dev < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } print_device_name(dev); free(platform); // Context context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK_ERROR(err); // Create a program. char *source_code = get_source_code("./kernel_2d.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, NULL, &err); free(source_code); CHECK_ERROR(err); // Callback data for clBuildProgram ev_bp = clCreateUserEvent(context, &err); CHECK_ERROR(err); bp_data_t bp_data; bp_data.dev = dev; bp_data.event = &ev_bp; // Build the program. char build_opts[200]; sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu", ROW_A, COL_A, COL_B); err = clBuildProgram(program, 1, &dev, build_opts, build_program_callback, &bp_data); CHECK_ERROR(err); // Command queue cmd_queue = clCreateCommandQueue(context, dev, CL_QUEUE_PROFILING_ENABLE, &err); CHECK_ERROR(err); // Buffers mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_A, M_A, &err); CHECK_ERROR(err); mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * COL_A * COL_B, NULL, &err); CHECK_ERROR(err); err = clEnqueueWriteBuffer(cmd_queue, mem_B, CL_FALSE, 0, sizeof(float) * COL_A * COL_B, M_B, 0, NULL, NULL); CHECK_ERROR(err) mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * ROW_A * COL_B, NULL, &err); CHECK_ERROR(err); // Index space (gws) and work-group size (lws) size_t lws[2] = {16, 16}; size_t gws[2]; gws[1] = (size_t)ceil((double)ROW_A / lws[1]) * lws[1]; gws[0] = (size_t)ceil((double)COL_B / lws[0]) * lws[0]; // Wait for the kernel creation. clWaitForEvents(1, bp_data.event); // Kernel kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); // Set the arguments. err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C); CHECK_ERROR(err); // Enqueue the kernel. err = clEnqueueNDRangeKernel(cmd_queue, kernel, 2, NULL, gws, lws, 0, NULL, &ev_kernel); CHECK_ERROR(err); // Read the result. err = clEnqueueReadBuffer(cmd_queue, mem_C, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B, M_C, 0, NULL, NULL); CHECK_ERROR(err); // Read the profiling info. cl_ulong start_time, end_time; err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_time, NULL); CHECK_ERROR(err); err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end_time, NULL); CHECK_ERROR(err); printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9); // Release clReleaseEvent(ev_kernel); clReleaseEvent(ev_bp); clReleaseMemObject(mem_A); clReleaseMemObject(mem_B); clReleaseMemObject(mem_C); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); }
void mat_mul(float *a, float *b, float *c, size_t *dim, size_t *global_size, size_t *local_size) { cl_int err; cl_platform_id platform; err = clGetPlatformIDs(1, &platform, NULL); CHECK_ERROR(err); cl_device_id device; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); CHECK_ERROR(err); cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); CHECK_ERROR(err); cl_command_queue queueIO; queueIO = clCreateCommandQueue(context, device, 0, &err); CHECK_ERROR(err); cl_command_queue queueSM; queueSM = clCreateCommandQueue(context, device, 0, &err); CHECK_ERROR(err); const char *source_code; size_t source_size; source_code = get_source_code("kernel.cl", &source_size); cl_program program; program = clCreateProgramWithSource(context, 1, &source_code, &source_size, &err); CHECK_ERROR(err); err = clBuildProgram(program, 1, &device, "", NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t log_size; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); log = (char*)malloc(log_size + 1); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); log[log_size] = 0; printf("Compile error:\n%s\n", log); free(log); } CHECK_ERROR(err); cl_kernel kernel; kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); cl_mem memA[2]; memA[0] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * global_size[1] * global_size[2], NULL, &err); CHECK_ERROR(err); memA[1] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * global_size[1] * global_size[2], NULL, &err); CHECK_ERROR(err); cl_mem memB[2]; memB[0] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * global_size[2] * global_size[0], NULL, &err); CHECK_ERROR(err); memB[1] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * global_size[2] * global_size[0], NULL, &err); CHECK_ERROR(err); cl_mem memC[2]; memC[0] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * global_size[1] * global_size[0], NULL, &err); CHECK_ERROR(err); memC[1] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * global_size[1] * global_size[0], NULL, &err); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[0]); CHECK_ERROR(err); err = clSetKernelArg(kernel, 3, sizeof(cl_ulong), &global_size[2]); CHECK_ERROR(err); err = clSetKernelArg(kernel, 4, sizeof(cl_ulong), &global_size[0]); CHECK_ERROR(err); float *bufA, *bufB, *bufC; bufA = (float*)malloc(sizeof(float) * global_size[1] * global_size[2]); bufB = (float*)malloc(sizeof(float) * global_size[2] * global_size[0]); bufC = (float*)malloc(sizeof(float) * global_size[1] * global_size[0]); int swA = 0, swB = 0, swC = 0; cl_uint num_events = 0; cl_event event[3]; int xIO, yIO, xSM, ySM, xHost, yHost; xIO = yIO = xSM = ySM = xHost = yHost = -1; for (int i = 0; i < dim[1]; i += global_size[1]) { for (int k = 0; k < dim[2]; k += global_size[2]) { for (int j = 0; j < dim[0]; j += global_size[0]) { if (num_events > 0) { err = clWaitForEvents(num_events, event); CHECK_ERROR(err); } if (xSM != -1) { err = clEnqueueReadBuffer(queueIO, memC[swC], CL_FALSE, 0, sizeof(float) * global_size[0] * global_size[1], bufC, 0, NULL, &event[1]); CHECK_ERROR(err); swC ^= 1; err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[swC]); CHECK_ERROR(err); xHost = xSM; yHost = ySM; } num_events = 0; if (xIO != -1) { err = clEnqueueNDRangeKernel(queueSM, kernel, 2, NULL, global_size, local_size, 0, NULL, &event[num_events++]); CHECK_ERROR(err); xSM = xIO; ySM = yIO; } if (xHost != -1) { err = clWaitForEvents(1, &event[1]); CHECK_ERROR(err); } if (j == 0) { in2buf(a, bufA, global_size[1], global_size[2], dim[2], i, k); err = clEnqueueWriteBuffer(queueIO, memA[swA], CL_FALSE, 0, sizeof(float) * global_size[1] * global_size[2], bufA, 0, NULL, &event[num_events++]); CHECK_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memA[swA]); CHECK_ERROR(err); swA ^= 1; } in2buf(b, bufB, global_size[2], global_size[0], dim[0], k, j); err = clEnqueueWriteBuffer(queueIO, memB[swB], CL_FALSE, 0, sizeof(float) * global_size[2] * global_size[0], bufB, 0, NULL, &event[num_events++]); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memB[swB]); CHECK_ERROR(err); swB ^= 1; xIO = i; yIO = j; if (xHost != -1) { for (int x = 0; x < global_size[1]; ++x) { for (int y = 0; y < global_size[0]; ++y) { c[(xHost + x) * dim[0] + (yHost + y)] += bufC[x * global_size[0] + y]; } } } } } } if (num_events > 0) { err = clWaitForEvents(num_events, event); CHECK_ERROR(err); } if (xSM != -1) { err = clEnqueueReadBuffer(queueIO, memC[swC], CL_FALSE, 0, sizeof(float) * global_size[0] * global_size[1], bufC, 0, NULL, &event[1]); CHECK_ERROR(err); swC ^= 1; err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[swC]); CHECK_ERROR(err); xHost = xSM; yHost = ySM; } num_events = 0; if (xIO != -1) { err = clEnqueueNDRangeKernel(queueSM, kernel, 2, NULL, global_size, local_size, 0, NULL, &event[num_events++]); CHECK_ERROR(err); xSM = xIO; ySM = yIO; } if (xHost != -1) { err = clWaitForEvents(1, &event[1]); CHECK_ERROR(err); for (int x = 0; x < global_size[1]; ++x) { for (int y = 0; y < global_size[0]; ++y) { c[(xHost + x) * dim[0] + (yHost + y)] += bufC[x * global_size[0] + y]; } } } if (num_events > 0) { err = clWaitForEvents(num_events, event); CHECK_ERROR(err); } if (xSM != -1) { err = clEnqueueReadBuffer(queueIO, memC[swC], CL_FALSE, 0, sizeof(float) * global_size[0] * global_size[1], bufC, 0, NULL, &event[1]); CHECK_ERROR(err); swC ^= 1; err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC[swC]); CHECK_ERROR(err); xHost = xSM; yHost = ySM; } if (xHost != -1) { err = clWaitForEvents(1, &event[1]); CHECK_ERROR(err); for (int x = 0; x < global_size[1]; ++x) { for (int y = 0; y < global_size[0]; ++y) { c[(xHost + x) * dim[0] + (yHost + y)] += bufC[x * global_size[0] + y]; } } } free(bufA); free(bufB); free(bufC); clReleaseMemObject(memA[0]); clReleaseMemObject(memA[1]); clReleaseMemObject(memB[0]); clReleaseMemObject(memB[1]); clReleaseMemObject(memC[0]); clReleaseMemObject(memC[1]); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queueIO); clReleaseCommandQueue(queueSM); clReleaseContext(context); }
void mat_mul_opencl(float *A, float *B, float *C, int ROW_A, int COL_A, int COL_B) { cl_int err; cl_platform_id platform; err = clGetPlatformIDs(1, &platform, NULL); CHECK_ERROR(err); cl_device_id device; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); CHECK_ERROR(err); cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); CHECK_ERROR(err); cl_command_queue queue; queue = clCreateCommandQueue(context, device, 0, &err); CHECK_ERROR(err); const char *source_code; size_t source_size; source_code = get_source_code("kernel.cl", &source_size); cl_program program; program = clCreateProgramWithSource(context, 1, &source_code, &source_size, &err); CHECK_ERROR(err); err = clBuildProgram(program, 1, &device, "", NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t log_size; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); log = (char*)malloc(log_size + 1); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); log[log_size] = 0; printf("Compile error:\n%s\n", log); free(log); } CHECK_ERROR(err); cl_kernel kernel; kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); cl_mem memA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_A, A, &err); CHECK_ERROR(err); cl_mem memB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * COL_A * COL_B, B, &err); CHECK_ERROR(err); cl_mem memC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * ROW_A * COL_B, NULL, &err); CHECK_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memA); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memB); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memC); CHECK_ERROR(err); err = clSetKernelArg(kernel, 3, sizeof(cl_int), &ROW_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 4, sizeof(cl_int), &COL_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 5, sizeof(cl_int), &COL_B); CHECK_ERROR(err); size_t global_size[2] = {COL_B, ROW_A}; size_t local_size[2] = {16, 16}; for (int i = 0; i < 2; ++i) global_size[i] = (global_size[i] + local_size[i] - 1) / local_size[i] * local_size[i]; err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueReadBuffer(queue, memC, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B, C, 0, NULL, NULL); CHECK_ERROR(err); clReleaseMemObject(memA); clReleaseMemObject(memB); clReleaseMemObject(memC); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); }
int main(int argc, char **argv) { FILE *file; char instruction; char* filename; struct vector instruction_stream; if(argc != 2) { /* wrong number of args */ fprintf(stderr, "Error: exactly 1 HQ9+ source file as arg required\n"); exit(EXIT_FAILURE); } /* open file */ filename = argv[1]; file = fopen(filename, "r"); if (file == NULL) { /* could not open file */ perror("Error opening file"); exit(EXIT_FAILURE); } /*** prologue ***/ vector_create(&instruction_stream, 100); char prologue [] = { 0x55, // push %rbp 0x48, 0x89, 0xE5, // mov %rsp, %rbp // backup %r12 (callee saved register) 0x41, 0x54, // pushq %r12 // store %rdi content (putchar) in %r12 as callee saved 0x49, 0x89, 0xFC, // movq %rdi, %r12 // push accumulator on stack 0x6a, 0x00, // pushq $0 }; vector_push(&instruction_stream, prologue, sizeof(prologue)); int stack_offset = -0x10; // offset from %rbp int offset_accumulator = stack_offset; // accumulator address: -0x10(%rbp) // hello world write_to_stack(&instruction_stream, "Hello World\n", &stack_offset); int offset_hello_world = stack_offset; // source code char* source_code = get_source_code(filename); write_to_stack(&instruction_stream, source_code, &stack_offset); free(source_code); int offset_source = stack_offset; // lyrics char* lyrics = get_lyrics(99); write_to_stack(&instruction_stream, lyrics, &stack_offset); free(lyrics); int offset_bottles = stack_offset; // everything after accumulator is text bytes int text_bytes_on_stack = -(stack_offset - offset_accumulator); /*** parse file ***/ while((instruction = fgetc(file)) != EOF) { switch (instruction) { case 'H': { // access single chars of int char *hw = (char*) &offset_hello_world; char opcodes [] = { 0xB0, 00, // movb $0, %al 0x48, 0x8D, 0xBD, hw[0], hw[1], hw[2], hw[3], // leaq -0x<offset>(%rbp),%rdi 0x41, 0xFF, 0xD4 // callq *%r12 }; vector_push(&instruction_stream, opcodes, sizeof(opcodes)); } break; case 'Q': { // access single chars of int char *s = (char*) &offset_source; char opcodes [] = { 0xB0, 00, // movb $0, %al 0x48, 0x8D, 0xBD, s[0], s[1], s[2], s[3], // leaq -0x<offset>(%rbp),%rdi 0x41, 0xFF, 0xD4 // callq *%r12 }; vector_push(&instruction_stream, opcodes, sizeof(opcodes)); } break; case '9': { // access single chars of int char *b = (char*) &offset_bottles; char opcodes [] = { 0xB0, 00, // movb $0, %al 0x48, 0x8D, 0xBD, b[0], b[1], b[2], b[3], // leaq -0x<offset>(%rbp),%rdi // 0xBF, 0x39, 0x00, 0x00, 0x00, // mov $0x39, %edi 0x41, 0xFF, 0xD4 // callq *%r12 }; vector_push(&instruction_stream, opcodes, sizeof(opcodes)); } break; case '+': { char *acc = (char*) &offset_accumulator; char opcodes [] = { // increment the accumulator // TODO from variable instead of constant offset 0x48, 0xFF, 0x45, 0xF0, // incq -0x10(%rbp) }; vector_push(&instruction_stream, opcodes, sizeof(opcodes)); } break; } } if (!feof(file)) { perror("Error reading file"); } fclose(file); /*** epilogue ***/ // access single chars of long char *t = (char*) &text_bytes_on_stack; char epilogue [] = { // free strings 0x48, 0x81, 0xC4, t[0], t[1], t[2], t[3], // addq $<x>, %rsp // free accumulator 0x48, 0x83, 0xC4, 0x08, // addq $8, %rsp // restore callee saved register 0x41, 0x5C, // popq %r12 0x5d, // pop rbp 0xC3 // ret }; vector_push(&instruction_stream, epilogue, sizeof(epilogue)); /*** invoke generated code ***/ /* allocate and copy instruction stream into executable memory */ void* mem = mmap(NULL, instruction_stream.size, PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); memcpy(mem, instruction_stream.data, instruction_stream.size); /* typecast memory to a function pointer and call the dynamically created executable code */ void (*hq9p_program) (fn_printf) = mem; hq9p_program(printf); /* clear up */ munmap(mem, instruction_stream.size); vector_destroy(&instruction_stream); exit(EXIT_SUCCESS); }
void mandelbrot(int m, int n) { cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_GPU; cl_device_id *devs = NULL; cl_context context; cl_command_queue *cmd_queues; cl_program program; cl_kernel *kernels; cl_mem *mem_R; cl_mem *mem_G; cl_mem *mem_B; cl_int err; cl_uint num_platforms; cl_uint num_devs = 0; cl_event *ev_kernels; int count_max = COUNT_MAX; int i, j, jhi, jlo; char *output_filename = "mandelbrot.ppm"; FILE *output_unit; double wtime; float x_max = 1.25; float x_min = - 2.25; // float x; // float x1; // float x2; float y_max = 1.75; float y_min = - 1.75; //float y; //float y1; //float y2; size_t size_color; size_color = sizeof(int) * m * n; int (*r)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*g)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*b)[n] = (int (*)[n])calloc(m * n, sizeof(int)); printf( " Sequential C version\n" ); printf( "\n" ); printf( " Create an ASCII PPM image of the Mandelbrot set.\n" ); printf( "\n" ); printf( " For each point C = X + i*Y\n" ); printf( " with X range [%g,%g]\n", x_min, x_max ); printf( " and Y range [%g,%g]\n", y_min, y_max ); printf( " carry out %d iterations of the map\n", count_max ); printf( " Z(n+1) = Z(n)^2 + C.\n" ); printf( " If the iterates stay bounded (norm less than 2)\n" ); printf( " then C is taken to be a member of the set.\n" ); printf( "\n" ); printf( " An ASCII PPM image of the set is created using\n" ); printf( " M = %d pixels in the X direction and\n", m ); printf( " N = %d pixels in the Y direction.\n", n ); timer_init(); timer_start(0); // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); num_devs = 1; //** if (num_devs >= 1) { devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs); err = clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL); break; } } if ( devs == NULL || num_devs < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } for( i = 0; i < num_devs; ++i ) { printf("dev[%d] : ", i); print_device_name(devs[i]); } // Context context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs); for( i = 0; i < num_devs; ++i) { cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err); CHECK_ERROR(err); } // Create a program. size_t source_len; char *source_code = get_source_code("./mandelbrot_kernel.cl", &source_len); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_len, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-Dm=%d -Dn=%d -Dnum_devs=%d", m, n, num_devs); err = clBuildProgram(program, num_devs, devs, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, devs[0]); CHECK_ERROR(err); } // Kernel kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs); for (i = 0; i < num_devs; i++) { kernels[i] = clCreateKernel(program, "mandelbrot_kernel", NULL); } // Buffers mem_R = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_G = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_B = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); for(i = 0; i < num_devs; i++) { mem_R[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_G[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_B[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); } /* // Write to Buffers for(i = 0; i < num_devs; i++) { clEnqueueWriteBuffer(cmd_queues[i], mem_CHECK[i], CL_FALSE, 0, size_CHECK / num_devs, (CHECK + (N / num_devs) * i), 0, NULL, NULL); } */ // Set the arguments. for(i = 0; i < num_devs; i++) { // flag = i * (m * n / num_devs); clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*) &mem_R[i]); clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*) &mem_G[i]); clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*) &mem_B[i]); clSetKernelArg(kernels[i], 3, sizeof(int), &count_max); clSetKernelArg(kernels[i], 4, sizeof(float), &x_max); clSetKernelArg(kernels[i], 5, sizeof(float), &x_min); clSetKernelArg(kernels[i], 6, sizeof(float), &y_max); clSetKernelArg(kernels[i], 7, sizeof(float), &y_min); } // Enqueue the kernel. size_t lws[1] = {256}; size_t gws[1] = { m * n /num_devs }; gws[0] = (size_t)ceil((double)m * n / lws[0]) * lws[0]; ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs); for(i = 0; i < num_devs; i++) { err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 1, NULL, gws, lws, 0, NULL, &ev_kernels[i]); CHECK_ERROR(err); } // Read the result. for(i = 0; i < num_devs; i++) { err = clEnqueueReadBuffer(cmd_queues[i], mem_R[i], CL_TRUE, 0, size_color / num_devs, r, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_G[i], CL_TRUE, 0, size_color / num_devs, g, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_B[i], CL_TRUE, 0, size_color / num_devs, b, 1, &ev_kernels[i], NULL); } // Release for( i = 0; i < num_devs; ++i ) { clFinish(cmd_queues[i]); clReleaseMemObject(mem_R[i]); clReleaseMemObject(mem_G[i]); clReleaseMemObject(mem_B[i]); clReleaseKernel(kernels[i]); clReleaseCommandQueue(cmd_queues[i]); clReleaseEvent(ev_kernels[i]); } clReleaseProgram(program); clReleaseContext(context); free(mem_R); free(mem_G); free(mem_B); free(cmd_queues); free(kernels); free(devs); free(ev_kernels); free(platform); timer_stop(0); wtime = timer_read(0); printf( "\n" ); printf( " Time = %lf seconds.\n", wtime ); // Write data to an ASCII PPM file. output_unit = fopen( output_filename, "wt" ); fprintf( output_unit, "P3\n" ); fprintf( output_unit, "%d %d\n", n, m ); fprintf( output_unit, "%d\n", 255 ); for ( i = 0; i < m; i++ ) { for ( jlo = 0; jlo < n; jlo = jlo + 4 ) { jhi = MIN( jlo + 4, n ); for ( j = jlo; j < jhi; j++ ) { fprintf( output_unit, " %d %d %d", r[i][j], g[i][j], b[i][j] ); } fprintf( output_unit, "\n" ); } } fclose( output_unit ); printf( "\n" ); printf( " Graphics data written to \"%s\".\n\n", output_filename ); // Terminate. free(r); free(g); free(b); }