static void build_program_callback(cl_program program, void *user_data) { cl_int err; cl_build_status build_status; bp_data_t *bp_data = (bp_data_t *)user_data; // Check the build status. err = clGetProgramBuildInfo(program, bp_data->dev, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL); CHECK_ERROR(err); if (build_status != CL_BUILD_SUCCESS) { print_build_log(program, bp_data->dev); exit(EXIT_FAILURE); } // Set the event status err = clSetUserEventStatus(*(bp_data->event), CL_COMPLETE); CHECK_ERROR(err); }
cl_int set_kernel(int did, cl_prop *prop) { cl_int status; prop->context = clCreateContext(0, prop->num_devices, (const cl_device_id *)prop->devices, NULL, NULL, &status); prop->queue = clCreateCommandQueueWithProperties(prop->context, prop->devices[did], 0, &status); prop->program = clCreateProgramWithSource(prop->context, prop->kcode.count, (const char **)prop->kcode.codes, NULL, &status); const char *options = "-I./include"; status = clBuildProgram(prop->program, prop->num_devices, (const cl_device_id *)prop->devices, options, NULL, NULL); if(status != CL_SUCCESS) { printf("%s[Build Error Log]%s\n", ERR_STR, CLR_STR); } else { printf("%s[Build Log]%s\n", WHT_STR, CLR_STR); } print_build_log(did, prop); if(status != CL_SUCCESS) getchar(); prop->gabor = clCreateKernel(prop->program, (const char *)"enable_gabor", NULL); prop->pooling = clCreateKernel(prop->program, (const char *)"enable_pooling", NULL); prop->feature = clCreateKernel(prop->program, (const char *)"feature_rfcn", NULL); prop->cls = clCreateKernel(prop->program, (const char *)"class_rfcn", NULL); return status; }
int main(int argc, char **argv) { cl_int status; const char *platform_name = "NVIDIA"; if (!find_platform(platform_name, &platform)) { fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name); print_platforms(); teardown(-1); } status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); checkError (status, "Error: could not query devices"); context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); checkError(status, "could not create context"); const char name[] = KERNELDIR "/reduce.cl"; unsigned char *source; size_t size; if (!load_file(name, &source, &size)) { teardown(-1); } program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status); checkError(status, "Error: failed to create program %s: ", name); status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL); if (status != CL_SUCCESS) { print_build_log(program, device); checkError(status, "Error: failed to create build %s: ", name); } free(source); print_device_info(device, 0); queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status); checkError(status, "could not create command queue"); cl_ulong start, end; cl_event event; size_t width = 1024+1024; size_t buf_size = width*sizeof(cl_float); kernel = clCreateKernel(program, "reduce", &status); checkError(status, "could not create kernel"); size_t work_size = width; size_t local_size = 64; size_t local_buf_size = local_size * sizeof(cl_float); size_t groups = width / local_size; size_t res_buf_size = groups * sizeof(cl_float); float *data_in = malloc(buf_size); float *data_out = malloc(res_buf_size); if (!data_in || !data_out) { fprintf(stderr,"\nError: malloc failed\n"); teardown(-1); } for (unsigned int i = 0; i < width; ++i) { data_in[i] = (float) (i % 16); } buffer_in = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &status); checkError(status, "Error: could not create buffer_in"); buffer_out = clCreateBuffer(context, CL_MEM_READ_WRITE, res_buf_size, NULL, &status); checkError(status, "Error: could not create buffer_out"); status = clEnqueueWriteBuffer(queue, buffer_in, CL_FALSE, 0, buf_size, data_in, 0, NULL, NULL); checkError(status, "Error: could not copy data into device"); // execute kernel int arg = 0; status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in); status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out); status = clSetKernelArg(kernel, arg++, local_buf_size, NULL); status = clSetKernelArg(kernel, arg++, sizeof(cl_int), &width); checkError(status, "Error: could not set args"); status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_size, &local_size, 0, NULL, &event); checkError(status, "Error: could not enqueue kernel"); status = clWaitForEvents(1, &event); checkError(status, "Error: could not wait for event"); status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); checkError(status, "Error: could not get start profile information"); status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); checkError(status, "Error: could not get end profile information"); status = clReleaseEvent(event); checkError(status, "Error: could not release event"); // read results back status = clEnqueueReadBuffer(queue, buffer_out, CL_TRUE, 0, res_buf_size, data_out, 0, NULL, NULL); checkError(status, "Error: could not copy data into device"); status = clFinish(queue); checkError(status, "Error: could not finish successfully"); float clsum = 0; for (unsigned int i = 0; i < groups; ++i) { clsum += data_out[i]; } #ifdef DEBUG for (int i = 0; i < groups; ++i) { printf("%.0f ", data_out[i]); } #endif double elapsed = (end - start) * 1e-9f; printf("time: %f\n", elapsed); float sum = 0; for (unsigned int i = 0; i < width; ++i) { sum += data_in[i]; } if (sum != clsum) fprintf(stderr, "Compare failed: %f != %f\n", clsum, sum); free(data_in); free(data_out); teardown(0); }
void mat_mul_opencl_printf(float *M_A, float *M_B, float *M_C, size_t ROW_A, size_t COL_A, size_t COL_B) { cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT; cl_device_id dev; cl_context context; cl_command_queue cmd_queue; cl_program program; cl_kernel kernel; cl_mem mem_A, mem_B, mem_C; cl_int err; cl_uint num_platforms; cl_uint num_dev = 0; int i; // Get the device type to use from the environmental variable. char *dtype = getenv("CL_DEV_TYPE"); if (dtype) { if (strcasecmp(dtype, "cpu") == 0) { dev_type = CL_DEVICE_TYPE_CPU; } else if (strcasecmp(dtype, "gpu") == 0) { dev_type = CL_DEVICE_TYPE_GPU; } } // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); if (num_dev == 1) break; } if (num_dev < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } print_device_name(dev); // Context context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queue = clCreateCommandQueue(context, dev, 0, &err); CHECK_ERROR(err); // Create a program. size_t source_len; char *source_code = get_source_code("./kernel_printf.cl", &source_len); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_len, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu", ROW_A, COL_A, COL_B); err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, dev); CHECK_ERROR(err); } // Kernel kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); // Buffers mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_A, M_A, &err); CHECK_ERROR(err); mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * COL_A * COL_B, M_B, &err); CHECK_ERROR(err); mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_B, M_C, &err); CHECK_ERROR(err); // Set the arguments. err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C); CHECK_ERROR(err); // Enqueue the kernel. size_t lws[2] = {16, 16}; size_t gws[2]; gws[1] = (size_t)ceil((double)ROW_A / lws[1]) * lws[1]; gws[0] = (size_t)ceil((double)COL_B / lws[0]) * lws[0]; timer_start(1); err = clEnqueueNDRangeKernel(cmd_queue, kernel, 2, NULL, gws, lws, 0, NULL, NULL); CHECK_ERROR(err); // Read the result. err = clEnqueueReadBuffer(cmd_queue, mem_C, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B, M_C, 0, NULL, NULL); CHECK_ERROR(err); timer_stop(1); printf("Kernel time : %f sec\n", timer_read(1)); // Release clReleaseMemObject(mem_A); clReleaseMemObject(mem_B); clReleaseMemObject(mem_C); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(platform); }
int main(int argc, char **argv) { cl_int status; const char *platform_name = "NVIDIA"; if (!find_platform(platform_name, &platform)) { fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name); print_platforms(); teardown(-1); } status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); checkError (status, "Error: could not query devices"); context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); checkError(status, "could not create context"); const char name[] = KERNELDIR "/gauss.cl"; unsigned char *source; size_t size; if (!load_file(name, &source, &size)) { teardown(-1); } program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status); checkError(status, "Error: failed to create program %s: ", name); status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL); if (status != CL_SUCCESS) { print_build_log(program, device); checkError(status, "Error: failed to create build %s: ", name); } free(source); print_device_info(device, 0); queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status); checkError(status, "could not create command queue"); cl_ulong start, end; cl_event event; unsigned char *data; size_t datasize; if (!load_file("lena.dat", &data, &datasize)) { teardown(-1); } size_t width = 512; size_t height = 512; size_t buf_size = width*height*sizeof(cl_float); float *data_out = malloc(buf_size); if (!data_out) { fprintf(stderr,"\nError: malloc failed\n"); teardown(-1); } kernel = clCreateKernel(program, "gauss", &status); checkError(status, "could not create kernel"); cl_image_format format = { CL_R, CL_UNORM_INT8}; buffer_in = clCreateImage2D (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format, width, height, 0, data, &status); checkError(status, "Error: could not create image"); buffer_out = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &status); checkError(status, "Error: could not create buffer_out"); // execute kernel int arg = 0; status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in); status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out); checkError(status, "Error: could not set args"); size_t work_size[] = {width, height}; size_t local_size[] = {1, 1}; status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, work_size, local_size, 0, NULL, &event); checkError(status, "Error: could not enqueue kernel"); status = clWaitForEvents(1, &event); checkError(status, "Error: could not wait for event"); status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); checkError(status, "Error: could not get start profile information"); status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); checkError(status, "Error: could not get end profile information"); status = clReleaseEvent(event); checkError(status, "Error: could not release event"); // read results back status = clEnqueueReadBuffer(queue, buffer_out, CL_FALSE, 0, buf_size, data_out, 0, NULL, NULL); checkError(status, "Error: could not copy data into device"); status = clFinish(queue); checkError(status, "Error: could not finish successfully"); double elapsed = (end - start) * 1e-9f; printf("time: %f\n", elapsed); write_bmp("gauss.bmp", data_out, width, height, NORMAL); free(data); free(data_out); teardown(0); }
void mandelbrot(int m, int n) { cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_GPU; cl_device_id *devs = NULL; cl_context context; cl_command_queue *cmd_queues; cl_program program; cl_kernel *kernels; cl_mem *mem_R; cl_mem *mem_G; cl_mem *mem_B; cl_int err; cl_uint num_platforms; cl_uint num_devs = 0; cl_event *ev_kernels; int count_max = COUNT_MAX; int i, j, jhi, jlo; char *output_filename = "mandelbrot.ppm"; FILE *output_unit; double wtime; float x_max = 1.25; float x_min = - 2.25; // float x; // float x1; // float x2; float y_max = 1.75; float y_min = - 1.75; //float y; //float y1; //float y2; size_t size_color; size_color = sizeof(int) * m * n; int (*r)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*g)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*b)[n] = (int (*)[n])calloc(m * n, sizeof(int)); printf( " Sequential C version\n" ); printf( "\n" ); printf( " Create an ASCII PPM image of the Mandelbrot set.\n" ); printf( "\n" ); printf( " For each point C = X + i*Y\n" ); printf( " with X range [%g,%g]\n", x_min, x_max ); printf( " and Y range [%g,%g]\n", y_min, y_max ); printf( " carry out %d iterations of the map\n", count_max ); printf( " Z(n+1) = Z(n)^2 + C.\n" ); printf( " If the iterates stay bounded (norm less than 2)\n" ); printf( " then C is taken to be a member of the set.\n" ); printf( "\n" ); printf( " An ASCII PPM image of the set is created using\n" ); printf( " M = %d pixels in the X direction and\n", m ); printf( " N = %d pixels in the Y direction.\n", n ); timer_init(); timer_start(0); // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); num_devs = 1; //** if (num_devs >= 1) { devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs); err = clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL); break; } } if ( devs == NULL || num_devs < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } for( i = 0; i < num_devs; ++i ) { printf("dev[%d] : ", i); print_device_name(devs[i]); } // Context context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs); for( i = 0; i < num_devs; ++i) { cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err); CHECK_ERROR(err); } // Create a program. size_t source_len; char *source_code = get_source_code("./mandelbrot_kernel.cl", &source_len); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_len, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-Dm=%d -Dn=%d -Dnum_devs=%d", m, n, num_devs); err = clBuildProgram(program, num_devs, devs, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, devs[0]); CHECK_ERROR(err); } // Kernel kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs); for (i = 0; i < num_devs; i++) { kernels[i] = clCreateKernel(program, "mandelbrot_kernel", NULL); } // Buffers mem_R = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_G = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_B = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); for(i = 0; i < num_devs; i++) { mem_R[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_G[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_B[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); } /* // Write to Buffers for(i = 0; i < num_devs; i++) { clEnqueueWriteBuffer(cmd_queues[i], mem_CHECK[i], CL_FALSE, 0, size_CHECK / num_devs, (CHECK + (N / num_devs) * i), 0, NULL, NULL); } */ // Set the arguments. for(i = 0; i < num_devs; i++) { // flag = i * (m * n / num_devs); clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*) &mem_R[i]); clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*) &mem_G[i]); clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*) &mem_B[i]); clSetKernelArg(kernels[i], 3, sizeof(int), &count_max); clSetKernelArg(kernels[i], 4, sizeof(float), &x_max); clSetKernelArg(kernels[i], 5, sizeof(float), &x_min); clSetKernelArg(kernels[i], 6, sizeof(float), &y_max); clSetKernelArg(kernels[i], 7, sizeof(float), &y_min); } // Enqueue the kernel. size_t lws[1] = {256}; size_t gws[1] = { m * n /num_devs }; gws[0] = (size_t)ceil((double)m * n / lws[0]) * lws[0]; ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs); for(i = 0; i < num_devs; i++) { err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 1, NULL, gws, lws, 0, NULL, &ev_kernels[i]); CHECK_ERROR(err); } // Read the result. for(i = 0; i < num_devs; i++) { err = clEnqueueReadBuffer(cmd_queues[i], mem_R[i], CL_TRUE, 0, size_color / num_devs, r, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_G[i], CL_TRUE, 0, size_color / num_devs, g, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_B[i], CL_TRUE, 0, size_color / num_devs, b, 1, &ev_kernels[i], NULL); } // Release for( i = 0; i < num_devs; ++i ) { clFinish(cmd_queues[i]); clReleaseMemObject(mem_R[i]); clReleaseMemObject(mem_G[i]); clReleaseMemObject(mem_B[i]); clReleaseKernel(kernels[i]); clReleaseCommandQueue(cmd_queues[i]); clReleaseEvent(ev_kernels[i]); } clReleaseProgram(program); clReleaseContext(context); free(mem_R); free(mem_G); free(mem_B); free(cmd_queues); free(kernels); free(devs); free(ev_kernels); free(platform); timer_stop(0); wtime = timer_read(0); printf( "\n" ); printf( " Time = %lf seconds.\n", wtime ); // Write data to an ASCII PPM file. output_unit = fopen( output_filename, "wt" ); fprintf( output_unit, "P3\n" ); fprintf( output_unit, "%d %d\n", n, m ); fprintf( output_unit, "%d\n", 255 ); for ( i = 0; i < m; i++ ) { for ( jlo = 0; jlo < n; jlo = jlo + 4 ) { jhi = MIN( jlo + 4, n ); for ( j = jlo; j < jhi; j++ ) { fprintf( output_unit, " %d %d %d", r[i][j], g[i][j], b[i][j] ); } fprintf( output_unit, "\n" ); } } fclose( output_unit ); printf( "\n" ); printf( " Graphics data written to \"%s\".\n\n", output_filename ); // Terminate. free(r); free(g); free(b); }
void mat_mul_opencl_1d(float *M_A, float *M_B, float *M_C, size_t ROW_A, size_t COL_A, size_t COL_B) { cl_platform_id *platform; cl_device_type dev_type; cl_device_id dev; cl_context context; cl_command_queue cmd_queue; cl_program program; cl_kernel kernel; cl_mem mem_A, mem_B, mem_C; cl_event ev_kernel; cl_int err; cl_uint num_platforms; cl_uint num_dev = 0; int i; // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device dev_type = get_device_type(); for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); if (num_dev == 1) break; } if (num_dev < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } print_device_name(dev); free(platform); // Context context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queue = clCreateCommandQueue(context, dev, CL_QUEUE_PROFILING_ENABLE, &err); CHECK_ERROR(err); // Create a program. char *source_code = get_source_code("./kernel_1d.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, NULL, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu", ROW_A, COL_A, COL_B); err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, dev); CHECK_ERROR(err); } // Kernel kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); // Buffers mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_A, M_A, &err); CHECK_ERROR(err); mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * COL_A * COL_B, NULL, &err); CHECK_ERROR(err); err = clEnqueueWriteBuffer(cmd_queue, mem_B, CL_FALSE, 0, sizeof(float) * COL_A * COL_B, M_B, 0, NULL, NULL); CHECK_ERROR(err) mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * ROW_A * COL_B, NULL, &err); CHECK_ERROR(err); // Set the arguments. err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C); CHECK_ERROR(err); // Enqueue the kernel. size_t lws[1] = {256}; size_t gws[1]; gws[0] = (size_t)ceil((double)ROW_A / lws[0]) * lws[0]; err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, gws, lws, 0, NULL, &ev_kernel); CHECK_ERROR(err); // Read the result. err = clEnqueueReadBuffer(cmd_queue, mem_C, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B, M_C, 0, NULL, NULL); CHECK_ERROR(err); // Read the profiling info. cl_ulong start_time, end_time; err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_time, NULL); CHECK_ERROR(err); err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end_time, NULL); CHECK_ERROR(err); printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9); // Release clReleaseEvent(ev_kernel); clReleaseMemObject(mem_A); clReleaseMemObject(mem_B); clReleaseMemObject(mem_C); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); }