/* Initialise the OpenCL additive-synthesis opcode: enumerate devices,
   select the one requested via p->idev, build the "sample"/"update"
   kernels, allocate device and host buffers sized from the input fsig,
   and bind the static kernel arguments.
   Returns OK, or a Csound init error on any setup failure. */
static int init_cladsyn(CSOUND *csound, CLADSYN *p){

  int asize, ipsize, fpsize, err;
  cl_device_id device_ids[32], device_id;
  cl_context context;
  cl_command_queue commands;
  cl_program program;
  cl_kernel kernel1, kernel2;
  cl_uint num = 0, nump = 0;
  cl_platform_id platforms[16];
  uint i;

  if(p->fsig->overlap > 1024)
    return csound->InitError(csound, "overlap is too large\n");

  /* First try the implementation-default platform; if that yields
     nothing, scan every platform and collect up to 32 devices. */
  err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 32, device_ids, &num);
  if (err != CL_SUCCESS){
    cl_uint devs = 0;
    clGetPlatformIDs(16, platforms, &nump);
    for(i=0; i < nump && devs < 32; i++){
      char name[128];
      clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, name, NULL);
      csound->Message(csound, "available platform[%d] %s\n", i, name);
      err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 32-devs,
                           &device_ids[devs], &num);
      if (err != CL_SUCCESS) {
        /* BUGFIX: the original called InitError here but discarded its
           return and fell through; a platform with no devices is not
           fatal, so report it and keep scanning the remaining ones. */
        csound->Message(csound, "failed to find an OpenCL device! %s \n",
                        cl_error_string(err));
        continue;
      }
      /* BUGFIX: advance the fill index so devices found on later
         platforms no longer overwrite the ones already collected
         (devs was previously only updated once, after the loop). */
      devs += num;
    }
    num = devs;
  }
  /* BUGFIX: guard num == 0, which made device_ids[num-1] below index
     with (cl_uint)-1. */
  if (num == 0)
    return csound->InitError(csound, "no OpenCL device could be found\n");

  for(i=0; i < num; i++){
    char name[128];
    cl_device_type type;
    clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 128, name, NULL);
    clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type),
                    &type, NULL);
    if(type & CL_DEVICE_TYPE_CPU)
      csound->Message(csound, "available CPU[device %d] %s\n",i, name);
    else if(type & CL_DEVICE_TYPE_GPU)
      csound->Message(csound, "available GPU[device %d] %s\n",i, name);
    else if(type & CL_DEVICE_TYPE_ACCELERATOR)
      csound->Message(csound, "available ACCELLERATOR[device %d] %s\n",i, name);
    else
      csound->Message(csound, "available generic [device %d] %s\n",i, name);
  }

  /* SELECT THE GPU HERE: user-requested index, clamped to the last
     device found when out of range */
  if(*p->idev < num)
    device_id = device_ids[(int)*p->idev];
  else
    device_id = device_ids[num-1];

  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
    return csound->InitError(csound,
                             "Failed to create a compute context! %s\n",
                             cl_error_string(err));

  // Create a command commands
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
    return csound->InitError(csound,
                             "Failed to create a command commands! %s\n",
                             cl_error_string(err));

  // Create the compute program from the source buffer
  program = clCreateProgramWithSource(context, 1, (const char **) &code,
                                      NULL, &err);
  if (!program)
    return csound->InitError(csound,
                             "Failed to create compute program! %s\n",
                             cl_error_string(err));

  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS) {
    size_t len;
    char buffer[2048];
    csound->Message(csound, "Failed to build program executable! %s\n",
                    cl_error_string(err));
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                          sizeof(buffer), buffer, &len);
    return csound->InitError(csound, "%s\n", buffer);
  }

  kernel1 = clCreateKernel(program, "sample", &err);
  if (!kernel1 || err != CL_SUCCESS)
    return csound->InitError(csound,
                             "Failed to create sample compute kernel! %s\n",
                             cl_error_string(err));

  kernel2 = clCreateKernel(program, "update", &err);
  if (!kernel2 || err != CL_SUCCESS)
    return csound->InitError(csound,
                             "Failed to create update compute kernel! %s\n",
                             cl_error_string(err));

  char name[128];
  clGetDeviceInfo(device_id, CL_DEVICE_NAME, 128, name, NULL);
  csound->Message(csound, "using device: %s\n",name);

  /* number of partials to synthesise: all N/2 bins, or fewer if the
     user asked for fewer via p->inum */
  p->bins = (p->fsig->N)/2;
  if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;

  p->vsamps = p->fsig->overlap;             /* samples per output vector */
  p->threads = p->bins*p->vsamps;
  p->mthreads = (p->bins > p->vsamps ? p->bins : p->vsamps);

  asize =  p->vsamps*sizeof(cl_float);
  ipsize = (p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_long);
  fpsize = p->fsig->N*sizeof(cl_float);

  p->out = clCreateBuffer(context, 0, asize, NULL, NULL);
  p->frame = clCreateBuffer(context, CL_MEM_READ_ONLY, fpsize, NULL, NULL);
  p->ph = clCreateBuffer(context, 0, ipsize, NULL, NULL);
  p->amps = clCreateBuffer(context, 0, (p->bins > p->vsamps ?
                                        p->bins : p->vsamps)*sizeof(cl_float),
                           NULL, NULL);

  // memset needed?
  asize = p->vsamps*sizeof(float);
  if(p->out_.auxp == NULL || p->out_.size < (unsigned long) asize)
    csound->AuxAlloc(csound, asize , &p->out_);

  csound->RegisterDeinitCallback(csound, p, destroy_cladsyn);
  p->count = 0;
  p->context = context;
  p->program = program;
  p->commands = commands;
  p->kernel1 = kernel1;
  p->kernel2 = kernel2;

  clGetKernelWorkGroupInfo(p->kernel1, device_id, CL_KERNEL_WORK_GROUP_SIZE,
                           sizeof(p->wgs1), &p->wgs1, NULL);
  /* BUGFIX: the second query used sizeof(p->wgs1) (copy-paste); use the
     matching member */
  clGetKernelWorkGroupInfo(p->kernel2, device_id, CL_KERNEL_WORK_GROUP_SIZE,
                           sizeof(p->wgs2), &p->wgs2, NULL);

  p->sr = csound->GetSr(csound);

  /* static kernel arguments; index 4 is deliberately left unset here,
     presumably bound per-call in the perf routine -- TODO confirm */
  clSetKernelArg(p->kernel1, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel1, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel1, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel1, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel1, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel1, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel1, 7, sizeof(cl_float), &p->sr);

  clSetKernelArg(p->kernel2, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel2, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel2, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel2, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel2, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel2, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel2, 7, sizeof(cl_float), &p->sr);

  return OK;
}
int main(int argc, char **argv){ printf("Check OpenCL environtment\n"); cl_platform_id platid; cl_device_id devid; cl_int res; size_t param; /* Query OpenCL, get some information about the returned device */ clGetPlatformIDs(1u, &platid, NULL); clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL); cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL); clGetDeviceInfo(devid, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name); clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), ¶m, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param); clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), ¶m, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param); clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param); /* Check if kernel source exists, we compile argv[1] passed kernel */ if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); } char *kernel_source; if(load_program_source(argv[1], &kernel_source)) return 1; printf("Building from OpenCL source: \t%s\n", argv[1]); printf("Compile/query OpenCL_program:\t%s\n", argv[2]); /* Create context and kernel program */ cl_context context = clCreateContext(0, 1, &devid, NULL, NULL, NULL); cl_program pro = clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL); res = clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL); if(res != CL_SUCCESS){ printf("clBuildProgram failed: %d\n", res); char buf[0x10000]; clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL); printf("\n%s\n", buf); return(-1); } cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); check_return(res); /* Get the maximum work-group size for executing the kernel on the 
device */ size_t global, local; res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL); check_return(res); printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local); res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); check_return(res); printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param); cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL); if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); } local = 4; int n = 2 * local; //num_group * local workgroup size global = n; int num_groups= global / local, allocated_local= sizeof(data) * local + sizeof(debug) * local; data *DP __attribute__ ((aligned(16))); DP = calloc(n, sizeof(data) *1); debug *dbg __attribute__ ((aligned(16))); dbg = calloc(n, sizeof(debug)); printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups); printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256)); printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local); cl_mem cl_DP, cl_EC, cl_INV, DEBUG; cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res); check_return(res); cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(Elliptic_Curve), NULL, &res); check_return(res); //_constant address space cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(u8) * 0x80, NULL, &res); check_return(res); DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res); check_return(res); Elliptic_Curve EC; /* Curve domain parameters, (test vectors) ------------------------------------------------------------------------------------- p: c1c627e1638fdc8e24299bb041e4e23af4bb5427 is prime a: c1c627e1638fdc8e24299bb041e4e23af4bb5424 divisor g = 62980 b: 
877a6d84155a1de374b72d9f9d93b36bb563b2ab divisor g = 227169643 Gx: 010aff82b3ac72569ae645af3b527be133442131 divisor g = 32209245 Gy: 46b8ec1e6d71e5ecb549614887d57a287df573cc divisor g = 972 precomputed_per_curve_constants: U: c1c627e1638fdc8e24299bb041e4e23af4bb5425 V: 3e39d81e9c702371dbd6644fbe1b1dc50b44abd9 already prepared mod p to test: a: 07189f858e3f723890a66ec1079388ebd2ed509c b: 6043379beb0dade6eed1e9d6de64f4a0c50639d4 gx: 5ef84aacf4f0ea6752f572d0741f40049f354dca gy: 418c695435af6b3d4d7cbb72967395016ef67239 resulting point: P.x: 01718f862ebe9423bd661a65355aa1c86ba330f8 program MUST got this point !! P.y: 557e8ed53ffbfe2c990a121967b340f62e0e4fe2 taken mod p: P.x: 41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3 P.y: 73ca143c9badedf2d9d3c7573307115ccfe04f13 */ u8 *t; t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427"); memcpy(EC.p, t, 20); t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c"); memcpy(EC.a, t, 20); t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4"); memcpy(EC.b, t, 20); t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca"); memcpy(EC.Gx, t, 20); t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239"); memcpy(EC.Gy, t, 20); t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425"); memcpy(EC.U, t, 20); t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9"); memcpy(EC.V, t, 20); /* we need to map buffer now to load some k into data */ DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710"); for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21); free(t); //d for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1); /* we can alter just a byte into a chosen k to verify that we'll get a different point! 
*/ //DP[2].k[2] = 0x09; //no res = clEnqueueWriteBuffer(cmd_queue, cl_DP, CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_EC, CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL); check_return(res); res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP); /* i/o buffer */ res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC); res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV); res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG); //this used to debug kernel output check_return(res); // printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local); cl_event NDRangeEvent; cl_ulong start, end; /* Execute NDrange */ res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent); check_return(res); // res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent); check_return(res); printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data)); DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res); check_return(res); /* using clEnqueueReadBuffer template */ // res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL); check_return(res); clFlush(cmd_queue); clFinish(cmd_queue); /* get NDRange execution time with internal ocl profiler */ res = 
clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); check_return(res); printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000)); //relative to NDRange call printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start))); printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n"); for(int i = 0; i < n; i++) { // if(i %local == 0) { printf("%d \t", i); //printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7)); /* silence this doubled debug info printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3], dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]); */ //printf("%d %d\n", P[i].dig, P[i].c); bn_print("", DP[i].k, 21, 1); bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1); printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3], DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]); // } } /* Release OpenCL stuff, free the rest */ clReleaseMemObject(cl_DP); clReleaseMemObject(cl_EC); clReleaseMemObject(cl_INV); clReleaseMemObject(DEBUG); clReleaseKernel(kernelobj); clReleaseProgram(pro); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(kernel_source); puts("Done!"); return 0; }
void multiformat_scal_opencl_func(void *buffers[], void *args) { (void) args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]); cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "multiformat_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(n), &n); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Determine the platform ID: NULL platform IDs lead to // "platform specific" behavior! cl_platform_id platforms[8]; uint32_t num_platforms; err = clGetPlatformIDs(8, platforms, &num_platforms); if(err != CL_SUCCESS) { printf("Error: failed to get platform ids!\n"); return EXIT_FAILURE; } printf("%u platform ids found\n", num_platforms); // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(platforms[0], gpu ? 
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = 
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
size_t select_device(int jtrUniqDevNo, struct fmt_main *fmt) { cl_int err; const char *errMsg; size_t memAllocSz; active_dev_ctr++; opencl_init("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevNo, NULL); globalObj[jtrUniqDevNo].krnl[0] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_short", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_preprocess_short FAILED\n"); return 0; } globalObj[jtrUniqDevNo].krnl[1] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_long", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_preprocess_long FAILED\n"); return 0; } globalObj[jtrUniqDevNo].krnl[2] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_iter", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_iter FAILED\n"); return 0; } globalObj[jtrUniqDevNo].krnl[3] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_postprocess", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_postprocess FAILED\n"); return 0; } errMsg = "Create Buffer FAILED"; memAllocSz = 4 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint); memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4; globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu == (cl_mem)0) HANDLE_CLERROR(err,errMsg ); globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_WRITE_ONLY, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); memAllocSz = MAX_KEYS_PER_CRYPT * sizeof(temp_buf); memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? 
memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4; globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); memAllocSz = 5 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint); memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4; globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 0 Arg 0 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu), "Set Kernel 0 Arg 1 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 0 Arg 3 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 1 Arg 0 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 1 Arg 1 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 2, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu), "Set Kernel 1 Arg 2 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[2], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 2 Arg 0 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 3 Arg 0 :FAILED"); 
HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu), "Set Kernel 3 Arg 1 :FAILED"); if (!local_work_size) find_best_workgroup(jtrUniqDevNo, quick_bechmark(jtrUniqDevNo)); else { size_t maxsize, maxsize2; globalObj[jtrUniqDevNo].lws = local_work_size; // Obey limits HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[0], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Error querying max LWS"); HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[1], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS"); if (maxsize2 > maxsize) maxsize = maxsize2; HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[2], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS"); if (maxsize2 > maxsize) maxsize = maxsize2; HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[3], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS"); if (maxsize2 > maxsize) maxsize = maxsize2; while (globalObj[jtrUniqDevNo].lws > maxsize) globalObj[jtrUniqDevNo].lws /= 2; if (options.verbosity > 3) fprintf(stderr, "Local worksize (LWS) forced to "Zu"\n", globalObj[jtrUniqDevNo].lws); globalObj[jtrUniqDevNo].exec_time_inv = 1; } if (!global_work_size) find_best_gws(jtrUniqDevNo, fmt); else { if (options.verbosity > 3) fprintf(stderr, "Global worksize (GWS) forced to "Zu"\n", global_work_size); fmt -> params.max_keys_per_crypt = global_work_size; fmt -> params.min_keys_per_crypt = max_lws(); } return globalObj[jtrUniqDevNo].lws; }
/*
 * Runs the "copy" memory micro-benchmark: repeatedly enqueues the given
 * kernel over globalThreads x localThreads, accumulates its profiled GPU
 * time and prints an aggregate bandwidth figure.  When 'verify' is set,
 * a single iteration is run and the output buffer is compared against
 * the input on the host.
 * Returns SDK_SUCCESS / SDK_FAILURE; an unsupported work-group size is
 * reported but treated as SDK_SUCCESS (a skip, not an error).
 */
int MemoryOptimizations::copy(cl_kernel& kernel, int vectorSize)
{
    cl_int status;
    cl_event events[2];   // only events[0] is used below

    /* Check group size against kernelWorkGroupSize */
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed."))
    {
        return SDK_FAILURE;
    }

    // Skip (not fail) when the device cannot run the requested group size
    if(localThreads[0] * localThreads[1] > kernelWorkGroupSize)
    {
        std::cout << "\nDevice doesn't support required work-group size!\n";
        return SDK_SUCCESS;
    }

    /*** Set appropriate arguments to the kernel ***/
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)"))
        return SDK_FAILURE;

    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)"))
        return SDK_FAILURE;

    double nsec = 0;   // accumulated kernel time in nanoseconds

    // Reduce the iterations if verification is enabled.
    if(verify)
        Iterations = 1;

    /* Run the kernel for a number of iterations */
    for(int i = 0; i < Iterations; i++)
    {
        /*Enqueue a kernel run call */
        status = clEnqueueNDRangeKernel(commandQueue,
                                        kernel,
                                        2,
                                        NULL,
                                        globalThreads,
                                        localThreads,
                                        0,
                                        NULL,
                                        &events[0]);
        if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed."))
            return SDK_FAILURE;

        /* wait for the kernel call to finish execution */
        status = clWaitForEvents(1, &events[0]);
        if(!sampleCommon->checkVal(status, CL_SUCCESS, "clWaitForEvents failed."))
            return SDK_FAILURE;

        /* Calculate performance */
        cl_ulong startTime;
        cl_ulong endTime;

        /* Get kernel profiling info */
        status = clGetEventProfilingInfo(events[0],
                                         CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong),
                                         &startTime,
                                         0);
        if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(startTime)"))
            return SDK_FAILURE;

        status = clGetEventProfilingInfo(events[0],
                                         CL_PROFILING_COMMAND_END,
                                         sizeof(cl_ulong),
                                         &endTime,
                                         0);
        if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(endTime)"))
            return SDK_FAILURE;

        /* Cumulate time for each iteration */
        nsec += endTime - startTime;
    }

    /* Copy bytes: per-thread bytes moved (2 = one read + one write per
       iteration), scaled by the thread count; bytes/ns == GB/s --
       NOTE(review): assumes the kernel moves vectorSize floats per
       thread per iteration; confirm against the kernel source */
    int numThreads = (int)(globalThreads[0] * globalThreads[1]);
    double bytes = (double)(Iterations * 2 * vectorSize * sizeof(cl_float));
    double perf = (bytes / nsec) * numThreads;

    std::cout << ": " << perf << " GB/s" << std::endl;

    if(verify)
    {
        /* Enqueue readBuffer*/
        status = clEnqueueReadBuffer(commandQueue,
                                     outputBuffer,
                                     CL_TRUE,
                                     0,
                                     length * sizeof(cl_float4),
                                     output,
                                     0,
                                     NULL,
                                     0);
        if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed."))
            return SDK_FAILURE;

        /* Verify data: a pure copy must leave output identical to input */
        if(!memcmp(input, output, vectorSize * sizeof(cl_float) * length))
        {
            std::cout << "Passed!\n";
            return SDK_SUCCESS;
        }
        else
        {
            std::cout << "Failed!\n";
            return SDK_FAILURE;
        }
    }

    return SDK_SUCCESS;
}
void test_variable_opencl_func(void *buffers[], void *args) { STARPU_SKIP_IF_VALGRIND; int id, devid, ret; int factor = *(int *) args; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]); cl_context context; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); starpu_opencl_get_context(devid, &context); cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(int), &variable_config.copy_failed, &err); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "variable_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(fail), &fail); if (err) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(factor), &factor); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global = 1; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } err = clEnqueueReadBuffer(queue, fail, CL_TRUE, 0, sizeof(int), &variable_config.copy_failed, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); ret = starpu_opencl_unload_opencl(&opencl_program); STARPU_CHECK_RETURN_VALUE(ret, 
"starpu_opencl_unload_opencl"); return; }
// Helper function to create and build program and kernel // ********************************************************************* cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2) { // compile cl program size_t program_length; char *source; std::ostringstream preamble; // create the program // with type specification depending on datatype argument switch (datatype) { default: case REDUCE_INT: preamble << "#define T int" << std::endl; break; case REDUCE_FLOAT: preamble << "#define T float" << std::endl; break; } // set blockSize at compile time preamble << "#define blockSize " << blockSize << std::endl; // set isPow2 at compile time preamble << "#define nIsPow2 " << isPowOf2 << std::endl; // Load the source code and prepend the preamble source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length); oclCheckError(source != NULL, shrTRUE); cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclReduction.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } // create Kernel std::ostringstream kernelName; kernelName << "reduce" << whichKernel; cl_kernel ckKernel = clCreateKernel(cpProgram, kernelName.str().c_str(), &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); size_t wgSize; ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); if (wgSize == 64) smallBlock = true; else smallBlock = false; // NOTE: the program will get deleted when the kernel is also released 
clReleaseProgram(cpProgram); return ckKernel; }
/*
 * Voxelize a set of triangle meshes into a dense cl_uchar occupancy grid.
 *
 * plugin            : initialized plugin holding context, queues and kernels
 * inv_element_size  : reciprocal of the voxel edge length
 * corner_*          : world-space origin of the grid
 * *_cell_length     : grid dimensions in voxels (each >= 0)
 * mesh_data_count   : number of meshes in mesh_data_list
 * mesh_data_list    : per-mesh triangle/vertex buffer descriptors
 * voxel_grid_out    : caller-provided output of x*y*z cl_uchar cells
 *
 * Returns 0 on success, -1 on any OpenCL failure (reported via
 * CHECK_CL_ERROR / CHECK_CL_ERROR_MSG before jumping to the error label).
 */
OPENCL_EXPERIMENTS_EXPORT cl_int opencl_plugin_voxelize_meshes(
    opencl_plugin plugin, float inv_element_size, float corner_x,
    float corner_y, float corner_z, cl_int x_cell_length,
    cl_int y_cell_length, cl_int z_cell_length, cl_int mesh_data_count,
    mesh_data *mesh_data_list, cl_uchar *voxel_grid_out)
{
    cl_int err = CL_SUCCESS;
    cl_int i;
    cl_int next_row_offset, next_slice_offset;
    size_t local_work_size;
    cl_int num_voxels;
    clock_t t1;
    clock_t t2;
    clock_t t3;

    assert(plugin != NULL);
    assert(inv_element_size >= 0);
    assert(x_cell_length >= 0);
    assert(y_cell_length >= 0);
    assert(z_cell_length >= 0);
    assert(mesh_data_count >= 0);
    assert(mesh_data_list != NULL);

    t1 = clock();

    /* (Re-)allocate buffer for voxel grid */
    num_voxels = x_cell_length * y_cell_length * z_cell_length;
    if (opencl_plugin_init_voxel_buffer(plugin, num_voxels))
        goto error;

    /* (Re-)allocate buffers for mesh data */
    if (opencl_plugin_init_mesh_buffers(plugin, mesh_data_count, mesh_data_list))
        goto error;

    err = clGetKernelWorkGroupInfo(
        plugin->voxelize_kernel, plugin->selected_device,
        CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_work_size),
        &local_work_size, NULL);
    CHECK_CL_ERROR(err);

    if (enqueue_zero_buffer(plugin->queue, plugin->voxel_grid_buffer,
                            plugin->voxel_grid_buffer_capacity, 0, NULL, NULL,
                            &err))
        goto error;

    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    t1 = clock() - t1;
    t2 = clock();

    next_row_offset = x_cell_length;
    next_slice_offset = x_cell_length * y_cell_length;

    /* Grid geometry arguments are identical for all meshes; set them once. */
    err |= clSetKernelArg(plugin->voxelize_kernel, 0, sizeof(cl_mem), &plugin->voxel_grid_buffer);
    err |= clSetKernelArg(plugin->voxelize_kernel, 1, sizeof(float), &inv_element_size);
    err |= clSetKernelArg(plugin->voxelize_kernel, 2, sizeof(float), &corner_x);
    err |= clSetKernelArg(plugin->voxelize_kernel, 3, sizeof(float), &corner_y);
    err |= clSetKernelArg(plugin->voxelize_kernel, 4, sizeof(float), &corner_z);
    err |= clSetKernelArg(plugin->voxelize_kernel, 5, sizeof(cl_int), &next_row_offset);
    err |= clSetKernelArg(plugin->voxelize_kernel, 6, sizeof(cl_int), &next_slice_offset);
    err |= clSetKernelArg(plugin->voxelize_kernel, 7, sizeof(cl_int), &x_cell_length);
    err |= clSetKernelArg(plugin->voxelize_kernel, 8, sizeof(cl_int), &y_cell_length);
    err |= clSetKernelArg(plugin->voxelize_kernel, 9, sizeof(cl_int), &z_cell_length);
    CHECK_CL_ERROR(err);

    for (i = 0; i < mesh_data_count; i++) {
        size_t global_work_size;
        cl_command_queue mesh_queue;
        cl_uint vertex_buffer_base_idx =
            mesh_data_list[i].vertex_buffer_base_idx;
        cl_uint triangle_buffer_base_idx =
            mesh_data_list[i].triangle_buffer_base_idx;

        /* Robustness: an empty mesh would yield a zero global work size,
         * which clEnqueueNDRangeKernel rejects (CL_INVALID_GLOBAL_WORK_SIZE). */
        if (mesh_data_list[i].num_triangles <= 0)
            continue;

        err |= clSetKernelArg(plugin->voxelize_kernel, 10, sizeof(cl_mem), &plugin->vertex_buffer);
        err |= clSetKernelArg(plugin->voxelize_kernel, 11, sizeof(cl_mem), &plugin->triangle_buffer);
        err |= clSetKernelArg(plugin->voxelize_kernel, 12, sizeof(cl_int), &mesh_data_list[i].num_triangles);
        err |= clSetKernelArg(plugin->voxelize_kernel, 13, sizeof(cl_uint), &vertex_buffer_base_idx);
        err |= clSetKernelArg(plugin->voxelize_kernel, 14, sizeof(cl_uint), &triangle_buffer_base_idx);
        CHECK_CL_ERROR(err);

        /* As per the OpenCL spec, global_work_size must divide evenly by
         * local_work_size — round the triangle count up to a multiple. */
        global_work_size = mesh_data_list[i].num_triangles / local_work_size;
        global_work_size *= local_work_size;
        if (global_work_size < (size_t)mesh_data_list[i].num_triangles)
            global_work_size += local_work_size;

        mesh_queue = plugin->queues[i % plugin->num_queues];

        err = clEnqueueNDRangeKernel(
            mesh_queue, plugin->voxelize_kernel, 1, NULL,
            &global_work_size, &local_work_size, 0, NULL, NULL);
        CHECK_CL_ERROR_MSG(err, "clEnqueueNDRangeKernel failed on mesh %d/%d",
                           i + 1, mesh_data_count);

        /* BUGFIX: this per-mesh synchronization previously waited on
         * plugin->queue, not the queue the kernel was actually enqueued on,
         * so the "clFinish failed on mesh" check never observed that mesh. */
        err = clFinish(mesh_queue);
        CHECK_CL_ERROR_MSG(err, "clFinish failed on mesh %d/%d",
                           i + 1, mesh_data_count);
    }

    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    /* Drain every per-mesh queue before reading the grid back. */
    for (i = 0; i < plugin->num_queues; i++) {
        err = clFinish(plugin->queues[i]);
        CHECK_CL_ERROR(err);
    }

    t2 = clock() - t2;
    t3 = clock();

    /* Blocking read of the full grid (num_voxels cl_uchar cells). */
    err = clEnqueueReadBuffer(
        plugin->queue, plugin->voxel_grid_buffer, CL_TRUE, 0,
        num_voxels, voxel_grid_out, 0, NULL, NULL);
    CHECK_CL_ERROR(err);

    t3 = clock() - t3;

    TRACE("Clock T1: %f", ((float)t1 * 1000.0f) / CLOCKS_PER_SEC);
    TRACE("Clock T2: %f", ((float)t2 * 1000.0f) / CLOCKS_PER_SEC);
    TRACE("Clock T3: %f", ((float)t3 * 1000.0f) / CLOCKS_PER_SEC);

    return 0;
error:
    return -1;
}