// Command-line entry point for plot generation.
//
// Argument layout: p_args[1..5] are global settings
//   (platformId, deviceId, staggerSize, threadsNumber, hashesNumber),
// followed by one group of 4 arguments per plot job starting at index 6:
//   <path> <numericAddress> <startNonce> <noncesNumber>.
//
// For each stagger-sized batch the device runs three kernels
// (nonce_step1/2/3), the scoop buffer is read back synchronously, and the
// disk write is overlapped with the next batch via std::async.
//
// Returns 0 on success, -1 on error (errors are logged to stderr).
int CommandGenerate::execute(const std::vector<std::string>& p_args) {
    if(p_args.size() < 10) { help(); return -1; }
    // NOTE(review): atol() silently yields 0 on malformed input — no validation.
    unsigned int platformId = atol(p_args[1].c_str());
    unsigned int deviceId = atol(p_args[2].c_str());
    unsigned int staggerSize = atol(p_args[3].c_str());
    unsigned int threadsNumber = atol(p_args[4].c_str());
    unsigned int hashesNumber = atol(p_args[5].c_str());
    // Bytes in one stagger batch (PLOT_SIZE bytes per nonce).
    unsigned int nonceSize = PLOT_SIZE * staggerSize;
    std::cerr << "Threads number: " << threadsNumber << std::endl;
    std::cerr << "Hashes number: " << hashesNumber << std::endl;
    // Per-job argument groups (4 each) start at index 6.
    unsigned int numjobs = (p_args.size() - 5)/4;
    std::cerr << numjobs << " plot(s) to do." << std::endl;
    // nonces/4 == MB per host buffer — presumably assumes PLOT_SIZE == 256 KiB;
    // TODO confirm against the PLOT_SIZE definition.
    unsigned int staggerMbSize = staggerSize / 4;
    std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl;
    // Per-job state, indexed 0..numjobs-1.
    std::vector<std::string> paths(numjobs);
    // NOTE(review): these ofstreams are heap-allocated and never closed/deleted — leak.
    std::vector<std::ofstream *> out_files(numjobs);
    std::vector<unsigned long long> addresses(numjobs);
    std::vector<unsigned long long> startNonces(numjobs);
    std::vector<unsigned long long> endNonces(numjobs);      // exclusive end per job
    std::vector<unsigned int> noncesNumbers(numjobs);
    std::vector<unsigned char*> buffersCpu(numjobs);         // host-side staging buffers
    std::vector<bool> saving_thread_flags(numjobs);          // true while save_threads[i] is in flight
    std::vector<std::future<void>> save_threads(numjobs);
    unsigned long long maxNonceNumber = 0;  // longest job length; drives the outer loop
    unsigned long long totalNonces = 0;     // for progress reporting (summed before round-up)
    int returnCode = 0;
    try {
        // ---- Parse and set up each job ----
        for (unsigned int i = 0; i < numjobs; i++) {
            std::cerr << "----" << std::endl;
            std::cerr << "Job number " << i << std::endl;
            unsigned int argstart = 6 + i*4;
            paths[i] = std::string(p_args[argstart]);
            addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10);
            startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10);
            noncesNumbers[i] = atol(p_args[argstart+3].c_str());
            maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]);
            totalNonces += noncesNumbers[i];
            // Output file name: <path>/<address>_<startNonce>_<count>_<stagger>.
            // NOTE(review): the name uses the count BEFORE the round-up below,
            // so it may not match the number of nonces actually written.
            std::ostringstream outFile;
            outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << noncesNumbers[i] << "_" << staggerSize;
            std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc;
            out_files[i] = new std::ofstream(outFile.str(), file_mode);
            // NOTE(review): `new` never returns null, so this assert cannot
            // fire; what actually needs checking is whether the open succeeded.
            assert(out_files[i]);
            // Round the nonce count up to a whole number of staggers.
            if(noncesNumbers[i] % staggerSize != 0) {
                noncesNumbers[i] -= noncesNumbers[i] % staggerSize;
                noncesNumbers[i] += staggerSize;
            }
            endNonces[i] = startNonces[i] + noncesNumbers[i];  // exclusive bound
            unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024;
            std::cerr << "Path: " << outFile.str() << std::endl;
            std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl;
            std::cerr << "Creating CPU buffer" << std::endl;
            buffersCpu[i] = new unsigned char[nonceSize];
            // NOTE(review): dead check — new[] throws std::bad_alloc instead
            // of returning null.
            if(!buffersCpu[i]) { throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)"); }
            saving_thread_flags[i] = false;
            std::cerr << "----" << std::endl;
        }
        // ---- OpenCL setup ----
        // NOTE(review): all cleanup below is inside the try block, so any
        // throw leaks the CL objects and host buffers.
        cl_platform_id platforms[4];
        cl_uint platformsNumber;
        cl_device_id devices[32];
        cl_uint devicesNumber;
        cl_context context = 0;
        cl_command_queue commandQueue = 0;
        cl_mem bufferGpuGen = 0;     // intermediate generation buffer
        cl_mem bufferGpuScoops = 0;  // final scoop-ordered output buffer
        cl_program program = 0;
        cl_kernel kernelStep1 = 0;
        cl_kernel kernelStep2 = 0;
        cl_kernel kernelStep3 = 0;
        int error;
        std::cerr << "Retrieving OpenCL platforms" << std::endl;
        error = clGetPlatformIDs(4, platforms, &platformsNumber);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL platforms"); }
        if(platformId >= platformsNumber) { throw std::runtime_error("No platform found with the provided id"); }
        std::cerr << "Retrieving OpenCL GPU devices" << std::endl;
        // NOTE(review): queries CPU|GPU device types although the message
        // says GPU — confirm whether CPU fallback is intended.
        error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL devices"); }
        if(deviceId >= devicesNumber) { throw std::runtime_error("No device found with the provided id"); }
        std::cerr << "Creating OpenCL context" << std::endl;
        context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL context"); }
        std::cerr << "Creating OpenCL command queue" << std::endl;
        commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL command queue"); }
        std::cerr << "Creating OpenCL GPU generation buffer" << std::endl;
        bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer"); }
        std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl;
        bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer"); }
        std::cerr << "Creating OpenCL program" << std::endl;
        std::string source = loadSource("kernel/nonce.cl");
        const char* sources[] = {source.c_str()};
        size_t sourcesLength[] = {source.length()};
        program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL program"); }
        std::cerr << "Building OpenCL program" << std::endl;
        error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0);
        if(error != CL_SUCCESS) {
            // Dump the build log before failing.
            size_t logSize;
            clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);
            char* log = new char[logSize];
            clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0);
            std::cerr << log << std::endl;
            delete[] log;
            throw OpenclError(error, "Unable to build the OpenCL program");
        }
        std::cerr << "Creating OpenCL step1 kernel" << std::endl;
        kernelStep1 = clCreateKernel(program, "nonce_step1", &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); }
        std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl;
        error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); }
        std::cerr << "Creating OpenCL step2 kernel" << std::endl;
        kernelStep2 = clCreateKernel(program, "nonce_step2", &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); }
        std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl;
        error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); }
        std::cerr << "Creating OpenCL step3 kernel" << std::endl;
        kernelStep3 = clCreateKernel(program, "nonce_step3", &error);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); }
        std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl;
        // NOTE(review): only the last clSetKernelArg result is checked — a
        // failure in the first two calls is masked.
        error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize);
        error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
        error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops);
        if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); }
        size_t globalWorkSize = staggerSize;
        // NOTE(review): localWorkSize must evenly divide globalWorkSize for
        // the launches below to succeed — not enforced here.
        size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber;
        time_t startTime = time(0);
        unsigned int totalNoncesCompleted = 0;
        // ---- Main generation loop: one stagger batch per job per pass ----
        for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) {
            for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) {
                unsigned long long nonce = startNonces[jobnum] + nonce_ordinal;
                // NOTE(review): two suspicions here. (1) endNonces is an
                // exclusive bound, so `>` lets a batch starting exactly at
                // endNonces run — likely should be `>=`. (2) `break` skips
                // every LATER job for this ordinal too; jobs with longer
                // ranges than an earlier job would be starved — likely
                // should be `continue`. TODO confirm both.
                if (nonce > endNonces[jobnum]) { break; }
                std::cout << "Running with start nonce " << nonce << std::endl;
                // Is a cl_ulong always an unsigned long long?
                unsigned int error = 0;  // shadows the outer `error` on purpose or not — unclear
                error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]);
                if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); }
                error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce);
                if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); }
                error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
                if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step1 kernel launch"); }
                // Step 2 walks the plot backwards in chunks of hashesNumber hashes.
                unsigned int hashesSize = hashesNumber * HASH_SIZE;
                for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) {
                    // NOTE(review): only the last of these three results is checked.
                    error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce);
                    error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset);
                    error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber);
                    if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments"); }
                    error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
                    if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel launch"); }
                    error = clFinish(commandQueue);
                    if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel finish"); }
                }
                // Progress report. speed is nonces/minute, so estimatedTime is
                // in minutes: /60 -> hours, %60 -> minutes, *60%60 -> seconds.
                totalNoncesCompleted += staggerSize;
                double percent = 100.0 * (double)totalNoncesCompleted / totalNonces;
                time_t currentTime = time(0);
                double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0;
                double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed;
                std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)";
                std::cerr << ", " << speed << " nonces/minutes";
                std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s";
                std::cerr << "... ";
                // Step 3 reorders the generated nonces into scoop order.
                error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
                if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step3 kernel launch"); }
                // The CPU buffer is shared with the async writer: wait for the
                // previous save of this job before overwriting it.
                if (saving_thread_flags[jobnum]) {
                    save_threads[jobnum].wait(); // Wait for last job to finish
                    saving_thread_flags[jobnum] = false;
                }
                // Blocking read of the scoop buffer into the job's host buffer.
                error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0);
                if(error != CL_SUCCESS) { throw OpenclError(error, "Error in synchronous read"); }
                // Kick off the disk write asynchronously and move on.
                saving_thread_flags[jobnum] = true;
                save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]);
            }
        }
        //Clean up
        for (unsigned int i = 0; i < paths.size(); i += 1) {
            if (saving_thread_flags[i]) {
                std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl;
                save_threads[i].wait();
                saving_thread_flags[i] = false;
                std::cerr << "done waiting for final save" << std::endl;
                // NOTE(review): this delete is nested inside the flag check,
                // so a buffer whose job never had a save in flight leaks —
                // likely intended one level out.
                if (buffersCpu[i]) { delete[] buffersCpu[i]; }
            }
        }
        if(kernelStep3) { clReleaseKernel(kernelStep3); }
        if(kernelStep2) { clReleaseKernel(kernelStep2); }
        if(kernelStep1) { clReleaseKernel(kernelStep1); }
        if(program) { clReleaseProgram(program); }
        if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); }
        if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); }
        if(commandQueue) { clReleaseCommandQueue(commandQueue); }
        if(context) { clReleaseContext(context); }
        // Final summary line (elapsedTime is in minutes).
        time_t currentTime = time(0);
        double elapsedTime = difftime(currentTime, startTime) / 60.0;
        double speed = (double)totalNonces / elapsedTime;
        std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)";
        std::cerr << ", " << speed << " nonces/minutes";
        std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s";
        std::cerr << " " << std::endl;
    } catch(const OpenclError& ex) {
        std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl;
        returnCode = -1;
    } catch(const std::exception& ex) {
        std::cerr << "[ERROR] " << ex.what() << std::endl;
        returnCode = -1;
    }
    return returnCode;
}
// Runs one backprop training pass with OpenCL offload: the input->hidden
// layer-forward runs on the device (kernel1), the remaining forward/error
// steps run on the host, then weight adjustment runs on the device (kernel2).
//
// Relies on globals presumably set up by initialize(): context, cmd_queue,
// device_list, and the BLOCK_SIZE / WIDTH / HEIGHT constants — declared
// elsewhere in this file; TODO confirm.
// NOTE(review): eo/eh are never written — out_err/hid_err stay in locals.
// Confirm whether callers expect the errors back through these pointers.
int bpnn_train_kernel(BPNN *net, float *eo, float *eh) {
    int in, hid, out;
    float out_err, hid_err;
    in = net->input_n;
    hid = net->hidden_n;
    out = net->output_n;
    //int use_device = 0; // use CPU as device
    int use_device = 2; // use GPU as device
    //int use_device = 2; // use FPGA as device
    // NOTE(review): the FPGA alternative above uses the same id (2) as GPU —
    // one of the two comments is stale.
    if(initialize(use_device)) return -1;
    int sourcesize = 1024*1024;
    // NOTE(review): `source` is never freed (and leaks on every early return).
    char * source = (char *)calloc(sourcesize, sizeof(char));
    if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
    // read the kernel core source
    char * kernel_bp1 = "bpnn_layerforward_ocl";
    char * kernel_bp2 = "bpnn_adjust_weights_ocl";
    char * tempchar = "./backprop_kernel.cl";
    char * krnl_file = "./binary/backprop_kernel_default.xclbin";
    cl_int err = 0;
    cl_program prog;
    // create program from source
    if (use_device < 2 ) {
        FILE * fp = fopen(tempchar, "rb");
        if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
        // source is zero-filled, so strlen(source) is 0 here; fread result unchecked.
        fread(source + strlen(source), sourcesize, 1, fp);
        fclose(fp);
        // compile kernel
        err = 0;
        const char * slist[2] = { source, 0 };
        prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
        if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
    }
    // create program from binary (FPGA/xclbin path)
    else {
        char *krnl_bin;
        const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);
        err = 0;
        prog = clCreateProgramWithBinary(context, 1, &device_list[0], &krnl_size, (const unsigned char**) &krnl_bin, NULL, &err);
        if ((!prog) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); exit(EXIT_FAILURE); }
    }
    err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
    { // show warnings/errors (build-log dump currently disabled)
        //static char log[65536]; memset(log, 0, sizeof(log));
        //cl_device_id device_id = 0;
        //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
        //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
        //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
    }
    if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
    cl_kernel kernel1;
    cl_kernel kernel2;
    // NOTE(review): kernel1, kernel2 and prog are never released below.
    kernel1 = clCreateKernel(prog, kernel_bp1, &err);
    if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; }
    kernel2 = clCreateKernel(prog, kernel_bp2, &err);
    if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; }
    /* clReleaseProgram(prog); */
    float *input_weights_one_dim;
    float *input_weights_prev_one_dim;
    float * partial_sum;
    float sum;
    // NOTE(review): float block count — fractional (and silently truncated in
    // the size_t work sizes below) when `in` is not a multiple of BLOCK_SIZE;
    // presumably the caller guarantees divisibility. TODO confirm.
    float num_blocks = in / BLOCK_SIZE;
    input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
    input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
    partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
    // set global and local workitems
    size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
    size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
    // this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
    // todo: fix mem allocation
    int m = 0;
    for (int k = 0; k <= in; k++) {
        for (int j = 0; j <= hid; j++) {
            input_weights_one_dim[m] = net->input_weights[k][j];
            input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
            m++;
        }
    }
    cl_mem input_hidden_ocl;
    cl_mem input_ocl;
    cl_mem output_hidden_ocl;
    cl_mem hidden_partial_sum;
    cl_mem hidden_delta_ocl;
    cl_mem input_prev_weights_ocl;
    input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
    if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
    input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
    if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
    output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
    if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
    hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
    if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
    hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
    if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
    input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
    if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
    printf("Performing GPU computation\n");
    //write buffers
    err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
    err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
    // Kernel1 args; args 4/5 are local (scratch) allocations, hence NULL.
    // NOTE(review): clSetKernelArg return codes are ignored here.
    clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
    clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
    clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
    clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
    clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL );
    clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL );
    clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
    clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
    err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
    if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
    if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
    // Blocking read of the per-block partial sums.
    err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; }
    // Host-side reduction + sigmoid for the hidden layer.
    // NOTE(review): partial_sum was allocated with stride WIDTH but is indexed
    // with stride hid — presumably hid == WIDTH; TODO confirm.
    for (int j = 1; j <= hid; j++) {
        sum = 0.0;
        for (int k = 0; k < num_blocks; k++) {
            sum += partial_sum[k * hid + j-1] ;
        }
        sum += net->input_weights[0][j];
        net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
    }
    // Remaining forward/backward steps on the host.
    bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
    bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
    bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
    bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
    // Upload deltas/weights for the device-side weight adjustment.
    err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
    err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
    err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
    clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
    clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
    clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
    clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
    clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
    clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
    err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
    // Read the adjusted values back (blocking).
    err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; }
    err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
    if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
    // Cleanup. NOTE(review): hidden_delta_ocl is never released, and the
    // read-back input_weights_one_dim is freed without being copied back into
    // net->input_weights — confirm whether that is intended.
    clReleaseMemObject(input_ocl);
    clReleaseMemObject(output_hidden_ocl);
    clReleaseMemObject(input_hidden_ocl);
    clReleaseMemObject(hidden_partial_sum);
    clReleaseMemObject(input_prev_weights_ocl);
    free(input_weights_prev_one_dim);
    free(partial_sum);
    free(input_weights_one_dim);
    // NOTE(review): declared to return int but falls off the end here —
    // undefined behavior in C++; a `return 0;` is missing.
}
void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) { cl_context context; cl_command_queue commandQueue; cl_program program; cl_kernel kernel; size_t dataBytes, kernelLength; cl_int errorCode; cl_mem gridBuffer; cl_device_id* devices; cl_device_id gpu; cl_uint numPlatforms; errorCode = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id platforms[numPlatforms]; errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL); checkError(errorCode); cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0}; context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode); checkError(errorCode); errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); devices = malloc(dataBytes); errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL); gpu = devices[0]; commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode); checkError(errorCode); gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode); checkError(errorCode); const char* programBuffer = readFile("kernel.cl"); kernelLength = strlen(programBuffer); program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode); checkError(errorCode); errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (errorCode == CL_BUILD_PROGRAM_FAILURE) { // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log free(log); printf("%s\n", log); } checkError(errorCode); kernel = clCreateKernel(program, "diffuse", &errorCode); checkError(errorCode); size_t localWorkSize[2] = {workGroupSize, workGroupSize}, 
globalWorkSize[2] = {width, width}; errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer); errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL); errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width); errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize); errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations); checkError(errorCode); errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); checkError(errorCode); errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL); checkError(errorCode); free(devices); free((void *)programBuffer); clReleaseContext(context); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); }
// One-time OpenCL setup for an n-element run: picks a GPU device (falling
// back to the CPU device), creates a context and command queue, builds
// kernel.cl, creates the "add" kernel, and allocates the device buffers.
//
// Writes into file-scope globals assumed declared elsewhere: err, cpu,
// device, returned_size, context, cmd_queue, program[], kernel[],
// buffer_size and the mem_* buffer handles — TODO confirm declarations.
// Returns err (CL_SUCCESS on the happy path).
int initGPU(int n) {
#pragma mark Device Information
    // Find the CPU CL device, as a fallback
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
    // NOTE(review): assert-based error handling disappears under NDEBUG.
    assert(err == CL_SUCCESS);
    // Find the GPU CL device, this is what we really want
    // If there is no GPU device is CL capable, fall back to CPU
    err |= clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) device = cpu;
    assert(device);
    // Get some information about the returned device
    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err |= clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
    err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
    assert(err == CL_SUCCESS);
    printf("Connecting to %s %s...", vendor_name, device_name);
#pragma mark Context and Command Queue
    // Now create a context to perform our calculation with the
    // specified device
    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    assert(err == CL_SUCCESS);
    // And also a command queue for the context
    // NOTE(review): the error-return argument is NULL here, so a failed
    // queue creation goes undetected.
    cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
#pragma mark Program and Kernel Creation
    // Load the program source from disk
    // The kernel/program is the project directory and in Xcode the executable
    // is set to launch from that directory hence we use a relative path
    const char * filename = "kernel.cl";
    // NOTE(review): program_source is never freed — leak.
    char *program_source = load_program_source(filename);
    program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err);
    assert(err == CL_SUCCESS);
    err |= clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
    assert(err == CL_SUCCESS);
    // Now create the kernel "objects" that we want to use in the example file
    kernel[0] = clCreateKernel(program[0], "add", &err);
    assert(err == CL_SUCCESS);
#pragma mark Memory Allocation
    // Allocate memory on the device to hold our data and store the results into
    buffer_size = sizeof(int) * n;
    // NOTE(review): err is overwritten by each call, so the assert below only
    // checks the LAST of the first four allocations.
    mem_c_position = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);
    mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);
    // Get all of the stuff written and allocated
    clFinish(cmd_queue);
    printf(" done\n");
    return err; // CL_SUCCESS
}
magma_err_t
magma_cgeqrf2_2q_gpu( magma_int_t m, magma_int_t n,
                      magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
                      magmaFloatComplex *tau, magma_err_t *info, magma_queue_t* queues)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======
    CGEQRF computes a QR factorization of a complex M-by-N matrix A:
    A = Q * R.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    TAU     (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -9, internal GPU memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================  */

    // Convenience macros: dA(i,j) expands to the buffer handle plus the
    // element offset; work_ref/hwork address the pinned host workspace.
    #define dA(a_1,a_2)    dA, (dA_offset + (a_1) + (a_2)*(ldda))
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))

    magmaFloatComplex_ptr dwork;
    magmaFloatComplex *work;
    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    /* Argument checks mirroring LAPACK xGEQRF conventions. */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return MAGMA_SUCCESS;

    nb = magma_get_cgeqrf_nb(m);   // tuned block size

    lwork = (m+n) * nb;
    lhwork = lwork - (m)*nb;

    if ( MAGMA_SUCCESS != magma_cmalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /*
    if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, lwork ) ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        magma_free( dwork );
        return *info;
    }
    */

    // Pinned host workspace via a mapped CL buffer (replaces the plain host
    // malloc commented out above).
    // NOTE(review): neither clCreateBuffer nor clEnqueueMapBuffer errors are
    // checked (error pointers are NULL); a failed map leaves work == NULL.
    cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaFloatComplex)*lwork, NULL, NULL);
    work = (magmaFloatComplex*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL);

    nbmin = 2;
    nx = 2*nb;        // crossover point to the unblocked tail
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially: queue 0 handles panel transfers,
           queue 1 applies the previous block's reflectors (lookahead). */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            // Download the current panel to host for the LAPACK factorization.
            magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queues[0], NULL);
            clFlush(queues[0]);
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i ), ldda, dwork,0, lddwork,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]);
                // Restore the previous panel's upper triangle on the device.
                magma_csetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork,
                                        dA(old_i, old_i), ldda, queues[1], NULL);
                clFlush(queues[1]);
            }
            magma_queue_sync(queues[0]);  // panel must be on host before factoring
            lapackf77_cgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib);
            // Zero the panel's upper triangle (saving it in hwork), upload V,
            // then restore the panel on the host side.
            cpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
            magma_csetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[0]);
            cq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
            if (i + ib < n) {
                // Upload T to the device workspace.
                magma_csetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queues[1]);
                if (i+nb < k-nx){
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i ), ldda, dwork,0, lddwork,
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]);
                    magma_queue_sync(queues[1]);
                }else {
                    // Last blocked step: update the whole trailing matrix and
                    // restore the diagonal block.
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i ), ldda, dwork,0, lddwork,
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]);
                    magma_csetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[1]);
                    clFlush(queues[1]);
                }
                old_i = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    magma_free(dwork);

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib = n-i;
        rows = m-i;
        magma_cgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queues[0]);
        lhwork = lwork - rows*ib;
        lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        magma_csetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queues[0]);
    }

    // Release the pinned host workspace.
    clEnqueueUnmapMemObject(queues[0], buffer, work, 0, NULL, NULL);
    clReleaseMemObject(buffer);
    // magma_free_cpu(work);
    return *info;
} /* magma_cgeqrf2_gpu */
int MemoryOptimizations::setupCL(void) { cl_int status = 0; size_t deviceListSize; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. 
*/ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) return SDK_FAILURE; /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetGetContextInfo failed.")) return SDK_FAILURE; /* Get Device specific Information */ /* Get device extensions */ char deviceExtensions[2048]; status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_EXTENSIONS, sizeof(deviceExtensions), deviceExtensions, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo failed.(extensions)")) return SDK_FAILURE; if(!strstr(deviceExtensions, "cl_khr_global_int32_base_atomics")) { sampleCommon->error("Device does not support global_int32_base_atomics!"); return SDK_EXPECTED_FAILURE; } status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return SDK_FAILURE; status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void 
*)&maxDimensions, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return SDK_FAILURE; maxWorkItemSizes = (size_t*)malloc(maxDimensions*sizeof(size_t)); status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)maxWorkItemSizes, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return SDK_FAILURE; { /* The block is to move the declaration of prop closer to its use */ cl_command_queue_properties prop = 0; prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); if(!sampleCommon->checkVal(status, 0, "clCreateCommandQueue failed.")) return SDK_FAILURE; } /* Input buffer */ inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float4) * length, 0, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. (inputBuffer)")) return SDK_FAILURE; /* Write data to buffer */ status = clEnqueueWriteBuffer(commandQueue, inputBuffer, 1, 0, sizeof(cl_float4) * length, input, 0, 0, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (inputBuffer)")) return SDK_FAILURE; outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * length, 0, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. 
(outputBuffer)")) return SDK_FAILURE; /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) { return SDK_FAILURE; } } else { kernelPath.append("MemoryOptimizations_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file: " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, &devices[deviceId], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. 
(buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; } /* Copy 1D Fast Path */ kernel[0] = clCreateKernel(program, "copy1DFastPath", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.(copy1DFastPath)")) return SDK_FAILURE; /* Copy 1D Complete Path */ kernel[1] = clCreateKernel(program, "copy1DCompletePath", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy1DCompletePath)")) return SDK_FAILURE; /* Copy 2D float */ kernel[2] = clCreateKernel(program, "copy2Dfloat", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy2Dfloat)")) return SDK_FAILURE; /* Copy 2D float4 */ kernel[3] = clCreateKernel(program, "copy2Dfloat4", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy2Dfloat4)")) return SDK_FAILURE; /* Copy 1D float4 */ kernel[4] = clCreateKernel(program, "copy1Dfloat4", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (copy1Dfloat4)")) return SDK_FAILURE; /* Copy No Coalesced */ kernel[5] = clCreateKernel(program, "NoCoal", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. (NoCoal)")) return SDK_FAILURE; /* Copy Split */ kernel[6] = clCreateKernel(program, "Split", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed. 
(Split)")) return SDK_FAILURE; return SDK_SUCCESS; }
/*
 * Benchmark driver for OpenCL dot-product / reduction kernels.
 *
 * Usage: prog [vector_size] [repetitions] [print_flag]
 *   argv[1] - input vector length (default 1)
 *   argv[2] - number of kernel repetitions (default 100)
 *   argv[3] - if present, enables result verification/printing
 *             (argv[3]/argv[4] used to carry global/local work sizes; see
 *              the commented-out parsing below)
 *
 * Prints "<size> - <throughput> - " per kernel (TLPERS/MEGAHERTZ scaled).
 * Returns 0 on success, small positive codes on setup failures.
 */
int main(int argc, char *argv[])
{
    int iGlobalSize = 1;
    // NOTE(review): iCheck3/iCheck4 are only used by the commented-out
    // argv[3]/argv[4] parsing below and are currently dead.
    int iCheck1, iCheck2, iCheck3, iCheck4;
    // NOTE(review): both are size_t initialized to -1, i.e. SIZE_MAX after
    // wraparound; iLocalWorkSize is later used in a clSetKernelArg size
    // computation (see below) -- confirm this is intentional.
    size_t iGlobalWorkSize = -1;
    size_t iLocalWorkSize = -1;
    if (argc > 1) // Size of input vector
    {
        iCheck1 = atoi(argv[1]);
        if (iCheck1 != 0)
        {
            iGlobalSize = iCheck1;
        }
    }
    int iNoReps = 100; // Number of repetitions.
    if (argc > 2)
    {
        iCheck2 = atoi(argv[2]);
        if (iCheck2 != 0)
        {
            iNoReps = iCheck2;
        }
    }
/*
    if (argc > 3) // Global work size
    {
        iCheck3 = atoi(argv[3]);
        if (iCheck3 != 0)
        {
            iGlobalWorkSize = iCheck3;
        }
    }
    if (argc > 4) // Local work size
    {
        iCheck4 = atoi(argv[4]);
        if (iCheck4 != 0)
        {
            iLocalWorkSize = iCheck4;
        }
    }
*/
    int bPrint = 0;
    if (argc > 3) // Originally 5.
    {
        bPrint = 1;
    }
//  printf("The global size is %d, the global work size is %ld, and the local work size is %ld. \n", iGlobalSize, iGlobalWorkSize, iLocalWorkSize);
/*
    size_t * ipGlobalWorkParam = NULL;
    if (iGlobalWorkSize != -1)
    {
        ipGlobalWorkParam = &iGlobalWorkSize;
    }
    size_t * ipLocalWorkParam = NULL;
    if (iLocalWorkSize != -1)
    {
        ipLocalWorkParam = &iLocalWorkSize;
    }
*/
    // Set up the OpenCL context + queue wrapper (project helper).
    GCAQ * TheGCAQ = GCAQSetup();
    if (TheGCAQ == NULL)
    {
        return 1;
    }
#if BIGFLOAT
    const char *szFloatOpt = "-DBIGFLOAT";
#else
    const char *szFloatOpt = NULL;
#endif
    // NOTE(review): six kernel sources are listed but only the first
    // iNoKernels (= 1) are built/run.
    const int iNoKernels = 1;
    char *ourKernelStrings[6] =
        { szDotProduct, szReduce, szDotProduct2, szReduce2, szDotProduct4, szReduce4};
    GPAK *TheGPAK = GPAKSetup(TheGCAQ, iNoKernels, ourKernelStrings, szFloatOpt);
    if (TheGPAK == NULL)
    {
        GCAQShutdown(TheGCAQ);
        return 2;
    }
    // Expected results computed on the host for verification.
    INTG iTypicalWorkgroupNo = TheGPAK->TheMaxWorkGroupSizes[0];
    INTG iExpOutputSize = ioutsize(iGlobalSize, iTypicalWorkgroupNo);
    FLPT * fExpDotProdResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    FLPT * fExpReduceResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    fdotprodexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpDotProdResult);
    freduceexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpReduceResult);
//  printvector("dot prod", iExpOutputSize, fExpDotProdResult);
//  printvector("reduce", iExpOutputSize, fExpReduceResult);
    // Host-side input and output arrays.
    FLPT* inputDataF = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFIncrease(iGlobalSize, inputDataF);
    // For the dot product.
    FLPT* outputDataD = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataD);
    // For the reduction.
    FLPT* outputDataR = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataR);
    // Per-kernel wall-clock timestamps.
    struct timespec start[iNoKernels];
    struct timespec end[iNoKernels];
    // create buffers for the input and ouput
    int err;
    cl_mem inputF, outputF, outputAll;
    inputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_READ_ONLY,
        iGlobalSize * sizeof(FLPT), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for F");
        return 3;
    }
    // NOTE(review): outputF is created and released but never bound to any
    // kernel -- appears to be dead; outputAll is the buffer actually used.
    outputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY,
        iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 7");
        return 9;
    }
    outputAll = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY,
        iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 8");
        return 9;
    }
    // Upload the input vector once; it is reused for every repetition.
    clEnqueueWriteBuffer(TheGCAQ->TheQueue, inputF, CL_TRUE, 0,
        iGlobalSize * sizeof(FLPT), inputDataF, 0, NULL, NULL);
    int iRep;
    int iKernel;
    int i;
    int iLengthTotal = iGlobalSize;
    size_t iGlobalWorkThing = iGlobalSize;
    int iSomething = 1;
    for (iKernel = 0; iKernel < iNoKernels; iKernel++)
    {
        // Clear host-side result arrays before timing this kernel.
        for (i = 0; i < iLengthTotal; i++)
        {
            outputDataD[i] = 0.0;
            outputDataR[i] = 0.0;
        }
        clock_gettime(CLOCK_MONOTONIC, &(start[iKernel]));
        for (iRep = 0; iRep < iNoReps; iRep++)
        {
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 0, sizeof(int), &iLengthTotal);
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 1, sizeof(cl_mem), &inputF);
            // NOTE(review): iLocalWorkSize is (size_t)-1 here (its argv
            // parsing is commented out above), so this local-memory size
            // wraps to an enormous value -- likely CL_INVALID_ARG_SIZE.
            // Confirm the intended size (probably the workgroup size).
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 2,
                iSomething * iLocalWorkSize * sizeof(float), NULL); // Was 3
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 3, sizeof(cl_mem), &outputAll); // Was 4
            clEnqueueNDRangeKernel(TheGCAQ->TheQueue, TheGPAK->TheKernels[iKernel],
                1, NULL, &iGlobalWorkThing,
                &(TheGPAK->TheMaxWorkGroupSizes[iKernel]), 0, NULL, NULL);
            clFinish(TheGCAQ->TheQueue);
            // copy the results from out of the output buffer
            // Even-indexed kernels are dot products, odd ones reductions.
            if (iKernel % 2 == 0)
            {
                clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0,
                    iExpOutputSize * sizeof(float), outputDataD, 0, NULL, NULL);
            }
            else
            {
                clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0,
                    iExpOutputSize * sizeof(float), outputDataR, 0, NULL, NULL);
            }
        }
        clock_gettime(CLOCK_MONOTONIC, &(end[iKernel]));
        // Verify against host-computed expected values (exact FP compare).
        if (bPrint)
        {
            for (i = 0; i < iExpOutputSize; i++)
            {
                if (iKernel % 2 == 0)
                {
                    if (outputDataD[i] != fExpDotProdResult[i])
                    {
                        printf
                            ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n",
                             iKernel, i, outputDataD[i], fExpDotProdResult[i]);
                        break;
                    }
                }
                else
                {
                    if (outputDataR[i] != fExpReduceResult[i])
                    {
                        printf
                            ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n",
                             iKernel, i, outputDataR[i], fExpReduceResult[i]);
                        break;
                    }
                }
            }
        }
//      if ((iKernel % 2) == 1)
//      {
//          iLengthTotal = iLengthTotal / 2;
//          iSomething = iSomething * 2;
//          iGlobalWorkThing = iGlobalWorkThing / 2;
//      }
    }
    clReleaseMemObject(inputF);
    clReleaseMemObject(outputF);
    clReleaseMemObject(outputAll);
    // print the results
//  if (bPrint)
//  {
//      printf("output %d: \n", iGlobalSize);
//      for(i=0;i<iExpOutputSize; i++)
//      {
//          printf("%d - %f - %f\n", i, outputDataD[i], outputDataR[i]);
//      }
//  }
    // cleanup - release OpenCL resources
    // NOTE(review): fExpDotProdResult/fExpReduceResult are not freed here.
    free(inputDataF);
    free(outputDataD);
    free(outputDataR);
    GPAKShutdown(TheGPAK);
    GCAQShutdown (TheGCAQ);
    // Report throughput per kernel from the wall-clock deltas.
    printf("%d - ", iGlobalSize);
    for (iKernel = 0; iKernel < iNoKernels; iKernel++)
    {
        printf("%f - ",
            (1.0 * TLPERS * iGlobalSize * iNoReps) /
            (MEGAHERTZ * timespecDiff(&(end[iKernel]), &(start[iKernel]))));
    }
    printf("\n");
    return 0;
}
// Create the data array in device memory for our calculation // cl_mem device_$arg_ref = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof($arg_ref[0]) * grid_size, NULL, NULL); if (!device_$arg_ref) { printf("Error: Failed to allocate device memory!\n"); return err; } // Write our data set into the data array in device memory // err = clEnqueueWriteBuffer(commands, device_$arg_ref, CL_TRUE, 0, sizeof($arg_ref[0]) * grid_size, $arg_ref, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); return err; } // Set the arguments to our compute kernel // err = clSetKernelArg(kernel, $arg_index, sizeof(cl_mem), &device_$arg_ref); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); return err; }
/*
 * Verifies clsparse{S,D}csrmv (sparse matrix-vector multiply, CSR format)
 * against a host reference computed from the uBLAS copies of the matrix.
 *
 * For float, the host reference accumulates each row in double; for double
 * it uses compensated (two_sum) summation. ULP statistics between device
 * and host results are collected, then each element is checked with
 * ASSERT_NEAR using a relative tolerance (tighter when extended precision
 * is enabled). Finally gY is recreated so the next test starts clean.
 *
 * Relies on fixture globals: CLSE (control/queue/context), CSRE (matrices),
 * gAlpha/gBeta/gX/gY (device operands), hAlpha/hBeta/hX/hY (host operands),
 * T and extended_precision (test parameters).
 */
void test_csrmv()
{
    clsparseStatus status;
    cl_int cl_status;

    clsparseEnableExtendedPrecision(CLSE::control, extended_precision);

    // ---- Single-precision path ----
    if (typeid(T) == typeid(cl_float) )
    {
        // Device computation: gY = gAlpha * A * gX + gBeta * gY.
        status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX,
                                &gBeta, &gY, CLSE::control);
        ASSERT_EQ(clsparseSuccess, status);

        // Raw CSR views of the host (uBLAS) copy of the matrix.
        float* vals = (float*)&CSRE::ublasSCsr.value_data()[0];
        int* rows = &CSRE::ublasSCsr.index1_data()[0];
        int* cols = &CSRE::ublasSCsr.index2_data()[0];

        // Host reference: same operation computed row by row.
        for (int row = 0; row < CSRE::n_rows; row++)
        {
            // Summation done at a higher precision to decrease
            // summation errors from rounding.
            hY[row] *= hBeta;
            int row_end = rows[row+1];
            double temp_sum;
            temp_sum = hY[row];
            for (int i = rows[row]; i < rows[row+1]; i++)
            {
                // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
                temp_sum += hAlpha * vals[i] * hX[cols[i]];
            }
            hY[row] = temp_sum;
        }

        // Map the device result for host-side comparison (blocking map).
        T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
                                                   CL_TRUE, CL_MAP_READ,
                                                   0, gY.num_values * sizeof(T),
                                                   0, nullptr, nullptr, &cl_status);
        ASSERT_EQ(CL_SUCCESS, cl_status);

        // Collect ULP-distance statistics between host and device results.
        uint64_t max_ulps = 0;
        uint64_t min_ulps = UINT64_MAX;
        uint64_t total_ulps = 0;
        for (int i = 0; i < hY.size(); i++)
        {
            long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
            intDiff = llabs(intDiff);
            total_ulps += intDiff;
            if (max_ulps < intDiff)
                max_ulps = intDiff;
            if (min_ulps > intDiff)
                min_ulps = intDiff;
            // Debug printouts.
            //std::cout << "Row " << i << " Float Ulps: " << intDiff << std::endl;
            //std::cout.precision(9);
            //std::cout << "\tFloat hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint32_t *)&hY[i] << "), " << std::dec;
            //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint32_t *)&host_result[i] << ")" << std::dec << std::endl;
        }
#ifndef NDEBUG
        if (extended_precision)
        {
            std::cout << "Float Min ulps: " << min_ulps << std::endl;
            std::cout << "Float Max ulps: " << max_ulps << std::endl;
            std::cout << "Float Total ulps: " << total_ulps << std::endl;
            std::cout << "Float Average ulps: " << (double)total_ulps/(double)hY.size() << " (Size: " << hY.size() << ")" << std::endl;
        }
#endif

        // Element-wise tolerance check; relative tolerance with an absolute
        // floor of 10*FLT_EPSILON for tiny/denormal values.
        for (int i = 0; i < hY.size(); i++)
        {
            double compare_val = 0.;
            if (extended_precision)
            {
                // The limit here is somewhat weak because some GPUs don't
                // support correctly rounded denorms in SPFP mode.
                if (boost::math::isnormal(hY[i]))
                    compare_val = fabs(hY[i]*1e-3);
            }
            else
            {
                if (boost::math::isnormal(hY[i]))
                    compare_val = fabs(hY[i]*0.1);
            }
            if (compare_val < 10*FLT_EPSILON)
                compare_val = 10*FLT_EPSILON;
            ASSERT_NEAR(hY[i], host_result[i], compare_val);
        }

        cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
                                              host_result, 0, nullptr, nullptr);
        ASSERT_EQ(CL_SUCCESS, cl_status);
    }

    // ---- Double-precision path ----
    if (typeid(T) == typeid(cl_double) )
    {
        status = clsparseDcsrmv(&gAlpha, &CSRE::csrDMatrix, &gX,
                                &gBeta, &gY, CLSE::control);
        ASSERT_EQ(clsparseSuccess, status);

        double* vals = (double*)&CSRE::ublasDCsr.value_data()[0];
        int* rows = &CSRE::ublasDCsr.index1_data()[0];
        int* cols = &CSRE::ublasDCsr.index2_data()[0];

        for (int row = 0; row < CSRE::n_rows; row++)
        {
            // Summation done using a compensated summation to decrease
            // summation errors from rounding. This allows us to get
            // smaller errors without requiring quad precision support.
            // This method is like performing summation at quad precision and
            // casting down to double in the end.
            hY[row] *= hBeta;
            int row_end = rows[row+1];
            double temp_sum;
            temp_sum = hY[row];
            T sumk_err = 0.;
            for (int i = rows[row]; i < rows[row+1]; i++)
            {
                // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
                temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err);
            }
            hY[row] = temp_sum + sumk_err;
        }

        T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
                                                   CL_TRUE, CL_MAP_READ,
                                                   0, gY.num_values * sizeof(T),
                                                   0, nullptr, nullptr, &cl_status);
        ASSERT_EQ(CL_SUCCESS, cl_status);

        uint64_t max_ulps = 0;
        uint64_t min_ulps = ULLONG_MAX;
        uint64_t total_ulps = 0;
        for (int i = 0; i < hY.size(); i++)
        {
            long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
            intDiff = llabs(intDiff);
            total_ulps += intDiff;
            if (max_ulps < intDiff)
                max_ulps = intDiff;
            if (min_ulps > intDiff)
                min_ulps = intDiff;
            // Debug printouts.
            //std::cout << "Row " << i << " Double Ulps: " << intDiff << std::endl;
            //std::cout.precision(17);
            //std::cout << "\tDouble hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint64_t *)&hY[i] << "), " << std::dec;
            //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint64_t *)&host_result[i] << ")" << std::dec << std::endl;
        }
        if (extended_precision)
        {
#ifndef NDEBUG
            std::cout << "Double Min ulps: " << min_ulps << std::endl;
            std::cout << "Double Max ulps: " << max_ulps << std::endl;
            std::cout << "Double Total ulps: " << total_ulps << std::endl;
            std::cout << "Double Average ulps: " << (double)total_ulps/(double)hY.size() << " (Size: " << hY.size() << ")" << std::endl;
#endif
            // Extended precision: tight relative tolerance (1e-14).
            for (int i = 0; i < hY.size(); i++)
            {
                double compare_val = fabs(hY[i]*1e-14);
                if (compare_val < 10*DBL_EPSILON)
                    compare_val = 10*DBL_EPSILON;
                ASSERT_NEAR(hY[i], host_result[i], compare_val);
            }
        }
        else
        {
            // Standard precision: loose 10% relative tolerance.
            for (int i = 0; i < hY.size(); i++)
            {
                double compare_val = 0.;
                if (boost::math::isnormal(hY[i]))
                    compare_val = fabs(hY[i]*0.1);
                if (compare_val < 10*DBL_EPSILON)
                    compare_val = 10*DBL_EPSILON;
                ASSERT_NEAR(hY[i], host_result[i], compare_val);
            }
        }

        cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
                                              host_result, 0, nullptr, nullptr);
        ASSERT_EQ(CL_SUCCESS, cl_status);
    }

    // Reset output buffer for next test.
    ::clReleaseMemObject(gY.values);
    clsparseInitVector(&gY);
    gY.values = clCreateBuffer(CLSE::context,
                               CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
                               hY.size() * sizeof(T),
                               hY.data().begin(), &cl_status);
    gY.num_values = hY.size();
    ASSERT_EQ(CL_SUCCESS, cl_status);
}
inline void vector_sum(const int arraySize, const double* inputA, const double* inputB, double* output) { /* Allocate memory buffers */ /* * Ask the OpenCL implementation to allocate buffers for the data. * We ask the OpenCL implemenation to allocate memory rather than * allocating it on the CPU to avoid having to copy the data later. * The read/write flags relate to accesses to the memory from within * the kernel. */ bool createMemoryObjectSuccess = true; int numberOfMemoryObjects = 3; cl_mem memoryObjects[3] = {0, 0, 0}; int errorNumber = 0; int bufferSize = arraySize*sizeof(double); memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputA, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 1."); memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputB, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 2."); memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, output, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 3."); /* Enqueue commands and kernels */ /* Enqueue to the command queues the commands that control the sequence * and synchronization of kernel execution, reading and writing of data, * and manipulation of memory objects */ /* Execute a kernel function */ /* Call clSetKernelArg() for each parameter in the kernel */ bool setKernelArgumentsSuccess = true; setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2])); if (not setKernelArgumentsSuccess) { cleanUpOpenCL(); std::cerr << "Failed setting OpenCL kernel arguments. 
" << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Determine the work-group size and index space for the kernel */ const size_t globalWorkSize[1] = {arraySize}; const size_t localWorkSize[1] = { 1 }; /* Enqueue the kernel for execution in the command queue */ //for (int j = 0; j < ITER; j++) { if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":" << __LINE__ <<std::endl; exit(1); } //} /* Get a pointer to the output data */ output = (double*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, arraySize, 0, NULL, NULL, &errorNumber); if (not checkSuccess(errorNumber)) { cleanUpOpenCL(); std::cerr << "Failed to map buffer " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } /* Wait for kernel execution */ if (not checkSuccess(clFinish(commandQueue))) { cleanUpOpenCL(); std::cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Unmap the memory objects as we finished using them in the CPU */ if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clReleaseMemObject(memoryObjects[1]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } }
int main() { // START:context cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); // END:context // START:queue cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); // END:queue // START:kernel char* source = read_source("multiply_arrays.cl"); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); free(source); clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL); // END:kernel // START:buffers cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS]; random_fill(a, NUM_ELEMENTS); random_fill(b, NUM_ELEMENTS); cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, a, NULL); cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, b, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL); // END:buffers // START:execute clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB); clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); size_t work_units = NUM_ELEMENTS; clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL); // END:execute // START:results cl_float results[NUM_ELEMENTS]; clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS, results, 0, NULL, NULL); // END:results // START:cleanup clReleaseMemObject(inputA); clReleaseMemObject(inputB); clReleaseMemObject(output); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); // END:cleanup for (int i = 0; i < NUM_ELEMENTS; ++i) { printf("%f * %f = %f\n", a[i], b[i], results[i]); } return 0; }
int32_t init_kernel_platform() { cl_uint plat_num; cl_platform_id plat_id = NULL; cl_uint dev_num = 0; cl_device_id *devices; ret = clGetPlatformIDs(0, NULL, &plat_num); if (ret < 0) { LOGD("MU1 Error: Getting plat_ids!\n"); return -1; } if(plat_num > 0) { cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id)); ret = clGetPlatformIDs(plat_num, plat_ids, NULL); plat_id = plat_ids[0]; free(plat_ids); } ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num); if (dev_num == 0) { LOGD("MU1: No GPU device available.\n"); LOGD("MU1: Choose CPU as default device.\n"); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num); devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id)); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL); } else { LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num); devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id)); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL); } context = clCreateContext(NULL,1, devices,NULL,NULL,NULL); commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL); char filename[] = "/data/mu1_kernel.cl"; char file_context[10*1024]={0}; const char *source = &file_context[0]; ret = read_cl(filename, &file_context[0]); size_t sourceSize[10] = {strlen(source)}; cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize[0], NULL); ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if(ret < 0) { LOGD("MU1 Error: clBuildProgram error\n"); return 0; } kernel = clCreateKernel(program, "process_iq", NULL); inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_i[0][0]), NULL); inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_q[0][0]), NULL); inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void 
*)(&table_o[0][0]), NULL); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o); if(devices != NULL) { free(devices);} LOGD("MU1: init cl plat success"); return 0; }
////////////////////////////////////////////////////////////////////////////////
// Main program
//
// Runs an 8x8 DCT of a 2048x2048 random image on the GPU, times it (both by
// wall clock and by OpenCL event profiling), computes the same transform on
// the CPU, and passes QA when the relative L2 error is below 1e-3.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context cxGPUContext;         //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command que
    cl_mem d_Input, d_Output;        //OpenCL memory buffer objects

    cl_int ciErrNum;

    float *h_Input, *h_OutputCPU, *h_OutputGPU;

    const uint imageW = 2048, imageH = 2048, stride = 2048;
    const int dir = DCT_FORWARD;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclDCT8x8.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    shrLog("Allocating and initializing host memory...\n");
    h_Input     = (float *)malloc(imageH * stride * sizeof(float));
    h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
    h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
    // Fixed seed so every run transforms the same random image.
    srand(2009);
    for(uint i = 0; i < imageH; i++)
        for(uint j = 0; j < imageW; j++)
            h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;

    shrLog("Initializing OpenCL...\n");
    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Get a GPU device
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Create a command-queue
    // Profiling enabled so the event-based timing below can read timestamps.
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL DCT 8x8...\n");
    initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
    d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        imageH * stride * sizeof(cl_float), h_Input, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
        imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
    //Just a single iteration or a warmup iteration
    DCT8x8(
        cqCommandQueue,
        d_Output,
        d_Input,
        stride,
        imageH,
        imageW,
        dir
    );

#define GPU_PROFILING 1
#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    // Bracket the timed loop with markers so event profiling can measure it.
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    // NOTE(review): the timed iterations pass NULL as the command queue --
    // presumably DCT8x8 falls back to the queue saved by initDCT8x8;
    // confirm against the DCT8x8 implementation.
    for(int iter = 0; iter < numIterations; iter++)
        DCT8x8(
            NULL,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

    ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
        (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);

    //Get profiler time
    // Both markers report COMMAND_END; the difference spans the timed loop.
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    // Event timestamps are in nanoseconds.
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n");
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0,
        imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n");
    DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir);
    // Relative L2 norm of (GPU - CPU) against the CPU reference.
    double sum = 0, delta = 0;
    double L2norm;
    for(uint i = 0; i < imageH; i++)
        for(uint j = 0; j < imageW; j++){
            sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j];
            delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]);
        }
    L2norm = sqrt(delta / sum);
    shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    shrLog("Shutting down...\n");
    //Release kernels and program
    closeDCT8x8();
    //Release other OpenCL objects
    ciErrNum = clReleaseMemObject(d_Output);
    ciErrNum |= clReleaseMemObject(d_Input);
    ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
    ciErrNum |= clReleaseContext(cxGPUContext);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Release host buffers
    free(h_OutputGPU);
    free(h_OutputCPU);
    free(h_Input);

    //Finish
    shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-3) ? QA_PASSED : QA_FAILED);
}
int main(int argc, char **argv) { printf("enter demo main\n"); fflush(stdout); putenv("POCL_VERBOSE=1"); putenv("POCL_DEVICES=basic"); putenv("POCL_LEAVE_TEMP_DIRS=1"); putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1"); putenv("POCL_TEMP_DIR=pocl"); putenv("POCL_CACHE_DIR=pocl"); putenv("POCL_WORK_GROUP_METHOD=spmd"); if(argc >= 2){ printf("argv[1]:%s:\n",argv[1]); if(!strcmp(argv[1], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[1], "c")) putenv("POCL_CROSS_COMPILE=1"); } if(argc >= 3){ printf("argv[2]:%s:\n",argv[2]); if(!strcmp(argv[2], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[2], "c")) putenv("POCL_CROSS_COMPILE=1"); } //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib"); //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath()); cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, 
devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", devices_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err)); cl_kernel kernel = 0; cl_mem memObjects[2] = {0,0}; // Create OpenCL program - first attempt to load cached binary. // If that is not available, then create the program from source // and store the binary for future use. std::cout << "Attempting to create program from binary..." << std::endl; cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin"); if (program == NULL) { std::cout << "Binary not loaded, create from source..." 
<< std::endl; program = CreateProgram(context, devices[1], "kernel.cl"); if (program == NULL) { Cleanup(context, queue, program, kernel, memObjects); return 1; } std::cout << "Save program binary for future run..." << std::endl; if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false) { std::cerr << "Failed to write program binary" << std::endl; Cleanup(context, queue, program, kernel, memObjects); return 1; } } else { std::cout << "Read program from binary." << std::endl; } printf("attempting to create input buffer\n"); fflush(stdout); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); printf("attempting to create output buffer\n"); fflush(stdout); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); memObjects[0] = input_buffer; memObjects[1] = output_buffer; double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;; printf("attempting to create kernel\n"); fflush(stdout); kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err)); printf("setting up kernel args cl_mem:%lx \n",input_buffer); fflush(stdout); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); printf("attempting to enqueue write buffer\n"); fflush(stdout); for (int i=0; i<NUM_DATA; i++) { double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;; CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; printf("attempting to enqueue kernel\n"); fflush(stdout); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); printf("Enqueue'd kerenel\n"); fflush(stdout); cl_ulong time_start, time_end; 
CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsed = time_end - time_start; printf("time(ns):%lg\n",elapsed); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { double data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL)); //printf(" %lg", data); } printf("\n"); CL_CHECK(clReleaseMemObject(memObjects[0])); CL_CHECK(clReleaseMemObject(memObjects[1])); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); DUMMY_ARGS_USAGE_2(ldb, offBX); initialize_scalars(alpha, beta); buffer_.n_ = N; buffer_.k_ = K; buffer_.offA_ = offA; buffer_.offC_ = offCY; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; buffer_.a_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = K; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.trans_a_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = N; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, 
CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), NULL, &err); }
/*
 * (Re)size the plugin's shared vertex/triangle device buffers so they can
 * hold every mesh in mesh_data_list, then upload all meshes back-to-back.
 * Buffers are reallocated only when the combined size exceeds the cached
 * capacity.  Returns 0 on success, -1 on any OpenCL failure (CHECK_CL_ERROR
 * jumps to the error label, where any not-yet-adopted buffer is released).
 */
static cl_int opencl_plugin_init_mesh_buffers(opencl_plugin plugin,
                                              cl_int mesh_data_count,
                                              mesh_data *mesh_data_list)
{
    cl_int err;
    cl_int idx;
    cl_mem fresh_vertex_buf = NULL, fresh_triangle_buf = NULL;
    cl_int vert_count = 0, tri_count = 0;

    assert(plugin != NULL);
    assert(mesh_data_count >= 0);
    assert(mesh_data_list != NULL);

    /* First pass: total capacity needed across all meshes. */
    for (idx = 0; idx < mesh_data_count; idx++) {
        vert_count += mesh_data_list[idx].num_vertices;
        tri_count += mesh_data_list[idx].num_triangles;
    }

    if (vert_count > plugin->vertex_buffer_capacity) {
        /* Too small: release the old buffer before allocating a bigger one. */
        if (plugin->vertex_buffer) {
            clReleaseMemObject(plugin->vertex_buffer);
            plugin->vertex_buffer = NULL;
        }
        plugin->vertex_buffer_capacity = 0;

        /* TODO: Maybe better dynamic resizing (factor = 1.5)? */
        fresh_vertex_buf =
            clCreateBuffer(plugin->context, CL_MEM_READ_ONLY,
                           sizeof(float) * 3 * vert_count, NULL, &err);
        CHECK_CL_ERROR(err);

        plugin->vertex_buffer_capacity = vert_count;
        plugin->vertex_buffer = fresh_vertex_buf;
        fresh_vertex_buf = NULL; /* ownership transferred to the plugin */
    }

    if (tri_count > plugin->triangle_buffer_capacity) {
        /* Too small: release the old buffer before allocating a bigger one. */
        if (plugin->triangle_buffer) {
            clReleaseMemObject(plugin->triangle_buffer);
            plugin->triangle_buffer = NULL;
        }
        plugin->triangle_buffer_capacity = 0;

        /* TODO: Maybe better dynamic resizing (factor = 1.5)? */
        fresh_triangle_buf =
            clCreateBuffer(plugin->context, CL_MEM_READ_ONLY,
                           sizeof(cl_int) * 3 * tri_count, NULL, &err);
        CHECK_CL_ERROR(err);

        plugin->triangle_buffer_capacity = tri_count;
        plugin->triangle_buffer = fresh_triangle_buf;
        fresh_triangle_buf = NULL; /* ownership transferred to the plugin */
    }

    /* Second pass: enqueue uploads, reusing the counters as running offsets. */
    vert_count = 0;
    tri_count = 0;
    for (idx = 0; idx < mesh_data_count; idx++) {
        mesh_data *mesh = &mesh_data_list[idx];

        err = clEnqueueWriteBuffer(
            plugin->queue, plugin->vertex_buffer, CL_FALSE,
            sizeof(float) * 3 * vert_count,
            sizeof(float) * 3 * mesh->num_vertices,
            mesh->vertices, 0, NULL, NULL);
        CHECK_CL_ERROR(err);

        err = clEnqueueWriteBuffer(
            plugin->queue, plugin->triangle_buffer, CL_FALSE,
            sizeof(cl_int) * 3 * tri_count,
            sizeof(cl_int) * 3 * mesh->num_triangles,
            mesh->triangles, 0, NULL, NULL);
        CHECK_CL_ERROR(err);

        vert_count += mesh_data_list[idx].num_vertices;
        tri_count += mesh_data_list[idx].num_triangles;
    }

    /* Wait for all buffer writes to finish, TODO: investigate this further */
    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    return 0;

error:
    if (fresh_vertex_buf)
        clReleaseMemObject(fresh_vertex_buf);
    if (fresh_triangle_buf)
        clReleaseMemObject(fresh_triangle_buf);
    return -1;
}
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) { struct bench_args_t *args = (struct bench_args_t *)vargs; // Create device buffers // cl_mem obs_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->obs), NULL, NULL); cl_mem init_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->init), NULL, NULL); cl_mem transition_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->transition), NULL, NULL); cl_mem emission_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->emission), NULL, NULL); cl_mem path_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->path), NULL, NULL); if (!obs_buffer || !init_buffer || !transition_buffer || !emission_buffer || !path_buffer) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); exit(1); } // Write our data set into device buffers // int err; err = clEnqueueWriteBuffer(commands, obs_buffer, CL_TRUE, 0, sizeof(args->obs), args->obs, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, init_buffer, CL_TRUE, 0, sizeof(args->init), args->init, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, transition_buffer, CL_TRUE, 0, sizeof(args->transition), args->transition, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, emission_buffer, CL_TRUE, 0, sizeof(args->emission), args->emission, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to device memory!\n"); printf("Test failed\n"); exit(1); } // Set the arguments to our compute kernel // err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &obs_buffer); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &init_buffer); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &transition_buffer); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &emission_buffer); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &path_buffer); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! 
%d\n", err); printf("Test failed\n"); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else printf("Error: OpenCL kernel is not currently supported!\n"); exit(1); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); exit(1); } // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, path_buffer, CL_TRUE, 0, sizeof(args->path), args->path, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); exit(1); } }
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) { // create random input data on CPU unsigned int bytes = maxN * sizeof(T); T* h_idata = (T*)malloc(bytes); for(int i = 0; i < maxN; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE); // allocate mem for the result on host side T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T)); // allocate device memory and data cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL); cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL); int testIterations = 100; double dTotalTime = 0.0; // print headers shrLog("Time in seconds for various numbers of elements for each kernel\n"); shrLog("\n\n"); shrLog("Kernel"); for (int i = minN; i <= maxN; i *= 2) { shrLog(", %d", i); } for (int kernel = 0; kernel < 7; kernel++) { shrLog("\n"); shrLog("%d", kernel); for (int i = minN; i <= maxN; i *= 2) { int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); double reduceTime; if( numBlocks <= MAX_BLOCK_DIM_SIZE ) { profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata); reduceTime = dTotalTime/(double)testIterations; } else { reduceTime = -1.0; } shrLog(", %.4f m", reduceTime); } } // cleanup free(h_idata); free(h_odata); clReleaseMemObject(d_idata); clReleaseMemObject(d_odata); }
int main(int argc, char* argv[]) { struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } struct pb_TimerSet timers; char oclOverhead[] = "OCL Overhead"; char intermediates[] = "IntermediatesKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(parameters); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; 
cl_command_queue clCommandQueue; cl_program clProgram[2]; cl_kernel histo_intermediates_kernel; cl_kernel histo_final_kernel; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); size_t program_length[2]; const char *source_path[2] = { "src/opencl_mxpa/histo_intermediates.cl", "src/opencl_mxpa/histo_final.cl"}; char *source[4]; for (int i = 0; i < 2; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } for (int i = 0; i < 2; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) ); /* char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = 
'\0'; fprintf(stderr, "%s\n", build_log ); */ } histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. 
Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int)); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, y*img_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) ); size_t inter_localWS[1] = { workItemSizes[0] }; size_t inter_globalWS[1] = { img_height * inter_localWS[0] }; size_t final_localWS[1] = { workItemSizes[0] }; size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) / (int)final_localWS[0])*(int)final_localWS[0] }; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // 
__syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes histo_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); pb_DestroyTimerSet(&timers); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); return 0; }
bool runTest( int argc, const char** argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads; cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1); clReleaseKernel(reductionKernel); if (smallBlock) maxThreads = 64; // number of threads per block else maxThreads = 128; int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size); shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads); shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel); shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks); shrLog(" %d elements\n", size); shrLog(" %d threads (max)\n", maxThreads); cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE); shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold); bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE); #ifdef GPU_PROFILING if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); return true; } else #endif { // create random input data on CPU unsigned int bytes = size * sizeof(T); T* h_idata = (T*)malloc(bytes); for(int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) cpuFinalThreshold = 1; shrLog(" %d blocks\n\n", numBlocks); // allocate mem for the result on host side T* h_odata = (T*)malloc(numBlocks * sizeof(T)); // allocate device memory and data cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL); cl_mem d_odata = clCreateBuffer(cxGPUContext, 
CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL); int testIterations = 100; double dTotalTime = 0.0; T gpu_result = 0; gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, &dTotalTime, h_odata, d_idata, d_odata); #ifdef GPU_PROFILING double reduceTime = dTotalTime/(double)testIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); #endif // compute reference solution shrLog("\nComparing against Host/C++ computation...\n"); T cpu_result = reduceCPU<T>(h_idata, size); if (datatype == REDUCE_INT) { shrLog(" GPU result = %d\n", gpu_result); shrLog(" CPU result = %d\n\n", cpu_result); shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED"); } else { shrLog(" GPU result = %.9f\n", gpu_result); shrLog(" CPU result = %.9f\n\n", cpu_result); double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12; double diff = abs((double)gpu_result - (double)cpu_result); shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED"); } // cleanup free(h_idata); free(h_odata); clReleaseMemObject(d_idata); clReleaseMemObject(d_odata); return (gpu_result == cpu_result); } }
// Initializes all OpenCL state for the sample: selects a device type (GPU,
// with CPU fallback), creates the context, command queue, input/output
// buffers, builds the kernel program and clamps numSteps to the kernel's
// work-group limit.  Returns SDK_SUCCESS, or fails via the CHECK_* macros.
int BinomialOption::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;

    // Choose the device type requested on the command line; fall back to
    // CPU when a GPU was requested but none is available.
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    status = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(context,
                                            devices[deviceId],
                                            prop,
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    // Create and initialize memory objects
    // Set persistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    // if(isAmdPlatform())
    //     inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    // Create memory object for stock price
    randBuffer = clCreateBuffer(context,
                                inMemFlags,
                                numSamples * sizeof(cl_float4),
                                NULL,
                                &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuffer)");

    // Create memory object for output array
    outBuffer = clCreateBuffer(context,
                               CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                               numSamples * sizeof(cl_float4),
                               NULL,
                               &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outBuffer)");

    // create a CL program using the kernel source
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("BinomialOption_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());
    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program,
                            "binomial_options",
                            &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");

    // If group-size is greater than maximum supported on kernel, shrink
    // numSteps so the kernel launch fits the device limit.
    if((size_t)(numSteps + 1) > kernelInfo.kernelWorkGroupSize)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << (numSteps + 1) << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelInfo.kernelWorkGroupSize << std::endl;
            std::cout << "Using appropiate group-size." << std::endl;
            std::cout << "-------------------------------------------" << std::endl;
        }
        // NOTE(review): -2 (not -1) keeps numSteps+1 strictly below the
        // work-group limit -- presumably intentional headroom; confirm.
        numSteps = (cl_int)kernelInfo.kernelWorkGroupSize - 2;
    }

    return SDK_SUCCESS;
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY, bufParam; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); int lenParam = 5; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL); /* Call clblas function. 
*/ err = clblasSrotm(N, bufX, 0, incx, bufY, 0, incy, bufParam, 0, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSrotm() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROTM placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufParam); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(int argc, char **argv) { cl_int ret; /* * Command line */ char *binary_path; if (argc != 2) { printf("syntax: %s <binary>\n", argv[0]); exit(1); } binary_path = argv[1]; /* * Platform */ /* Get platform */ cl_platform_id platform; cl_uint num_platforms; ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: second call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n", platform_name); printf("\n"); /* * Device */ /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* * Context */ /* Create context */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } /* * Command Queue */ /* Create command queue */ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("\n"); /* * Program */ /* Program binary */ const unsigned char *binary; size_t binary_length; /* Read binary */ binary = read_buffer(binary_path, &binary_length); if (!binary) { printf("error: %s: cannot open 
binary\n", binary_path); exit(1); } /* Create a program */ cl_program program; program = clCreateProgramWithBinary(context, 1, &device, &binary_length, &binary, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* * Kernel */ /* Create kernel */ cl_kernel kernel; kernel = clCreateKernel(program, "vector_add", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } printf("\n"); /* * Buffers */ /* Create and allocate host buffers */ size_t num_elem = 10; cl_int *src1_host_buffer; cl_int *src2_host_buffer; cl_int *dst_host_buffer; src1_host_buffer = malloc(num_elem * sizeof(cl_int)); src2_host_buffer = malloc(num_elem * sizeof(cl_int)); dst_host_buffer = malloc(num_elem * sizeof(cl_int)); /* Initialize host source buffer */ int i; for (i = 0; i < num_elem; i++) { src1_host_buffer[i] = i; src2_host_buffer[i] = 100; } /* Create device source buffers */ cl_mem src1_device_buffer; cl_mem src2_device_buffer; src1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); src2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); if (!src1_device_buffer || !src2_device_buffer) { printf("error: could not create destination buffer\n"); exit(1); } /* Create device destination buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem * sizeof(cl_int), 
NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create destination buffer\n"); exit(1); } /* Copy buffer */ ret = clEnqueueWriteBuffer(command_queue, src1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src1_host_buffer, 0, NULL, NULL); ret |= clEnqueueWriteBuffer(command_queue, src2_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src2_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* * Kernel arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src2_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* * Launch Kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; /* Launch the kernel */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* * Result */ /* Receive buffer */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Print result */ for (i = 0; i < num_elem; i++) printf("dst_host_buffer[%d] = %d\n", i, dst_host_buffer[i]); printf("\n"); return 0; }
void test_variable_opencl_func(void *buffers[], void *args) { STARPU_SKIP_IF_VALGRIND; int id, devid, ret; int factor = *(int *) args; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]); cl_context context; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); starpu_opencl_get_context(devid, &context); cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(int), &variable_config.copy_failed, &err); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "variable_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(fail), &fail); if (err) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(factor), &factor); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global = 1; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } err = clEnqueueReadBuffer(queue, fail, CL_TRUE, 0, sizeof(int), &variable_config.copy_failed, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); ret = starpu_opencl_unload_opencl(&opencl_program); STARPU_CHECK_RETURN_VALUE(ret, 
"starpu_opencl_unload_opencl"); return; }
/**
 * @brief One-time OpenCL setup for the FastWalshTransform sample: selects a
 * platform and device, creates the context and command queue, allocates the
 * in-place transform buffer, and builds the "fastWalshTransform" kernel.
 * @return SDK_SUCCESS on success; the CHECK_ERROR / CHECK_OPENCL_ERROR macros
 *         return early with a failure code otherwise.
 */
int FastWalshTransform::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    // Resolve the requested device type; fall back to CPU when no GPU exists.
    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //sampleArgs->deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId, sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed");

    // Display available devices.
    retValue = displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };
    context = clCreateContextFromType( cps, dType, NULL, NULL, &status);
    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = getDevices(context, &devices, sampleArgs->deviceId, sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue( context, devices[sampleArgs->deviceId], prop, &status);
        CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed.");
    }

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    // Single read-write buffer: the transform is done in place on `length` floats.
    inputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_float) * length, 0, &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)");

    // create a CL program using the kernel source
    buildProgramData buildData;
    buildData.kernelName = std::string("FastWalshTransform_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string("");
    if(sampleArgs->isLoadBinaryEnabled())
    {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }
    if(sampleArgs->isComplierFlagsSpecified())
    {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    retValue = buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program, "fastWalshTransform", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    return SDK_SUCCESS;
}
int main(int argc, char **argv) { int start,end; unsigned long p[64], c[64], k[56]; unsigned long res; build_samples (p, c, k, 0); set_low_keys(k); cl_platform_id cpPlatform; clGetPlatformIDs(1, &cpPlatform, NULL); cl_device_id cdDevice; clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); char cBuffer[1024]; clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DEVICE_NAME:\t\t%s\n", cBuffer); clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DRIVER_VERSION:\t%s\n\n", cBuffer); cl_uint compute_units; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units); size_t workitem_dims; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims); size_t workitem_size[3]; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); size_t workgroup_size; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size); cl_uint clock_frequency; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL); printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency); cl_context GPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, NULL); cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL); cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, p, NULL); cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, c, 
NULL); cl_mem GPUVector3 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 56, k, NULL); cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned long), &res, NULL); size_t szKernelLength; char* cSourceCL = oclLoadProgSource("ocl_deseval.cl", "", &szKernelLength); cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, (const char **)&cSourceCL, &szKernelLength, NULL); if (clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL)!=CL_SUCCESS) { char cBuffer[2048]; if(clGetProgramBuildInfo(OpenCLProgram,cdDevice,CL_PROGRAM_BUILD_LOG,sizeof(cBuffer),cBuffer,NULL)==CL_SUCCESS); printf("Build error:\n%s\n",cBuffer); exit(1); } cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "keysearch", NULL); clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector); clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1); clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2); clSetKernelArg(OpenCLVectorAdd, 3, sizeof(cl_mem), (void*)&GPUVector3); size_t WorkSize[1] = {1024}; start=clock(); for (int i=0; i<1024; i++) { //clEnqueueWriteBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // 56 * sizeof(unsigned long), k, 0, NULL, NULL); clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL); //clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // sizeof(unsigned long), &res, 0, NULL, NULL); if(res!=0) { printf("Key found\n"); //key_found(res,k); break; } increment_key (k); } end=clock(); clReleaseKernel(OpenCLVectorAdd); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUVector1); clReleaseMemObject(GPUVector2); clReleaseMemObject(GPUOutputVector); printf ("Searched %i keys in %.3f seconds\n", 1000000, ((double)(end-start))/CLOCKS_PER_SEC); return 0; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasStbsv(order, uplo, trans, diag, N, K, bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStbsv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. 
*/ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STBSV placed in X array. */ printResult(); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main() { size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 }; cl_int err; cl_platform_id platforms[1]; cl_uint nplatforms; cl_device_id devices[1]; // + 1 for duplicate test cl_uint num_devices; cl_program program = NULL; cl_kernel kernelA = NULL; cl_kernel kernelB = NULL; cl_kernel kernelC= NULL; char inputA[] = "A"; char inputB[] = "B"; char inputC[] = "C"; cl_mem inputBufferA = NULL; cl_mem inputBufferB = NULL; cl_mem inputBufferC = NULL; /* command queues */ cl_command_queue queueA = NULL; cl_command_queue queueB = NULL; cl_command_queue queueC = NULL; /* events */ cl_event eventA1 = NULL; cl_event eventB2 = NULL; cl_event eventA3 = NULL; cl_event eventB4 = NULL; /* event wait lists */ cl_event B2_wait_list[1]; cl_event A3_wait_list[1]; cl_event B4_wait_list[1]; cl_event C5_wait_list[2]; err = clGetPlatformIDs(1, platforms, &nplatforms); if (err != CL_SUCCESS && !nplatforms) return EXIT_FAILURE; err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 1, devices, &num_devices); if (err != CL_SUCCESS) return EXIT_FAILURE; cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), devices, NULL); if (err != CL_SUCCESS) { puts("clGetContextInfo call failed\n"); goto error; } queueA = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queueA) { puts("clCreateCommandQueue call failed\n"); goto error; } queueB = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queueB) { puts("clCreateCommandQueue call failed\n"); goto error; } queueC = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queueB) { puts("clCreateCommandQueue call failed\n"); goto error; } inputBufferA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, strlen (inputB)+1, (void *) inputA, &err); if (inputBufferA == NULL) { printf("clCreateBuffer call failed err = %d\n", err); goto error; } inputBufferB = 
clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, strlen (inputA)+1, (void *) inputB, &err); if (inputBufferB == NULL) { printf("clCreateBuffer call failed err = %d\n", err); goto error; } inputBufferC = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, strlen (inputA)+1, (void *) inputC, &err); if (inputBufferC == NULL) { printf("clCreateBuffer call failed err = %d\n", err); goto error; } size_t kernel_size = strlen (kernelASourceCode); char* kernel_buffer = kernelASourceCode; program = clCreateProgramWithSource (context, 1, (const char**)&kernel_buffer, &kernel_size, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; err = clBuildProgram (program, num_devices, devices, NULL, NULL, NULL); if (err != CL_SUCCESS) return EXIT_FAILURE; kernelA = clCreateKernel (program, "test_kernel", NULL); if (!kernelA) { puts("clCreateKernel call failed\n"); goto error; } kernelB = clCreateKernel (program, "test_kernel", NULL); if (!kernelB) { puts("clCreateKernel call failed\n"); goto error; } kernelC = clCreateKernel (program, "test_kernel", NULL); if (!kernelC) { puts("clCreateKernel call failed\n"); goto error; } err = clSetKernelArg (kernelA, 0, sizeof (cl_mem), &inputBufferA); if (err) { puts("clSetKernelArg failed\n"); goto error; } err = clSetKernelArg (kernelB, 0, sizeof (cl_mem), &inputBufferB); if (err) { puts("clSetKernelArg failed\n"); goto error; } err = clSetKernelArg (kernelC, 0, sizeof (cl_mem), &inputBufferC); if (err) { puts("clSetKernelArg failed\n"); goto error; } /* first enqueue A1*/ err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, local_work_size, 0, NULL, &eventA1); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue B2 */ B2_wait_list[0] = eventA1; err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, local_work_size, 1, B2_wait_list, &eventB2); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue 
A3 */ A3_wait_list[0] = eventB2; err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, local_work_size, 1, A3_wait_list, &eventA3); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue B4 */ B4_wait_list[0] = eventA3; err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, local_work_size, 1, B4_wait_list, &eventB4); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } /* enqueue C5 */ C5_wait_list[0] = eventA3; C5_wait_list[1] = eventB4; err = clEnqueueNDRangeKernel (queueC, kernelC, 1, NULL, global_work_size, local_work_size, 2, C5_wait_list, NULL); if (err != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } clFinish(queueC); printf("\n"); return EXIT_SUCCESS; error: return EXIT_FAILURE; }
/*
 * Host harness for the "not_equal_ulong16ulong16" kernel: loads a
 * precompiled device binary (argv[1]), runs the kernel over two 10-element
 * cl_ulong16 source buffers (every lane set to 2), and dumps the cl_int16
 * comparison results to "<argv[0]>.result".
 *
 * Relies on the file-scope helpers read_buffer() and write_buffer().
 * Exits with status 1 on any failure; returns 0 on success.
 */
int main(int argc, char const *argv[])
{
    /* Get platform */
    cl_platform_id platform;
    cl_uint num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clGetPlatformIDs' failed\n");
        exit(1);
    }
    printf("Number of platforms: %d\n", num_platforms);
    printf("platform=%p\n", platform);

    /* Get platform name */
    char platform_name[100];
    ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clGetPlatformInfo' failed\n");
        exit(1);
    }
    printf("platform.name='%s'\n\n", platform_name);

    /* Get device */
    cl_device_id device;
    cl_uint num_devices;
    ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clGetDeviceIDs' failed\n");
        exit(1);
    }
    printf("Number of devices: %d\n", num_devices);
    printf("device=%p\n", device);

    /* Get device name */
    char device_name[100];
    ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clGetDeviceInfo' failed\n");
        exit(1);
    }
    printf("device.name='%s'\n", device_name);
    printf("\n");

    /* Create a Context Object */
    cl_context context;
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clCreateContext' failed\n");
        exit(1);
    }
    printf("context=%p\n", context);

    /* Create a Command Queue Object*/
    cl_command_queue command_queue;
    command_queue = clCreateCommandQueue(context, device, 0, &ret);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clCreateCommandQueue' failed\n");
        exit(1);
    }
    printf("command_queue=%p\n", command_queue);
    printf("\n");

    /* Program binary */
    unsigned char *bin;
    size_t bin_len;
    cl_int bin_ret; /* per-device binary load status from clCreateProgramWithBinary */

    /* Read program binary */
    if (argc == 2)
        bin = read_buffer((char *)argv[1], &bin_len);
    else
    {
        printf("error: No binary specified\n");
        exit(1);
    }

    /* Create a program */
    cl_program program;
    program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clCreateProgramWithBinary' failed\n");
        exit(1);
    }
    if (bin_ret != CL_SUCCESS)
    {
        printf("error: Invalid binary for device\n");
        exit(1);
    }
    printf("program=%p\n", program);

    /* Free binary (the OpenCL implementation keeps its own copy) */
    free(bin);
    printf("program binary loaded\n");
    printf("\n");

    ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (ret != CL_SUCCESS )
    {
        size_t size;
        char *log;

        /* Get log size */
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

        /* Allocate log and print */
        log = malloc(size);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
        printf("error: call to 'clBuildProgram' failed:\n%s\n", log);

        /* Free log and exit */
        free(log);
        exit(1);
    }
    printf("program built\n");
    printf("\n");

    /* Create a Kernel Object*/
    cl_kernel kernel;
    kernel = clCreateKernel(program, "not_equal_ulong16ulong16", &ret);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clCreateKernel' failed\n");
        exit(1);
    }

    /* Create and allocate host buffers */
    size_t num_elem = 10;

    /* Create and init host side src buffer 0 (all 16 lanes = 2) */
    cl_ulong16 *src_0_host_buffer;
    src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong16));
    for (int i = 0; i < num_elem; i++)
        src_0_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};

    /* Create and init device side src buffer 0 */
    cl_mem src_0_device_buffer;
    src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        printf("error: could not create source buffer\n");
        exit(1);
    }
    ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_0_host_buffer, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clEnqueueWriteBuffer' failed\n");
        exit(1);
    }

    /* Create and init host side src buffer 1 (identical to buffer 0) */
    cl_ulong16 *src_1_host_buffer;
    src_1_host_buffer = malloc(num_elem * sizeof(cl_ulong16));
    for (int i = 0; i < num_elem; i++)
        src_1_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};

    /* Create and init device side src buffer 1 */
    cl_mem src_1_device_buffer;
    src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        printf("error: could not create source buffer\n");
        exit(1);
    }
    ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_1_host_buffer, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clEnqueueWriteBuffer' failed\n");
        exit(1);
    }

    /* Create host dst buffer, pre-filled with a nonzero byte pattern so
       that unwritten results are distinguishable */
    cl_int16 *dst_host_buffer;
    dst_host_buffer = malloc(num_elem * sizeof(cl_int16));
    memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16));

    /* Create device dst buffer */
    cl_mem dst_device_buffer;
    dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        printf("error: could not create dst buffer\n");
        exit(1);
    }

    /* Set kernel arguments */
    ret = CL_SUCCESS;
    ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
    ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clSetKernelArg' failed\n");
        exit(1);
    }

    /* Launch the kernel (one work-group covering all elements) */
    size_t global_work_size = num_elem;
    size_t local_work_size = num_elem;
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
        exit(1);
    }

    /* Wait for it to finish */
    clFinish(command_queue);

    /* Read results from GPU */
    ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clEnqueueReadBuffer' failed\n");
        exit(1);
    }

    /* Dump dst buffer to file */
    char dump_file[100];
    sprintf((char *)&dump_file, "%s.result", argv[0]);
    write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16));
    printf("Result dumped to %s\n", dump_file);

    /* Free host dst buffer */
    free(dst_host_buffer);

    /* Free device dst buffer */
    ret = clReleaseMemObject(dst_device_buffer);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clReleaseMemObject' failed\n");
        exit(1);
    }

    /* Free host side src buffer 0 */
    free(src_0_host_buffer);

    /* Free device side src buffer 0 */
    ret = clReleaseMemObject(src_0_device_buffer);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clReleaseMemObject' failed\n");
        exit(1);
    }

    /* Free host side src buffer 1 */
    free(src_1_host_buffer);

    /* Free device side src buffer 1 */
    ret = clReleaseMemObject(src_1_device_buffer);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clReleaseMemObject' failed\n");
        exit(1);
    }

    /* Release kernel */
    ret = clReleaseKernel(kernel);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clReleaseKernel' failed\n");
        exit(1);
    }

    /* Release program */
    ret = clReleaseProgram(program);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clReleaseProgram' failed\n");
        exit(1);
    }

    /* Release command queue */
    ret = clReleaseCommandQueue(command_queue);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clReleaseCommandQueue' failed\n");
        exit(1);
    }

    /* Release context */
    ret = clReleaseContext(context);
    if (ret != CL_SUCCESS)
    {
        printf("error: call to 'clReleaseContext' failed\n");
        exit(1);
    }

    return 0;
}
int main() { int i,j,k; // nb of operations: const int dsize = 512; int nthreads = 1; int nbOfAverages = 1e2; int opsMAC = 2; // operations per MAC cl_short4 *in, *out; cl_half *ck; double tops; //total ops #define NQUEUES 1 cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queues[NQUEUES]; cl_mem bufin, bufck, bufout; cl_event event = NULL; cl_program program; cl_kernel kernel; size_t global[2], local[2]; size_t param[5]; char version[300]; // allocate matrices in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in)); out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out)); ck = (cl_half *) calloc(9*9, sizeof(*ck)); in[0].x = 0x3c00; in[1].x = 0x4000; in[dsize].x = 0x4100; ck[0] = 0x3c00; ck[1] = 0x4000; ck[9] = 0x3000; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); for(i = 0; i < NQUEUES; i++) queues[i] = clCreateCommandQueue( ctx, device, 0, &err ); // Print some info about the system clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL); printf("CL_DEVICE_VERSION=%s\n", version); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL); printf("CL_DRIVER_VERSION=%s\n", version); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]); j = param[0]; 
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES="); for(i = 0; i < j; i++) printf("%d ", (int)param[i]); printf("\n"); clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); if(!program) { printf("Error creating program\n"); return -1; } err = clBuildProgram(program, 0, 0, 0, 0, 0); if(err != CL_SUCCESS) { char buffer[20000]; size_t len; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); puts(buffer); return -1; } kernel = clCreateKernel(program, "conv9x9", &err); if(!kernel || err != CL_SUCCESS) { printf("Error creating kernel\n"); return -1; } /* Prepare OpenCL memory objects and place matrices inside them. */ cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT}; cl_int rc; bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck), NULL, &err ); size_t origin[3] = {0,0,0}; size_t region[3] = {dsize, dsize, 1}; err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL ); clSetKernelArg(kernel, 0, sizeof(int), &dsize); clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin); clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck); clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout); local[0] = 8; local[1] = 8; global[0] = global[1] = dsize-32; usleep(100000); struct timeval start,end; gettimeofday(&start, NULL); for (k=0; k<nthreads; k++) { //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); for(i=0;i<nbOfAverages;i++) { // do 
the 2D convolution err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("clEnqueueNDRangeKernel error %d\n", err); return -1; } } } clFinish(queues[0]); gettimeofday(&end, NULL); double t = ((double) (end.tv_sec - start.tv_sec)) + ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified! /* Wait for calculations to be finished. */ /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL ); clFinish(queues[0]); printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x); /* Release OpenCL memory objects. */ clReleaseMemObject( bufin ); clReleaseMemObject( bufck ); clReleaseMemObject( bufout ); /* Release OpenCL working objects. */ for(i = 0; i < NQUEUES; i++) clReleaseCommandQueue( queues[i] ); clReleaseContext( ctx ); // report performance: tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads); printf("\nTime in s: %lf:", t); printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9); printf("\n"); return(0); }