bool SkCLImageDiffer::loadKernelStream(SkStream* stream, const char name[], cl_kernel* kernel) {
    // Slurp the whole stream into a string buffer sized up front from the
    // stream length, then delegate to the source-string loader.
    SkString source;
    source.resize(stream->getLength());
    const size_t wanted = source.size();
    if (stream->read(source.writable_str(), wanted) != wanted) {
        // Short read: the stream lied about its length or I/O failed.
        SkDebugf("Failed to read kernel source file");
        return false;
    }
    return loadKernelSource(source.c_str(), name, kernel);
}
autoencoder_GPU::autoencoder_GPU() : autoencoder() {
    // Initialize the OpenCL environment (context, queue, device) on device 0.
    gpu_init(gpu_env, 0);

    // Helper: allocate a read-write device buffer of `count` floatType elements.
    // NOTE(review): gpu_env.status is overwritten by every call and never
    // checked here — only the last status survives, exactly as in the original.
    auto makeBuffer = [&](size_t count) -> cl_mem {
        return clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE,
                              count * sizeof(floatType), NULL, &gpu_env.status);
    };

    // Weight matrices between consecutive layers.
    d_weight0 = makeBuffer(nLayerSize0 * nLayerSize1);
    d_weight1 = makeBuffer(nLayerSize1 * nLayerSize2);
    d_weight2 = makeBuffer(nLayerSize2 * nLayerSize3);
    d_weight3 = makeBuffer(nLayerSize3 * nLayerSize4);
    d_weight4 = makeBuffer(nLayerSize4 * nLayerSize5);
    d_weight5 = makeBuffer(nLayerSize5 * nLayerSize6);
    d_weight6 = makeBuffer(nLayerSize6 * nLayerSize7);
    d_weight7 = makeBuffer(nLayerSize7 * nLayerSize8);

    // Per-layer bias vectors (biases belong to the *destination* layer).
    d_bias0 = makeBuffer(nLayerSize1);
    d_bias1 = makeBuffer(nLayerSize2);
    d_bias2 = makeBuffer(nLayerSize3);
    d_bias3 = makeBuffer(nLayerSize4);
    d_bias4 = makeBuffer(nLayerSize5);
    d_bias5 = makeBuffer(nLayerSize6);
    d_bias6 = makeBuffer(nLayerSize7);
    d_bias7 = makeBuffer(nLayerSize8);

    // Per-layer activations and back-propagated errors, one column per vector
    // in the mini-batch. Layer 4 additionally keeps sampled binary states.
    d_layer0act   = makeBuffer(nLayerSize0 * nVectorPerBatch);
    d_layer0err   = makeBuffer(nLayerSize0 * nVectorPerBatch);
    d_layer1act   = makeBuffer(nLayerSize1 * nVectorPerBatch);
    d_layer1err   = makeBuffer(nLayerSize1 * nVectorPerBatch);
    d_layer2act   = makeBuffer(nLayerSize2 * nVectorPerBatch);
    d_layer2err   = makeBuffer(nLayerSize2 * nVectorPerBatch);
    d_layer3act   = makeBuffer(nLayerSize3 * nVectorPerBatch);
    d_layer3err   = makeBuffer(nLayerSize3 * nVectorPerBatch);
    d_layer4act   = makeBuffer(nLayerSize4 * nVectorPerBatch);
    d_layer4err   = makeBuffer(nLayerSize4 * nVectorPerBatch);
    d_layer4state = makeBuffer(nLayerSize4 * nVectorPerBatch);
    d_layer5act   = makeBuffer(nLayerSize5 * nVectorPerBatch);
    d_layer5err   = makeBuffer(nLayerSize5 * nVectorPerBatch);
    d_layer6act   = makeBuffer(nLayerSize6 * nVectorPerBatch);
    d_layer6err   = makeBuffer(nLayerSize6 * nVectorPerBatch);
    d_layer7act   = makeBuffer(nLayerSize7 * nVectorPerBatch);
    d_layer7err   = makeBuffer(nLayerSize7 * nVectorPerBatch);
    d_layer8act   = makeBuffer(nLayerSize8 * nVectorPerBatch);
    d_layer8err   = makeBuffer(nLayerSize8 * nVectorPerBatch);

    // Gradient accumulators, mirroring the weight/bias shapes above.
    d_delta_weight0 = makeBuffer(nLayerSize0 * nLayerSize1);
    d_delta_weight1 = makeBuffer(nLayerSize1 * nLayerSize2);
    d_delta_weight2 = makeBuffer(nLayerSize2 * nLayerSize3);
    d_delta_weight3 = makeBuffer(nLayerSize3 * nLayerSize4);
    d_delta_weight4 = makeBuffer(nLayerSize4 * nLayerSize5);
    d_delta_weight5 = makeBuffer(nLayerSize5 * nLayerSize6);
    d_delta_weight6 = makeBuffer(nLayerSize6 * nLayerSize7);
    d_delta_weight7 = makeBuffer(nLayerSize7 * nLayerSize8);
    d_delta_bias0   = makeBuffer(nLayerSize1);
    d_delta_bias1   = makeBuffer(nLayerSize2);
    d_delta_bias2   = makeBuffer(nLayerSize3);
    d_delta_bias3   = makeBuffer(nLayerSize4);
    d_delta_bias4   = makeBuffer(nLayerSize5);
    d_delta_bias5   = makeBuffer(nLayerSize6);
    d_delta_bias6   = makeBuffer(nLayerSize7);
    d_delta_bias7   = makeBuffer(nLayerSize8);

    // Reconstruction-error vector (input-sized, one column per batch vector).
    d_error = makeBuffer(nLayerSize0 * nVectorPerBatch);

    // Build the OpenCL kernel program from source.
    char* source = new char[KERNEL_SOURCE_LENGTH];
    loadKernelSource("../src/gpu_rbm.cl", source);
    gpu_env.prog = clCreateProgramWithSource(gpu_env.ctx, 1, (const char**)&source,
                                             NULL, &gpu_env.status);
    // clCreateProgramWithSource copies the source text, so the host buffer can
    // (and must) be released here — the original leaked it.
    delete[] source;

    gpu_env.status = clBuildProgram(gpu_env.prog, 0, NULL, NULL, NULL, NULL);
    if (gpu_env.status == CL_BUILD_PROGRAM_FAILURE) {
        // Fetch and print the compiler log, then abort.
        size_t log_size;
        clGetProgramBuildInfo(gpu_env.prog, gpu_env.device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &log_size);
        char* log = (char*)malloc(log_size);
        clGetProgramBuildInfo(gpu_env.prog, gpu_env.device, CL_PROGRAM_BUILD_LOG,
                              log_size, log, NULL);
        printf("%s\n", log);
        free(log);  // was leaked in the original
        exit(1);    // was exit(0): a failed build must not report success
    }

    // Helper: create a kernel object by name; status handling as above.
    auto makeKernel = [&](const char* kernelName) -> cl_kernel {
        return clCreateKernel(gpu_env.prog, kernelName, &gpu_env.status);
    };
    squareError   = makeKernel("squareError");
    sigmoid       = makeKernel("sigmoid");
    addBias       = makeKernel("addBias");
    sumBatch      = makeKernel("sumBatch");
    add           = makeKernel("add");
    getStates     = makeKernel("getStates");
    updateWeights = makeKernel("updateWeights");
    updateBias    = makeKernel("updateBias");
    randNum       = makeKernel("PRNG_threefry4x32");
    randn         = makeKernel("PRNGn_threefry4x32");
    reset         = makeKernel("reset");
    rounding      = makeKernel("rounding");
    subtract      = makeKernel("subtract");
    deriv         = makeKernel("deriv");
    updateAE      = makeKernel("update");

    // Helper: blocking upload of `count` floatType elements to a device buffer.
    auto upload = [&](cl_mem dst, size_t count, const void* src) {
        gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, dst, CL_TRUE, 0,
                                              count * sizeof(floatType), src,
                                              0, NULL, NULL);
    };

    // Copy the host-side initial weights and biases (set up by the base
    // autoencoder constructor) onto the device.
    upload(d_weight0, nLayerSize0 * nLayerSize1, weight0);
    upload(d_weight1, nLayerSize1 * nLayerSize2, weight1);
    upload(d_weight2, nLayerSize2 * nLayerSize3, weight2);
    upload(d_weight3, nLayerSize3 * nLayerSize4, weight3);
    upload(d_weight4, nLayerSize4 * nLayerSize5, weight4);
    upload(d_weight5, nLayerSize5 * nLayerSize6, weight5);
    upload(d_weight6, nLayerSize6 * nLayerSize7, weight6);
    upload(d_weight7, nLayerSize7 * nLayerSize8, weight7);
    upload(d_bias0, nLayerSize1, bias0);
    upload(d_bias1, nLayerSize2, bias1);
    upload(d_bias2, nLayerSize3, bias2);
    upload(d_bias3, nLayerSize4, bias3);
    upload(d_bias4, nLayerSize5, bias4);
    upload(d_bias5, nLayerSize6, bias5);
    upload(d_bias6, nLayerSize7, bias6);
    upload(d_bias7, nLayerSize8, bias7);
}
int main(int argc, char ** argv) { cl_float4 * x = initializePositions(); std::string* source = loadKernelSource("src/kernels.cl"); // Get available platforms std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); // Select the default platform and create a context using this platform and the GPU cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 }; cl::Context context(CL_DEVICE_TYPE_GPU, cps); // Get a list of devices on this platform std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); // Create a command queue and use the first device cl::CommandQueue queue = cl::CommandQueue(context, devices[0]); cl::Program::Sources sources(1, std::make_pair(source->c_str(), source->length()+1)); // Make program of the source code in the context cl::Program program = cl::Program(context, sources); // Build program for these specific devices try{ program.build(devices); } catch(cl::Error error) { std::cout << error.what() << "(" << error.err() << ")" << std::endl; std::string build_log; build_log = program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[0]); std::cout << "Build status: " << build_log << std::endl; build_log = program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(devices[0]); std::cout << "Build options: " << build_log << std::endl; build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); std::cout << "Build log: " << build_log << std::endl; exit(0); } // Create buffers for points, centers of mass, and bins cl::Buffer pointsBuffer = cl::Buffer(context, CL_MEM_READ_ONLY, POINTS * sizeof(cl_float4)); cl::Buffer cmBuffer = cl::Buffer(context, CL_MEM_READ_WRITE, BINS * sizeof(cl_float4)); cl::Buffer binsBuffer = cl::Buffer(context, CL_MEM_READ_WRITE, POINTS * sizeof(unsigned int)); cl::Buffer offsetsBuffer = cl::Buffer(context, CL_MEM_READ_WRITE, BINS * sizeof(unsigned int)); // Upload points to GPU and compute the centers of mass queue.enqueueWriteBuffer(pointsBuffer, CL_TRUE, 0, 
POINTS * sizeof(cl_float4), x); computeBins(context, queue, program, &pointsBuffer, &cmBuffer); // Generate bin offsets genOffsets(context, queue, program, &cmBuffer, &offsetsBuffer); // Sort the bins, and then compute all forces sortBins(context, queue, program, &pointsBuffer, &offsetsBuffer, &binsBuffer); cl_float4* a = forces(context, queue, program, &pointsBuffer, &cmBuffer, &offsetsBuffer, &binsBuffer); for(int i = 0; i < POINTS; i++){ printf("(%2.2f,%2.2f,%2.2f,%2.2f) (%2.3f,%2.3f,%2.3f)\n", x[i].x, x[i].y, x[i].z, x[i].w, a[i].x, a[i].y, a[i].z); } free(x); return 0; }