Пример #1
0
int CommandGenerate::execute(const std::vector<std::string>& p_args) {
	if(p_args.size() < 10) {
		help();
		return -1;
	}

	unsigned int platformId = atol(p_args[1].c_str());
	unsigned int deviceId = atol(p_args[2].c_str());
	unsigned int staggerSize = atol(p_args[3].c_str());
	unsigned int threadsNumber = atol(p_args[4].c_str());
	unsigned int hashesNumber = atol(p_args[5].c_str());
	unsigned int nonceSize = PLOT_SIZE * staggerSize;

	std::cerr << "Threads number: " << threadsNumber << std::endl;
	std::cerr << "Hashes number: " << hashesNumber << std::endl;

	unsigned int numjobs = (p_args.size() - 5)/4;
	std::cerr << numjobs << " plot(s) to do." << std::endl;
	unsigned int staggerMbSize = staggerSize / 4;
	std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl;
	
	std::vector<std::string> paths(numjobs);
	std::vector<std::ofstream *> out_files(numjobs);
	std::vector<unsigned long long> addresses(numjobs);
	std::vector<unsigned long long> startNonces(numjobs);
	std::vector<unsigned long long> endNonces(numjobs);
	std::vector<unsigned int> noncesNumbers(numjobs);
	std::vector<unsigned char*> buffersCpu(numjobs);
	std::vector<bool> saving_thread_flags(numjobs);
	std::vector<std::future<void>> save_threads(numjobs);
	unsigned long long maxNonceNumber = 0;
	unsigned long long totalNonces = 0;

	int returnCode = 0;

	try {
		for (unsigned int i = 0; i < numjobs; i++) {
			std::cerr << "----" << std::endl;
			std::cerr << "Job number " << i << std::endl;
			unsigned int argstart = 6 + i*4;
			paths[i] = std::string(p_args[argstart]);
			addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10);
			startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10);
			noncesNumbers[i] = atol(p_args[argstart+3].c_str());
			maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]);
			totalNonces += noncesNumbers[i];

			std::ostringstream outFile;
			outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \
				noncesNumbers[i] << "_" << staggerSize;
			std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc;
			out_files[i] = new std::ofstream(outFile.str(), file_mode);
			assert(out_files[i]);

			if(noncesNumbers[i] % staggerSize != 0) {
				noncesNumbers[i] -= noncesNumbers[i] % staggerSize;
				noncesNumbers[i] += staggerSize;
			}

			endNonces[i] = startNonces[i] + noncesNumbers[i];
			unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024;
			std::cerr << "Path: " << outFile.str() << std::endl;
			std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl;
			std::cerr << "Creating CPU buffer" << std::endl;
			buffersCpu[i] = new unsigned char[nonceSize];
			if(!buffersCpu[i]) {
				throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)");
			}
			saving_thread_flags[i] = false;
			std::cerr << "----" << std::endl;
		}

		cl_platform_id platforms[4];
		cl_uint platformsNumber;
		cl_device_id devices[32];
		cl_uint devicesNumber;
		cl_context context = 0;
		cl_command_queue commandQueue = 0;
		cl_mem bufferGpuGen = 0;
		cl_mem bufferGpuScoops = 0;
		cl_program program = 0;
		cl_kernel kernelStep1 = 0;
		cl_kernel kernelStep2 = 0;
		cl_kernel kernelStep3 = 0;

		int error;

		std::cerr << "Retrieving OpenCL platforms" << std::endl;
		error = clGetPlatformIDs(4, platforms, &platformsNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL platforms");
		}

		if(platformId >= platformsNumber) {
			throw std::runtime_error("No platform found with the provided id");
		}

		std::cerr << "Retrieving OpenCL GPU devices" << std::endl;
		error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL devices");
		}

		if(deviceId >= devicesNumber) {
			throw std::runtime_error("No device found with the provided id");
		}

		std::cerr << "Creating OpenCL context" << std::endl;
		context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL context");
		}

		std::cerr << "Creating OpenCL command queue" << std::endl;
		commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL command queue");
		}

		std::cerr << "Creating OpenCL GPU generation buffer" << std::endl;
		bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer");
		}

		std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl;
		bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer");
		}

		std::cerr << "Creating OpenCL program" << std::endl;
		std::string source = loadSource("kernel/nonce.cl");
		const char* sources[] = {source.c_str()};
		size_t sourcesLength[] = {source.length()};
		program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL program");
		}

		std::cerr << "Building OpenCL program" << std::endl;
		error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0);
		if(error != CL_SUCCESS) {
			size_t logSize;
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);

			char* log = new char[logSize];
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0);
			std::cerr << log << std::endl;
			delete[] log;

			throw OpenclError(error, "Unable to build the OpenCL program");
		}

		std::cerr << "Creating OpenCL step1 kernel" << std::endl;
		kernelStep1 = clCreateKernel(program, "nonce_step1", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		std::cerr << "Creating OpenCL step2 kernel" << std::endl;
		kernelStep2 = clCreateKernel(program, "nonce_step2", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		std::cerr << "Creating OpenCL step3 kernel" << std::endl;
		kernelStep3 = clCreateKernel(program, "nonce_step3", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize);
		error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
		error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		size_t globalWorkSize = staggerSize;
		size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber;
		time_t startTime = time(0);
		unsigned int totalNoncesCompleted = 0;
		for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) {
			for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) {
				unsigned long long nonce = startNonces[jobnum] + nonce_ordinal;
				if (nonce > endNonces[jobnum]) {
				  break;
				}

				std::cout << "Running with start nonce " << nonce << std::endl;
				// Is a cl_ulong always an unsigned long long?
				unsigned int error = 0;
				error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments");
				}
				error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments");
				}

				error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in step1 kernel launch");
				}

				unsigned int hashesSize = hashesNumber * HASH_SIZE;
				for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) {
					error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce);
					error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset);
					error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments");
					}

					error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Error in step2 kernel launch");
					}

					error = clFinish(commandQueue);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Error in step2 kernel finish");
					}
				}

				totalNoncesCompleted += staggerSize;
				double percent = 100.0 * (double)totalNoncesCompleted / totalNonces;
				time_t currentTime = time(0);
				double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0;
				double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed;
				std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)";
				std::cerr << ", " << speed << " nonces/minutes";
				std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s";
				std::cerr << "...                    ";

				error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in step3 kernel launch");
				}

				if (saving_thread_flags[jobnum]) {
					save_threads[jobnum].wait(); // Wait for last job to finish
					saving_thread_flags[jobnum] = false;
				}

				error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in synchronous read");
				}
				saving_thread_flags[jobnum] = true;
				save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]);
			}
		}

		//Clean up
		for (unsigned int i = 0; i < paths.size(); i += 1) {
		  if (saving_thread_flags[i]) {
		    std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl;
		    save_threads[i].wait();
		    saving_thread_flags[i] = false;
		    std::cerr << "done waiting for final save" << std::endl;
		    if (buffersCpu[i]) {
		      delete[] buffersCpu[i];
		    }
		  }
		}
		
		if(kernelStep3) { clReleaseKernel(kernelStep3); }
		if(kernelStep2) { clReleaseKernel(kernelStep2); }
		if(kernelStep1) { clReleaseKernel(kernelStep1); }
		if(program) { clReleaseProgram(program); }
		if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); }
		if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); }
		if(commandQueue) { clReleaseCommandQueue(commandQueue); }
		if(context) { clReleaseContext(context); }


		time_t currentTime = time(0);
		double elapsedTime = difftime(currentTime, startTime) / 60.0;
		double speed = (double)totalNonces / elapsedTime;
		std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)";
		std::cerr << ", " << speed << " nonces/minutes";
		std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s";
		std::cerr << "                    " << std::endl;
	} catch(const OpenclError& ex) {
		std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl;
		returnCode = -1;
	} catch(const std::exception& ex) {
		std::cerr << "[ERROR] " << ex.what() << std::endl;
		returnCode = -1;
	}
	return returnCode;
}
Пример #2
0
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
	int in, hid, out;
	float out_err, hid_err;
  
	in = net->input_n;
	hid = net->hidden_n;
	out = net->output_n;   

        //int use_device = 0;  // use CPU as device
	int use_device = 2;  // use GPU as device
        //int use_device = 2;  // use FPGA as device
	if(initialize(use_device)) return -1;
         
	int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char)); 
	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_bp1  = "bpnn_layerforward_ocl";
	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
	char * tempchar = "./backprop_kernel.cl";
        char * krnl_file = "./binary/backprop_kernel_default.xclbin";
 
        cl_int err = 0;
        cl_program prog;
        // create program from source
        if (use_device < 2 ) {
	    FILE * fp = fopen(tempchar, "rb"); 
	    if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
	    fread(source + strlen(source), sourcesize, 1, fp);
	    fclose(fp);
		
	    // compile kernel
	    err = 0;
	    const char * slist[2] = { source, 0 };
	    prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	    if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
        } 
        // create program from binary
        else {
            char *krnl_bin;
	    const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);

            err = 0;
            prog = clCreateProgramWithBinary(context, 1,
	                                    &device_list[0], &krnl_size,
	                                    (const unsigned char**) &krnl_bin,
	                                    NULL, &err);
            if ((!prog) || (err!=CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n",
		       err);
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	    }
        }
        
	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
	{ // show warnings/errors
		//static char log[65536]; memset(log, 0, sizeof(log));
		//cl_device_id device_id = 0;
		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
	}
	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
    	
	cl_kernel kernel1;
	cl_kernel kernel2;
	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
        if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; }
	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; }
	/* clReleaseProgram(prog); */
	
	float *input_weights_one_dim;
    float *input_weights_prev_one_dim;
	float * partial_sum;
	float sum;
	float num_blocks = in / BLOCK_SIZE;
	
	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
	
	// set global and local workitems
	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
	
	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
	// todo: fix mem allocation
	int m = 0;
	for (int k = 0; k <= in; k++) {	
		for (int j = 0; j <= hid; j++) {
		input_weights_one_dim[m] = net->input_weights[k][j];
		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
	    m++;
		}
	}
	
	cl_mem input_hidden_ocl;
	cl_mem input_ocl;
	cl_mem output_hidden_ocl;
	cl_mem hidden_partial_sum;
	cl_mem hidden_delta_ocl;
	cl_mem input_prev_weights_ocl;
  
	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
		
	printf("Performing GPU computation\n");
	
	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
 
	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
        if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
	if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
  
	for (int j = 1; j <= hid; j++) {
		sum = 0.0;
		for (int k = 0; k < num_blocks; k++) {	
		sum += partial_sum[k * hid + j-1] ;
    }
		sum += net->input_weights[0][j];
		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
	}

	
	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
  
	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
  
	clReleaseMemObject(input_ocl);
	clReleaseMemObject(output_hidden_ocl);
	clReleaseMemObject(input_hidden_ocl);
	clReleaseMemObject(hidden_partial_sum);
	clReleaseMemObject(input_prev_weights_ocl);
  
	free(input_weights_prev_one_dim);
	free(partial_sum);
	free(input_weights_one_dim);

}
Пример #3
0
void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) {
	cl_context context;
	cl_command_queue commandQueue;
	cl_program program;
	cl_kernel kernel;
	
	size_t dataBytes, kernelLength;
	cl_int errorCode;
	
	cl_mem gridBuffer;
	
	cl_device_id* devices;
	cl_device_id gpu;
	
	cl_uint numPlatforms;

	errorCode = clGetPlatformIDs(0, NULL, &numPlatforms);
	cl_platform_id platforms[numPlatforms];
	errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL);
	
	checkError(errorCode);
	
	cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0};

	context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode);
	checkError(errorCode);
	
	errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes);
	devices = malloc(dataBytes);
	errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL);
	
	gpu = devices[0];
	
	commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode);
	checkError(errorCode);
	
	gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode);
	checkError(errorCode);
	
	const char* programBuffer = readFile("kernel.cl");
	kernelLength = strlen(programBuffer);
	program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode);
	checkError(errorCode);
	
	errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (errorCode == CL_BUILD_PROGRAM_FAILURE) {
		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		
		// Allocate memory for the log
		char *log = (char *) malloc(log_size);
		
		// Get the log
		clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
		
		// Print the log
		free(log);
		printf("%s\n", log);
	}
	checkError(errorCode);
	
	kernel = clCreateKernel(program, "diffuse", &errorCode);
	checkError(errorCode);

	size_t localWorkSize[2] = {workGroupSize, workGroupSize}, globalWorkSize[2] = {width, width};

	errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer);
	errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL);
	errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width);
	errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize);
	errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations);
	checkError(errorCode);
	
	errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	checkError(errorCode);
	
	errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL);
	checkError(errorCode);



	free(devices);
	free((void *)programBuffer);
	clReleaseContext(context);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commandQueue);

	
}
Пример #4
0
int initGPU(int n)
{
	#pragma mark Device Information
	// Find the CPU CL device, as a fallback
	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
	assert(err == CL_SUCCESS);

	// Find the GPU CL device, this is what we really want
	// If there is no GPU device is CL capable, fall back to CPU
	err |= clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
	if (err != CL_SUCCESS) device = cpu;
	assert(device);

	// Get some information about the returned device
	cl_char vendor_name[1024] = {0};
	cl_char device_name[1024] = {0};
	err |= clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
	err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
	assert(err == CL_SUCCESS);
	printf("Connecting to %s %s...", vendor_name, device_name);

	#pragma mark Context and Command Queue
	// Now create a context to perform our calculation with the 
	// specified device 
	context = clCreateContext(0, 1, &device, NULL, NULL, &err);
	assert(err == CL_SUCCESS);

	// And also a command queue for the context
	cmd_queue = clCreateCommandQueue(context, device, 0, NULL);

	#pragma mark Program and Kernel Creation
	// Load the program source from disk
	// The kernel/program is the project directory and in Xcode the executable
	// is set to launch from that directory hence we use a relative path
	const char * filename = "kernel.cl";
	char *program_source = load_program_source(filename);
	program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err);
	assert(err == CL_SUCCESS);

	err |= clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
	assert(err == CL_SUCCESS);

	// Now create the kernel "objects" that we want to use in the example file 
	kernel[0] = clCreateKernel(program[0], "add", &err);
	assert(err == CL_SUCCESS);

	#pragma mark Memory Allocation
	// Allocate memory on the device to hold our data and store the results into
	buffer_size = sizeof(int) * n;

	mem_c_position = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
	assert(err == CL_SUCCESS);

	mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err);
	assert(err == CL_SUCCESS);

	// Get all of the stuff written and allocated
	clFinish(cmd_queue);

	printf(" done\n");

	return err; // CL_SUCCESS
}
Пример #5
0
magma_err_t
magma_cgeqrf2_2q_gpu( magma_int_t m, magma_int_t n,
        magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
        magmaFloatComplex *tau, magma_err_t *info,
        magma_queue_t* queues)
{
    /*  -- clMAGMA (version 1.1.0) --
        Univ. of Tennessee, Knoxville
        Univ. of California, Berkeley
        Univ. of Colorado, Denver
        @date January 2014

        Purpose
        =======
        CGEQRF computes a QR factorization of a complex M-by-N matrix A:
        A = Q * R.

        Arguments
        =========
        M       (input) INTEGER
        The number of rows of the matrix A.  M >= 0.

        N       (input) INTEGER
        The number of columns of the matrix A.  N >= 0.

        dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)
        On entry, the M-by-N matrix dA.
        On exit, the elements on and above the diagonal of the array
        contain the min(M,N)-by-N upper trapezoidal matrix R (R is
        upper triangular if m >= n); the elements below the diagonal,
        with the array TAU, represent the orthogonal matrix Q as a
        product of min(m,n) elementary reflectors (see Further
        Details).

        LDDA    (input) INTEGER
        The leading dimension of the array dA.  LDDA >= max(1,M).
        To benefit from coalescent memory accesses LDDA must be
        dividable by 16.

        TAU     (output) COMPLEX array, dimension (min(M,N))
        The scalar factors of the elementary reflectors (see Further
        Details).

        INFO    (output) INTEGER
        = 0:  successful exit
        < 0:  if INFO = -i, the i-th argument had an illegal value
        if INFO = -9, internal GPU memory allocation failed.

        Further Details
        ===============

        The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

        Each H(i) has the form

        H(i) = I - tau * v * v'

        where tau is a complex scalar, and v is a complex vector with
        v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
        and tau in TAU(i).
        =====================================================================    */

#define dA(a_1,a_2)    dA, (dA_offset + (a_1) + (a_2)*(ldda))
#define work_ref(a_1)  ( work + (a_1))
#define hwork          ( work + (nb)*(m))

    magmaFloatComplex_ptr dwork;
    magmaFloatComplex  *work;

    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return MAGMA_SUCCESS;

    nb = magma_get_cgeqrf_nb(m);

    lwork  = (m+n) * nb;
    lhwork = lwork - (m)*nb;


    if ( MAGMA_SUCCESS != magma_cmalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /*
       if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, lwork ) ) {
     *info = MAGMA_ERR_HOST_ALLOC;
     magma_free( dwork );
     return *info;
     }
     */

    cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaFloatComplex)*lwork, NULL, NULL);
    work = (magmaFloatComplex*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL);

    nbmin = 2;
    nx    = 2*nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queues[0], NULL);
            clFlush(queues[0]);  
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                        m-old_i, n-old_i-2*old_ib, old_ib,
                        dA(old_i, old_i         ), ldda, dwork,0,      lddwork,
                        dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]);

                magma_csetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork,
                        dA(old_i, old_i), ldda, queues[1], NULL); 
                clFlush(queues[1]);
            }

            magma_queue_sync(queues[0]);
            lapackf77_cgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);


            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, 
                    &rows, &ib, 
                    work_ref(i), &ldwork, tau+i, hwork, &ib);

            cpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
            magma_csetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[0]);
            cq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );

            if (i + ib < n) 
            {
                magma_csetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queues[1]);

                if (i+nb < k-nx){
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                            rows, ib, ib, 
                            dA(i, i   ), ldda, dwork,0,  lddwork, 
                            dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]);
                    magma_queue_sync(queues[1]);
                }else {
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                            rows, n-i-ib, ib, 
                            dA(i, i   ), ldda, dwork,0,  lddwork, 
                            dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]);
                    magma_csetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[1]);
                    clFlush(queues[1]);
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    magma_free(dwork);

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_cgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queues[0]);

        lhwork = lwork - rows*ib;
        lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);

        magma_csetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queues[0]);
    }

    clEnqueueUnmapMemObject(queues[0], buffer, work, 0, NULL, NULL);
    clReleaseMemObject(buffer);
    //    magma_free_cpu(work);

    return *info;
} /* magma_cgeqrf2_gpu */
int
MemoryOptimizations::setupCL(void)
{
    cl_int status = 0;
    size_t deviceListSize;

    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetPlatformIDs failed."))
    {
        return SDK_FAILURE;
    }
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetPlatformIDs failed."))
        {
            return SDK_FAILURE;
        }
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);

            if(!sampleCommon->checkVal(status,
                                       CL_SUCCESS,
                                       "clGetPlatformInfo failed."))
            {
                return SDK_FAILURE;
            }

            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    // Display available devices.
    if(!sampleCommon->displayDevices(platform, dType))
    {
        sampleCommon->error("sampleCommon::displayDevices() failed");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    if(!sampleCommon->checkVal(status, 
                               CL_SUCCESS,
                               "clCreateContextFromType failed."))
        return SDK_FAILURE;

    /* First, get the size of device list data */
    status = clGetContextInfo(context, 
                              CL_CONTEXT_DEVICES, 
                              0, 
                              NULL, 
                              &deviceListSize);
    if(!sampleCommon->checkVal(status, 
                               CL_SUCCESS,
                               "clGetContextInfo failed."))
        return SDK_FAILURE;

    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    if(!sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id*)malloc(deviceListSize);
    if(devices == NULL) 
    {
        sampleCommon->error("Failed to allocate memory (devices).");
        return SDK_FAILURE;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(context, 
                              CL_CONTEXT_DEVICES, 
                              deviceListSize, 
                              devices, 
                              NULL);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetGetContextInfo failed."))
        return SDK_FAILURE;


    /* Get Device specific Information */
    /* Get device extensions */
    char deviceExtensions[2048];
    status = clGetDeviceInfo(devices[deviceId], 
                             CL_DEVICE_EXTENSIONS, 
                             sizeof(deviceExtensions), 
                             deviceExtensions, 
                             0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo failed.(extensions)"))
        return SDK_FAILURE;

    if(!strstr(deviceExtensions, "cl_khr_global_int32_base_atomics"))
    {
        sampleCommon->error("Device does not support global_int32_base_atomics!");
        return SDK_EXPECTED_FAILURE;
    }

    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_GROUP_SIZE,
                             sizeof(size_t),
                             (void *)&maxWorkGroupSize,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed."))
        return SDK_FAILURE;


    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                             sizeof(cl_uint),
                             (void *)&maxDimensions,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
        return SDK_FAILURE;


    maxWorkItemSizes = (size_t*)malloc(maxDimensions*sizeof(size_t));
    status = clGetDeviceInfo(devices[deviceId],
                             CL_DEVICE_MAX_WORK_ITEM_SIZES,
                             sizeof(size_t)*maxDimensions,
                             (void *)maxWorkItemSizes,
                             NULL);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
        return SDK_FAILURE;




    {
        /* The block is to move the declaration of prop closer to its use */
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   0,
                                   "clCreateCommandQueue failed."))
            return SDK_FAILURE;
    }

    /* Input buffer */
    inputBuffer = clCreateBuffer(context, 
                                 CL_MEM_READ_ONLY,
                                 sizeof(cl_float4) * length,
                                 0, 
                                 &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateBuffer failed. (inputBuffer)"))
        return SDK_FAILURE;

    /* Write data to buffer */
    status = clEnqueueWriteBuffer(commandQueue,
                                  inputBuffer,
                                  1,
                                  0,
                                  sizeof(cl_float4) * length,
                                  input,
                                  0,
                                  0,
                                  0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clEnqueueWriteBuffer failed. (inputBuffer)"))
        return SDK_FAILURE;


    outputBuffer = clCreateBuffer(context, 
                                  CL_MEM_WRITE_ONLY,
                                  sizeof(cl_float4) * length,
                                  0, 
                                  &status);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clCreateBuffer failed. (outputBuffer)"))
        return SDK_FAILURE;

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        const char * binary = kernelFile.source().c_str();
        size_t binarySize = kernelFile.source().size();
        program = clCreateProgramWithBinary(context,
                                            1,
                                            &devices[deviceId], 
                                            (const size_t *)&binarySize,
                                            (const unsigned char**)&binary,
                                            NULL,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithBinary failed."))
        {
            return SDK_FAILURE;
        }

    }
    else
    {
        kernelPath.append("MemoryOptimizations_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file: " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char * source = kernelFile.source().c_str();
        size_t sourceSize[] = {strlen(source)};
        program = clCreateProgramWithSource(context,
                                            1,
                                            &source,
                                            sourceSize,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithSource failed."))
            return SDK_FAILURE;
    }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(program, 1, &devices[deviceId], NULL, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char *buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                &buildLogSize);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
                return SDK_FAILURE;

            buildLog = (char*)malloc(buildLogSize);
            if(buildLog == NULL)
            {
                sampleCommon->error("Failed to allocate host memory. (buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                NULL);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
            {
                free(buildLog);
                return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clBuildProgram failed."))
            return SDK_FAILURE;
    }

    /* Copy 1D Fast Path */
    kernel[0] = clCreateKernel(program, "copy1DFastPath", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed.(copy1DFastPath)"))
        return SDK_FAILURE;

    /* Copy 1D Complete Path */
    kernel[1] = clCreateKernel(program, "copy1DCompletePath", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy1DCompletePath)"))
        return SDK_FAILURE;

    /* Copy 2D float */
    kernel[2] = clCreateKernel(program, "copy2Dfloat", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy2Dfloat)"))
        return SDK_FAILURE;

    /* Copy 2D float4 */
    kernel[3] = clCreateKernel(program, "copy2Dfloat4", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy2Dfloat4)"))
        return SDK_FAILURE;

    /* Copy 1D float4 */
    kernel[4] = clCreateKernel(program, "copy1Dfloat4", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (copy1Dfloat4)"))
        return SDK_FAILURE;

    /* Copy No Coalesced */
    kernel[5] = clCreateKernel(program, "NoCoal", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (NoCoal)"))
        return SDK_FAILURE;

    /* Copy Split */
    kernel[6] = clCreateKernel(program, "Split", &status);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateKernel failed. (Split)"))
        return SDK_FAILURE;

    return SDK_SUCCESS;
}
Пример #7
0
int main(int argc, char *argv[])
{
    int iGlobalSize = 1;
    int iCheck1, iCheck2, iCheck3, iCheck4;
    size_t iGlobalWorkSize = -1;
    size_t iLocalWorkSize = -1;
    if (argc > 1) // Size of input vector
    {
        iCheck1 = atoi(argv[1]);
        if (iCheck1 != 0)
        {
            iGlobalSize = iCheck1;
        }
    }
    int iNoReps = 100; // Number of repetitions.
	if (argc > 2)
	{
		iCheck2 = atoi(argv[2]);
        if (iCheck2 != 0)
        {
            iNoReps = iCheck2;
        }        
	}
    
/*    
    if (argc > 3) // Global work size
    {
        iCheck3 = atoi(argv[3]);
        if (iCheck3 != 0)
        {
            iGlobalWorkSize = iCheck3;
        }
    }
    if (argc > 4) // Local work size
    {
        iCheck4 = atoi(argv[4]);
        if (iCheck4 != 0)
        {
            iLocalWorkSize = iCheck4;
        }
    } */
    int bPrint = 0;
	if (argc > 3) // Originally 5.
	{
		bPrint = 1;
	}

 //   printf("The global size is %d, the global work size is %ld, and the local work size is %ld. \n", iGlobalSize, iGlobalWorkSize, iLocalWorkSize);
/*    size_t * ipGlobalWorkParam = NULL;
    if (iGlobalWorkSize != -1)
    {
        ipGlobalWorkParam = &iGlobalWorkSize;
    }
    
    size_t * ipLocalWorkParam = NULL;
    if (iLocalWorkSize != -1)
    {
        ipLocalWorkParam = &iLocalWorkSize;
    } */
    
    GCAQ * TheGCAQ = GCAQSetup();
    if (TheGCAQ == NULL)
    {
        return 1;
    }

#if BIGFLOAT
	const char *szFloatOpt = "-DBIGFLOAT";
#else
	const char *szFloatOpt = NULL;
#endif
    const int iNoKernels = 1;
	char *ourKernelStrings[6] =
		{ szDotProduct, szReduce, szDotProduct2, szReduce2, szDotProduct4, szReduce4};


  	GPAK *TheGPAK = GPAKSetup(TheGCAQ, iNoKernels, ourKernelStrings, szFloatOpt);
    if (TheGPAK == NULL)
    {
        GCAQShutdown(TheGCAQ);
        return 2;
    }

    INTG iTypicalWorkgroupNo = TheGPAK->TheMaxWorkGroupSizes[0];
    
    INTG iExpOutputSize = ioutsize(iGlobalSize, iTypicalWorkgroupNo);
    FLPT * fExpDotProdResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    FLPT * fExpReduceResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    fdotprodexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpDotProdResult);
    freduceexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpReduceResult);
 //   printvector("dot prod", iExpOutputSize, fExpDotProdResult);
 //   printvector("reduce", iExpOutputSize, fExpReduceResult);

    FLPT* inputDataF = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFIncrease(iGlobalSize, inputDataF);
    
    // For the dot product.
    
    FLPT* outputDataD = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataD);

    // For the reduction.
    
    FLPT* outputDataR = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataR);
    
	struct timespec start[iNoKernels];
	struct timespec end[iNoKernels];
    
// create buffers for the input and ouput

    int err; 
    cl_mem inputF, outputF, outputAll;
    inputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_READ_ONLY, iGlobalSize * sizeof(FLPT), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for F");
        return 3;
    }
    
    
    outputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 7");
        return 9;
    }
    outputAll = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 8");
        return 9;
    }

    clEnqueueWriteBuffer(TheGCAQ->TheQueue, inputF, CL_TRUE, 0, iGlobalSize * sizeof(FLPT), inputDataF, 0, NULL, NULL);

    int iRep;
    int iKernel;
    int i;
    int iLengthTotal = iGlobalSize;
    size_t iGlobalWorkThing = iGlobalSize;
    int iSomething = 1;
    for (iKernel = 0; iKernel < iNoKernels; iKernel++)
    {    
		for (i = 0; i < iLengthTotal; i++)
		{
			outputDataD[i] = 0.0;
            outputDataR[i] = 0.0;
		}
        
        clock_gettime(CLOCK_MONOTONIC, &(start[iKernel]));
		for (iRep = 0; iRep < iNoReps; iRep++)
		{
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 0, sizeof(int), &iLengthTotal);
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 1, sizeof(cl_mem), &inputF);
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 2, iSomething * iLocalWorkSize * sizeof(float), NULL); // Was 3
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 3, sizeof(cl_mem), &outputAll); // Was 4         
            clEnqueueNDRangeKernel(TheGCAQ->TheQueue, TheGPAK->TheKernels[iKernel], 1,
                NULL, &iGlobalWorkThing, &(TheGPAK->TheMaxWorkGroupSizes[iKernel]), 0, NULL, NULL);
            clFinish(TheGCAQ->TheQueue);
 
// copy the results from out of the output buffer

            if (iKernel % 2 ==  0)
            {
                clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataD, 0, NULL, NULL);
            }
            else
            {
                clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataR, 0, NULL, NULL);
            }
        
        }
        clock_gettime(CLOCK_MONOTONIC, &(end[iKernel]));    
        if (bPrint)
        {
            for (i = 0; i < iExpOutputSize; i++)
			{
				if (iKernel % 2 == 0)
                {
                    if (outputDataD[i] != fExpDotProdResult[i])
                    {
                        printf
                            ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n",
                            iKernel, i, outputDataD[i], fExpDotProdResult[i]);
                        break;
                    }
                }
                else
                {
                    if (outputDataR[i] != fExpReduceResult[i])
                    {
                        printf
                            ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n",
                            iKernel, i, outputDataR[i], fExpReduceResult[i]);
                        break;
                    }
                }
			}    
        }
   //     if ((iKernel % 2) == 1)
   //     {
   //         iLengthTotal = iLengthTotal / 2;
    //        iSomething = iSomething * 2;
    //        iGlobalWorkThing = iGlobalWorkThing / 2;
  //      }
    }


    clReleaseMemObject(inputF);
    clReleaseMemObject(outputF);
    clReleaseMemObject(outputAll);
    
// print the results
//    if (bPrint)
//    {
//        printf("output %d: \n", iGlobalSize);
//        for(i=0;i<iExpOutputSize; i++)
//        {
//            printf("%d - %f - %f\n", i, outputDataD[i], outputDataR[i]);
//        }   
//    }
    
// cleanup - release OpenCL resources
    
    free(inputDataF);
    free(outputDataD);
    free(outputDataR);
   
    GPAKShutdown(TheGPAK);
    GCAQShutdown (TheGCAQ);
    printf("%d - ", iGlobalSize);
	for (iKernel = 0; iKernel < iNoKernels; iKernel++)
	{
		printf("%f - ", (1.0 * TLPERS * iGlobalSize * iNoReps) /
			   (MEGAHERTZ * timespecDiff(&(end[iKernel]), &(start[iKernel]))));
	}
    printf("\n");
    return 0;
}
Пример #8
0
    // Create the data array in device memory for our calculation
    //
    cl_mem device_$arg_ref = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof($arg_ref[0]) * grid_size, NULL, NULL);
    if (!device_$arg_ref)
    {
        printf("Error: Failed to allocate device memory!\n");
        return err;
    }

    // Write our data set into the data array in device memory
    //
    err = clEnqueueWriteBuffer(commands, device_$arg_ref, CL_TRUE, 0, sizeof($arg_ref[0]) * grid_size, $arg_ref, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        return err;
    }

    // Set the arguments to our compute kernel
    //
    err  = clSetKernelArg(kernel, $arg_index, sizeof(cl_mem), &device_$arg_ref);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        return err;
    }
Пример #9
0
    void test_csrmv()
    {
        clsparseStatus status;
        cl_int cl_status;

        clsparseEnableExtendedPrecision(CLSE::control, extended_precision);

        if (typeid(T) == typeid(cl_float) )
        {
            status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX,
                                    &gBeta, &gY, CLSE::control);

            ASSERT_EQ(clsparseSuccess, status);

            float* vals = (float*)&CSRE::ublasSCsr.value_data()[0];
            int* rows = &CSRE::ublasSCsr.index1_data()[0];
            int* cols = &CSRE::ublasSCsr.index2_data()[0];
            for (int row = 0; row < CSRE::n_rows; row++)
            {
                // Summation done at a higher precision to decrease
                // summation errors from rounding.
                hY[row] *= hBeta;
                int row_end = rows[row+1];
                double temp_sum;
                temp_sum = hY[row];
                for (int i = rows[row]; i < rows[row+1]; i++)
                {
                    // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
                    temp_sum += hAlpha * vals[i] * hX[cols[i]];
                }
                hY[row] = temp_sum;
            }

            T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
                                                       CL_TRUE, CL_MAP_READ,
                                                       0, gY.num_values * sizeof(T),
                                                       0, nullptr, nullptr, &cl_status);
            ASSERT_EQ(CL_SUCCESS, cl_status);

            uint64_t max_ulps = 0;
            uint64_t min_ulps = UINT64_MAX;
            uint64_t total_ulps = 0;
            for (int i = 0; i < hY.size(); i++)
            {
                long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
                intDiff = llabs(intDiff);
                total_ulps += intDiff;
                if (max_ulps < intDiff)
                    max_ulps = intDiff;
                if (min_ulps > intDiff)
                    min_ulps = intDiff;
                // Debug printouts.
                //std::cout << "Row " << i << " Float Ulps: " << intDiff << std::endl;
                //std::cout.precision(9);
                //std::cout << "\tFloat hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint32_t *)&hY[i] << "), " << std::dec;
                //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint32_t *)&host_result[i] << ")" << std::dec << std::endl;
            }
#ifndef NDEBUG
            if (extended_precision)
            {
                std::cout << "Float Min ulps: " << min_ulps << std::endl;
                std::cout << "Float Max ulps: " << max_ulps << std::endl;
                std::cout << "Float Total ulps: " << total_ulps << std::endl;
                std::cout << "Float Average ulps: " << (double)total_ulps/(double)hY.size() <<  " (Size: " << hY.size() << ")" << std::endl;
            }
#endif

            for (int i = 0; i < hY.size(); i++)
            {
                double compare_val = 0.;
                if (extended_precision)
                {
                    // The limit here is somewhat weak because some GPUs don't
                    // support correctly rounded denorms in SPFP mode.
                    if (boost::math::isnormal(hY[i]))
                        compare_val = fabs(hY[i]*1e-3);
                }
                else
                {
                    if (boost::math::isnormal(hY[i]))
                        compare_val = fabs(hY[i]*0.1);
                }
                if (compare_val < 10*FLT_EPSILON)
                    compare_val = 10*FLT_EPSILON;
                ASSERT_NEAR(hY[i], host_result[i], compare_val);
            }

            cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
                                                  host_result, 0, nullptr, nullptr);
            ASSERT_EQ(CL_SUCCESS, cl_status);
        }

        if (typeid(T) == typeid(cl_double) )
        {
            status = clsparseDcsrmv(&gAlpha, &CSRE::csrDMatrix, &gX,
                                    &gBeta, &gY, CLSE::control);

            ASSERT_EQ(clsparseSuccess, status);

            double* vals = (double*)&CSRE::ublasDCsr.value_data()[0];
            int* rows = &CSRE::ublasDCsr.index1_data()[0];
            int* cols = &CSRE::ublasDCsr.index2_data()[0];
            for (int row = 0; row < CSRE::n_rows; row++)
            {
                // Summation done using a compensated summation to decrease
                // summation errors from rounding. This allows us to get
                // smaller errors without requiring quad precision support.
                // This method is like performing summation at quad precision and
                // casting down to double in the end.
                hY[row] *= hBeta;
                int row_end = rows[row+1];
                double temp_sum;
                temp_sum = hY[row];
                T sumk_err = 0.;
                for (int i = rows[row]; i < rows[row+1]; i++)
                {
                    // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
                    temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err);
                }
                hY[row] = temp_sum + sumk_err;
            }

            T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
                                                       CL_TRUE, CL_MAP_READ,
                                                       0, gY.num_values * sizeof(T),
                                                       0, nullptr, nullptr, &cl_status);
            ASSERT_EQ(CL_SUCCESS, cl_status);

            uint64_t max_ulps = 0;
            uint64_t min_ulps = ULLONG_MAX;
            uint64_t total_ulps = 0;
            for (int i = 0; i < hY.size(); i++)
            {
                long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
                intDiff = llabs(intDiff);
                total_ulps += intDiff;
                if (max_ulps < intDiff)
                    max_ulps = intDiff;
                if (min_ulps > intDiff)
                    min_ulps = intDiff;
                // Debug printouts.
                //std::cout << "Row " << i << " Double Ulps: " << intDiff << std::endl;
                //std::cout.precision(17);
                //std::cout << "\tDouble hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint64_t *)&hY[i] << "), " << std::dec;
                //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint64_t *)&host_result[i] << ")" << std::dec << std::endl;
            }
            if (extended_precision)
            {
#ifndef NDEBUG
                std::cout << "Double Min ulps: " << min_ulps << std::endl;
                std::cout << "Double Max ulps: " << max_ulps << std::endl;
                std::cout << "Double Total ulps: " << total_ulps << std::endl;
                std::cout << "Double Average ulps: " << (double)total_ulps/(double)hY.size() <<  " (Size: " << hY.size() << ")" << std::endl;
#endif

                for (int i = 0; i < hY.size(); i++)
                {
                    double compare_val = fabs(hY[i]*1e-14);
                    if (compare_val < 10*DBL_EPSILON)
                        compare_val = 10*DBL_EPSILON;
                    ASSERT_NEAR(hY[i], host_result[i], compare_val);
                }
            }
            else
            {
                for (int i = 0; i < hY.size(); i++)
                {
                    double compare_val = 0.;
                    if (boost::math::isnormal(hY[i]))
                        compare_val = fabs(hY[i]*0.1);
                    if (compare_val < 10*DBL_EPSILON)
                        compare_val = 10*DBL_EPSILON;
                    ASSERT_NEAR(hY[i], host_result[i], compare_val);
                }
            }

            cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
                                                  host_result, 0, nullptr, nullptr);
            ASSERT_EQ(CL_SUCCESS, cl_status);
        }
        // Reset output buffer for next test.
        ::clReleaseMemObject(gY.values);
        clsparseInitVector(&gY);
        gY.values = clCreateBuffer(CLSE::context,
                CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
                hY.size() * sizeof(T), hY.data().begin(),
                &cl_status);
        gY.num_values = hY.size();
        ASSERT_EQ(CL_SUCCESS, cl_status);
    }
Пример #10
0
inline void vector_sum(const int  arraySize, 
                       const double* inputA, 
                       const double* inputB, 
                             double* output)
{ 
    /* Allocate memory buffers */
    /*
    * Ask the OpenCL implementation to allocate buffers for the data.
    * We ask the OpenCL implemenation to allocate memory rather than 
    * allocating it on the CPU to avoid having to copy the data later.
    * The read/write flags relate to accesses to the memory from within 
    * the kernel.
    */

    bool createMemoryObjectSuccess = true;
    int numberOfMemoryObjects = 3;
    cl_mem memoryObjects[3] = {0, 0, 0};
    int errorNumber = 0;

    int bufferSize = arraySize*sizeof(double);

    memoryObjects[0] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, (void*)inputA, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 1.");
    
    memoryObjects[1] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, (void*)inputB, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 2.");
    
    memoryObjects[2] = clCreateBuffer(context, 
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 
            bufferSize, output, &errorNumber);
    checkErr(errorNumber, "Failed to create buffer, 3.");

    /* Enqueue commands and kernels */
    /* Enqueue to the command queues the commands that control the sequence 
     * and synchronization of kernel execution, reading and writing of data,
     * and manipulation of memory objects
     */

    /* Execute a kernel function */
    /* Call clSetKernelArg() for each parameter in the kernel */
    bool setKernelArgumentsSuccess = true;
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, 
                                        sizeof(cl_mem), &memoryObjects[0]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, 
                                        sizeof(cl_mem), &memoryObjects[1]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, 
                                        sizeof(cl_mem), &memoryObjects[2]));
    if (not setKernelArgumentsSuccess) {
        cleanUpOpenCL();
        std::cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ 
                  << ":"<< __LINE__ << std::endl;
        exit(1);
    }

    /* Determine the work-group size and index space for the kernel */
    const size_t globalWorkSize[1] = {arraySize};
    const size_t localWorkSize[1] = { 1 };

    /* Enqueue the kernel for execution in the command queue */
    //for (int j = 0; j < ITER; j++) {
        if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, 
                NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) {
            
            cleanUpOpenCL();
            std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":" 
                      << __LINE__ <<std::endl;
            exit(1);
        }
    //}

    /* Get a pointer to the output data */
    output = (double*)clEnqueueMapBuffer(commandQueue, 
                    memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, 
                    arraySize, 0, NULL, NULL, &errorNumber);

    if (not checkSuccess(errorNumber)) {

        cleanUpOpenCL();
        std::cerr << "Failed to map buffer " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1); 
    }

    /* Wait for kernel execution */
    if (not checkSuccess(clFinish(commandQueue))) {

        cleanUpOpenCL();
        std::cerr << "Failed waiting for kernel execution to finish. "
                  << __FILE__ << ":"<< __LINE__ << std::endl;
        exit(1);
    }


    /* Unmap the memory objects as we finished using them in the CPU */
    if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) {

        cleanUpOpenCL();
        std::cerr << "Unmapping memory objects failed " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1);
    }
    if (not checkSuccess(clReleaseMemObject(memoryObjects[1]))) {

        cleanUpOpenCL();
        std::cerr << "Unmapping memory objects failed " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1);
    }
    if (not checkSuccess(clEnqueueUnmapMemObject(commandQueue, 
                    memoryObjects[2], output, 0, NULL, NULL))) {

        cleanUpOpenCL();
        std::cerr << "Unmapping memory objects failed " << __FILE__ << ":"
                  << __LINE__ << std::endl;
        exit(1);
    }
}
Пример #11
0
int main() {
// START:context
  cl_platform_id platform;
  clGetPlatformIDs(1, &platform, NULL);

  cl_device_id device;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
// END:context

// START:queue
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
// END:queue

// START:kernel
  char* source = read_source("multiply_arrays.cl");
  cl_program program = clCreateProgramWithSource(context, 1,
    (const char**)&source, NULL, NULL);
  free(source);
  clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL);
// END:kernel

// START:buffers
  cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
  random_fill(a, NUM_ELEMENTS);
  random_fill(b, NUM_ELEMENTS);
  cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
  cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
  cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
    sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
// END:buffers

// START:execute
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);

  size_t work_units = NUM_ELEMENTS;
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL);
// END:execute

// START:results
  cl_float results[NUM_ELEMENTS];
  clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
    results, 0, NULL, NULL);
// END:results

// START:cleanup
  clReleaseMemObject(inputA);
  clReleaseMemObject(inputB);
  clReleaseMemObject(output);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
// END:cleanup

  for (int i = 0; i < NUM_ELEMENTS; ++i) {
    printf("%f * %f = %f\n", a[i], b[i], results[i]);
  }

  return 0;
}
Пример #12
0
int32_t init_kernel_platform() {

	cl_uint plat_num;
	cl_platform_id plat_id = NULL;
	cl_uint dev_num = 0;
	cl_device_id *devices;

	ret = clGetPlatformIDs(0, NULL, &plat_num);
	if (ret < 0) {
		LOGD("MU1 Error: Getting plat_ids!\n");
		return -1;
	}

	if(plat_num > 0)
	{
		cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id));
		ret = clGetPlatformIDs(plat_num, plat_ids, NULL);
		plat_id = plat_ids[0];
		free(plat_ids);
	}

	ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num);	
	if (dev_num == 0) {
		LOGD("MU1: No GPU device available.\n");
		LOGD("MU1: Choose CPU as default device.\n");
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num);	
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL);
	} else {
		LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num);
		devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
		ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL);
	}
	
	context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);

	commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
    
	char filename[] = "/data/mu1_kernel.cl";
	char file_context[10*1024]={0};
	const char *source = &file_context[0];

    ret = read_cl(filename, &file_context[0]);

	size_t sourceSize[10] = {strlen(source)};
	cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize[0], NULL);
	
	ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(ret < 0) {
        LOGD("MU1 Error: clBuildProgram error\n");
        return 0;
    }

	kernel = clCreateKernel(program, "process_iq", NULL);

	inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_i[0][0]), NULL);
	inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_q[0][0]), NULL);
	inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, 
            512*1024*4, (void *)(&table_o[0][0]), NULL);


	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q);
	ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o);



    if(devices != NULL) { free(devices);}

    LOGD("MU1: init cl plat success");
    return 0;
}
Пример #13
0
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context       cxGPUContext;   //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command que
    cl_mem      d_Input, d_Output;   //OpenCL memory buffer objects

    cl_int ciErrNum;

    float *h_Input, *h_OutputCPU, *h_OutputGPU;

    const uint
        imageW = 2048,
        imageH = 2048,
        stride = 2048;

    const int dir = DCT_FORWARD;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclDCT8x8.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Input     = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
        srand(2009);
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++)
                h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get a GPU device
        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL DCT 8x8...\n");
        initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride *  sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
        //Just a single iteration or a warmup iteration
        DCT8x8(
            cqCommandQueue,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

#define GPU_PROFILING 1
#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++)
        DCT8x8(
            NULL,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", 
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); 

    //Get profiler time
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir);
        double sum = 0, delta = 0;
        double L2norm;
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++){
                sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j];
                delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]);
            }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    shrLog("Shutting down...\n");
        //Release kernels and program
        closeDCT8x8();

        //Release other OpenCL objects
        ciErrNum  = clReleaseMemObject(d_Output);
        ciErrNum |= clReleaseMemObject(d_Input);
        ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
        ciErrNum |= clReleaseContext(cxGPUContext);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Release host buffers
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Input);

        //Finish
        shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-3) ? QA_PASSED : QA_FAILED);
}
Пример #14
0
int main(int argc, char **argv)
{
  printf("enter demo main\n");
  fflush(stdout);
  putenv("POCL_VERBOSE=1");
  putenv("POCL_DEVICES=basic");
  putenv("POCL_LEAVE_TEMP_DIRS=1");
  putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1");
  putenv("POCL_TEMP_DIR=pocl");
  putenv("POCL_CACHE_DIR=pocl");
  putenv("POCL_WORK_GROUP_METHOD=spmd");
  if(argc >= 2){
    printf("argv[1]:%s:\n",argv[1]);
    if(!strcmp(argv[1], "h"))
      putenv("POCL_WORK_GROUP_METHOD=spmd");
    if(!strcmp(argv[1], "c"))
      putenv("POCL_CROSS_COMPILE=1");
  }
  if(argc >= 3){
    printf("argv[2]:%s:\n",argv[2]);
    if(!strcmp(argv[2], "h"))
      putenv("POCL_WORK_GROUP_METHOD=spmd");
    if(!strcmp(argv[2], "c"))
      putenv("POCL_CROSS_COMPILE=1");
  }

  //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
  //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
  //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib");
  //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath());
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	if (platforms_n == 0)
		return 1;

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", devices_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err));

	cl_command_queue queue;
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err));

	cl_kernel kernel = 0;
  cl_mem memObjects[2] = {0,0};


  // Create OpenCL program - first attempt to load cached binary.
  //  If that is not available, then create the program from source
  //  and store the binary for future use.
  std::cout << "Attempting to create program from binary..." << std::endl;
  cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin");
  if (program == NULL)
  {
      std::cout << "Binary not loaded, create from source..." << std::endl;
      program = CreateProgram(context, devices[1], "kernel.cl");
      if (program == NULL)
      {
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }

      std::cout << "Save program binary for future run..." << std::endl;
      if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false)
      {
          std::cerr << "Failed to write program binary" << std::endl;
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }
  }
  else
  {
      std::cout << "Read program from binary." << std::endl;
  }

  printf("attempting to create input buffer\n");
  fflush(stdout);
	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));

  printf("attempting to create output buffer\n");
  fflush(stdout);
	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));

  memObjects[0] = input_buffer;
  memObjects[1] = output_buffer;

  double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;;

  printf("attempting to create kernel\n");
  fflush(stdout);
	kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err));
  printf("setting up kernel args cl_mem:%lx \n",input_buffer);
  fflush(stdout);
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

  printf("attempting to enqueue write buffer\n");
  fflush(stdout);
	for (int i=0; i<NUM_DATA; i++) {
    double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;;
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
  printf("attempting to enqueue kernel\n");
  fflush(stdout);
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
  printf("Enqueue'd kerenel\n");
  fflush(stdout);
    cl_ulong time_start, time_end;
  CL_CHECK(clWaitForEvents(1, &kernel_completion));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL));
  double elapsed = time_end - time_start;
  printf("time(ns):%lg\n",elapsed);
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		double data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL));
		//printf(" %lg", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(memObjects[0]));
	CL_CHECK(clReleaseMemObject(memObjects[1]));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
Пример #15
0
    void setup_buffer(int order_option, int side_option, int uplo_option,
                      int diag_option, int transA_option, int transB_option,
                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
                      double alpha, double beta)
    {
        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
        DUMMY_ARGS_USAGE_2(ldb, offBX);

        initialize_scalars(alpha, beta);

        buffer_.n_ = N;
        buffer_.k_ = K;
        buffer_.offA_ = offA;
        buffer_.offC_ = offCY;

        if (uplo_option == 0)
        {
            buffer_.uplo_ = clblasUpper;
        }
        else
        {
            buffer_.uplo_ = clblasLower;
        }


        if (ldc == 0)
        {
            buffer_.ldc_ = N;
        }
        else if (ldc < N)
        {
            std::cerr << "ldc:wrong size\n";
        }
        else
        {
            buffer_.ldc_ = ldc;
        }
        buffer_.c_num_vectors_ = N;

        if (order_option == 0)
        {
            order_ = clblasRowMajor;
            if (transA_option == 0)
            {
                buffer_.trans_a_ = clblasNoTrans;
                buffer_.a_num_vectors_ = N;
                if (lda == 0)
                {
                    buffer_.lda_ = K;
                }
                else if (lda < K)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
            else
            {
                buffer_.a_num_vectors_ = K;
                if (transA_option == 1)
                {
                    buffer_.trans_a_ = clblasTrans;
                }
                else if (transA_option == 2)
                {
                    buffer_.trans_a_ = clblasConjTrans;
                }
                if (lda == 0)
                {
                    buffer_.lda_ = N;
                }
                else if (lda < N)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
        }
        else
        {
            order_ = clblasColumnMajor;
            if (transA_option == 0)
            {
                buffer_.a_num_vectors_ = K;
                buffer_.trans_a_ = clblasNoTrans;
                if (lda == 0)
                {
                    buffer_.lda_ = N;
                }
                else if (lda < N)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
            else
            {
                buffer_.a_num_vectors_ = N;
                if (transA_option == 1)
                {
                    buffer_.trans_a_ = clblasTrans;
                }
                else if (transA_option == 2)
                {
                    buffer_.trans_a_ = clblasConjTrans;
                }

                if (lda == 0)
                {
                    buffer_.lda_ = K;
                }
                else if (lda < K)
                {
                    std::cerr << "lda:wrong size\n";
                    exit(1);
                }
                else
                {
                    buffer_.lda_ = lda;
                }
            }
        }

        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];

        cl_int err;
        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
                                            buffer_.offA_) * sizeof(T),
                                        NULL, &err);

        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                            buffer_.offC_) * sizeof(T),
                                        NULL, &err);
    }
Пример #16
0
static cl_int opencl_plugin_init_mesh_buffers(opencl_plugin plugin,
                                              cl_int mesh_data_count,
                                              mesh_data *mesh_data_list)
{
    cl_int err;
    cl_int i;
    cl_mem new_vertex_buffer = NULL, new_triangle_buffer = NULL;
    cl_int total_num_vertices = 0, total_num_triangles = 0;

    assert(plugin != NULL);
    assert(mesh_data_count >= 0);
    assert(mesh_data_list != NULL);

    for (i = 0; i < mesh_data_count; i++) {
        total_num_vertices += mesh_data_list[i].num_vertices;
        total_num_triangles += mesh_data_list[i].num_triangles;
    }

    if (total_num_vertices > plugin->vertex_buffer_capacity) {
        /* Current buffer not big enough, free old buffer first */
        if (plugin->vertex_buffer) {
            clReleaseMemObject(plugin->vertex_buffer);
            plugin->vertex_buffer = NULL;
        }

        plugin->vertex_buffer_capacity = 0;

        /* TODO: Maybe better dynamic resizing (factor = 1.5)? */
        new_vertex_buffer =
            clCreateBuffer(plugin->context, CL_MEM_READ_ONLY,
                           sizeof(float) * 3 * total_num_vertices, NULL, &err);
        CHECK_CL_ERROR(err);

        plugin->vertex_buffer_capacity = total_num_vertices;
        plugin->vertex_buffer = new_vertex_buffer;
        new_vertex_buffer = NULL;
    }

    if (total_num_triangles > plugin->triangle_buffer_capacity) {
        /* Current buffer not big enough, free old buffer first */
        if (plugin->triangle_buffer) {
            clReleaseMemObject(plugin->triangle_buffer);
            plugin->triangle_buffer = NULL;
        }

        plugin->triangle_buffer_capacity = 0;

        /* TODO: Maybe better dynamic resizing (factor = 1.5)? */
        new_triangle_buffer =
            clCreateBuffer(plugin->context, CL_MEM_READ_ONLY,
                           sizeof(cl_int) * 3 * total_num_triangles, NULL, &err);
        CHECK_CL_ERROR(err);

        plugin->triangle_buffer_capacity = total_num_triangles;
        plugin->triangle_buffer = new_triangle_buffer;
        new_triangle_buffer = NULL;
    }

    total_num_vertices = 0;
    total_num_triangles = 0;
    for (i = 0; i < mesh_data_count; i++) {
        mesh_data *mesh_data = &mesh_data_list[i];

        err = clEnqueueWriteBuffer(
            plugin->queue, plugin->vertex_buffer, CL_FALSE,
            sizeof(float) * 3 * total_num_vertices,
            sizeof(float) * 3 * mesh_data->num_vertices, mesh_data->vertices,
            0, NULL, NULL);
        CHECK_CL_ERROR(err);

        err = clEnqueueWriteBuffer(
            plugin->queue, plugin->triangle_buffer, CL_FALSE,
            sizeof(cl_int) * 3 * total_num_triangles,
            sizeof(cl_int) * 3 * mesh_data->num_triangles, mesh_data->triangles,
            0, NULL, NULL);
        CHECK_CL_ERROR(err);

        total_num_vertices += mesh_data_list[i].num_vertices;
        total_num_triangles += mesh_data_list[i].num_triangles;
    }

    /* Wait for all buffer writes to finish, TODO: investigate this further */
    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    return 0;
error:
    if (new_vertex_buffer)
        clReleaseMemObject(new_vertex_buffer);
    if (new_triangle_buffer)
        clReleaseMemObject(new_triangle_buffer);
    return -1;
}
Пример #17
0
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) {
  struct bench_args_t *args = (struct bench_args_t *)vargs;
  // Create device buffers
  //
  cl_mem obs_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->obs), NULL, NULL);
  cl_mem init_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->init), NULL, NULL);
  cl_mem transition_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->transition), NULL, NULL);
  cl_mem emission_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->emission), NULL, NULL);
  cl_mem path_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->path), NULL, NULL);
  if (!obs_buffer || !init_buffer || !transition_buffer || !emission_buffer || !path_buffer)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    exit(1);
  }    

  // Write our data set into device buffers  
  //
  int err;
  err = clEnqueueWriteBuffer(commands, obs_buffer, CL_TRUE, 0, sizeof(args->obs), args->obs, 0, NULL, NULL);
  err |= clEnqueueWriteBuffer(commands, init_buffer, CL_TRUE, 0, sizeof(args->init), args->init, 0, NULL, NULL);
  err |= clEnqueueWriteBuffer(commands, transition_buffer, CL_TRUE, 0, sizeof(args->transition), args->transition, 0, NULL, NULL);
  err |= clEnqueueWriteBuffer(commands, emission_buffer, CL_TRUE, 0, sizeof(args->emission), args->emission, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
      printf("Error: Failed to write to device memory!\n");
      printf("Test failed\n");
      exit(1);
  }
    
  // Set the arguments to our compute kernel
  //
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &obs_buffer);
  err  |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &init_buffer);
  err  |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &transition_buffer);
  err  |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &emission_buffer);
  err  |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &path_buffer);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

#ifdef C_KERNEL
  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
#else
  printf("Error: OpenCL kernel is not currently supported!\n");
  exit(1);
#endif
  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }

  // Read back the results from the device to verify the output
  //
  err = clEnqueueReadBuffer( commands, path_buffer, CL_TRUE, 0, sizeof(args->path), args->path, 0, NULL, NULL );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }
}
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{ 
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);

    T* h_idata = (T*)malloc(bytes);

    for(int i = 0; i < maxN; i++) {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL);

    int testIterations = 100;
    double dTotalTime = 0.0;
    
    // print headers
    shrLog("Time in seconds for various numbers of elements for each kernel\n");
    shrLog("\n\n");
    shrLog("Kernel");
    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }
   
    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n");
        shrLog("%d", kernel);
        for (int i = minN; i <= maxN; i *= 2)
        {
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
            
            double reduceTime;
            if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
                profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, 
                                testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata);
                reduceTime = dTotalTime/(double)testIterations;
            } else {                
                reduceTime = -1.0;
            }
            shrLog(", %.4f m", reduceTime);
        }
    }

    // cleanup
    free(h_idata);
    free(h_odata);
    clReleaseMemObject(d_idata);
    clReleaseMemObject(d_odata);
}
Пример #19
0
int main(int argc, char* argv[]) {
  struct pb_Parameters *parameters;

  parameters = pb_ReadParameters(&argc, argv);
  if (!parameters)
    return -1;

  if(!parameters->inpFiles[0]){
    fputs("Input file expected\n", stderr);
    return -1;
  }

  
  struct pb_TimerSet timers;
  
  char oclOverhead[] = "OCL Overhead";
  char intermediates[] = "IntermediatesKernel";
  char finals[] = "FinalKernel";

  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
    
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  
  int numIterations;
  if (argc >= 2){
    numIterations = atoi(argv[1]);
  } else {
    fputs("Expected at least one command line argument\n", stderr);
    return -1;
  }

  unsigned int img_width, img_height;
  unsigned int histo_width, histo_height;

  FILE* f = fopen(parameters->inpFiles[0],"rb");
  int result = 0;

  result += fread(&img_width,    sizeof(unsigned int), 1, f);
  result += fread(&img_height,   sizeof(unsigned int), 1, f);
  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
  result += fread(&histo_height, sizeof(unsigned int), 1, f);

  if (result != 4){
    fputs("Error reading input and output dimensions from file\n", stderr);
    return -1;
  }

  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));

  result = fread(img, sizeof(unsigned int), img_width*img_height, f);

  fclose(f);

  if (result != img_width*img_height){
    fputs("Error reading input array from file\n", stderr);
    return -1;
  }

  cl_int ciErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(parameters);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram[2];
  
  cl_kernel histo_intermediates_kernel;
  cl_kernel histo_final_kernel;
  
  cl_mem input;
  cl_mem ranges;
  cl_mem sm_mappings;
  cl_mem global_subhisto;
  cl_mem global_overflow;
  cl_mem final_histo;
  
  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  cl_uint workItemDimensions;
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
  
  size_t workItemSizes[workItemDimensions];
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
  
  size_t program_length[2];
  const char *source_path[2] = { 
    "src/opencl_mxpa/histo_intermediates.cl", 
   "src/opencl_mxpa/histo_final.cl"};
  char *source[4];

  for (int i = 0; i < 2; ++i) {
    // Dynamically allocate buffer for source
    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
    if(!source[i]) {
      fprintf(stderr, "Could not load program source\n"); exit(1);
    }
  	
  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);
  	  	
  	free(source[i]);
  }
  	
  	  	  	  	  	  	  	
  for (int i = 0; i < 2; ++i) {
    //fprintf(stderr, "Building Program #%d...\n", i);
    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) );
       
    /*
       char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
     */
  }
  	
  histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);  

  input =           clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  ranges =          clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);  
  sm_mappings =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  final_histo =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);

  // Must dynamically allocate. Too large for stack
  unsigned int *zeroData;
  zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int));
  if (zeroData == NULL) {
    fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height);
    exit(1);
  }
   
  for (int y=0; y < img_height; y++){
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, 
                          y*img_width*sizeof(unsigned int), // Offset in bytes
                          img_width*sizeof(unsigned int), // Size of data to write
                          &img[y*img_width], // Host Source
                          0, NULL, NULL) );
  }
 
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  unsigned int img_dim = img_height*img_width;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) );

  size_t inter_localWS[1] = { workItemSizes[0] };
  size_t inter_globalWS[1] = { img_height * inter_localWS[0] };
  
  size_t final_localWS[1] = { workItemSizes[0] };
  size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) /
                                          (int)final_localWS[0])*(int)final_localWS[0] };
  
  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

  for (int iter = 0; iter < numIterations; iter++) {
    unsigned int ranges_h[2] = {UINT32_MAX, 0};
    
    // how about something like
    // __global__ unsigned int ranges[2];
    // ...kernel
    // __shared__ unsigned int s_ranges[2];
    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
    // __syncthreads();
    
    // Although then removing the blocking cudaMemcpy's might cause something about
    // concurrent kernel execution.
    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?


  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to write
                          ranges_h, // Host Source
                          0, NULL, NULL) );
                          
  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
                          0, // Offset in bytes
                          histo_width*histo_height*sizeof(unsigned int), // Size of data to write
                          zeroData, // Host Source
                          0, NULL, NULL) );
                          
  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);

  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0,
                            inter_globalWS, inter_localWS, 0, 0, 0) );              
  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);                            
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
                            final_globalWS, final_localWS, 0, 0, 0) );                           
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
                          0, // Offset in bytes
                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
                          histo, // Host Source
                          0, NULL, NULL) );                         

  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
  
  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );

  if (parameters->outFile) {
    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  free(zeroData);
  free(img);
  free(histo);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  
  pb_DestroyTimerSet(&timers);

  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );

  return 0;
}
bool
runTest( int argc, const char** argv, ReduceType datatype) 
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads;

    cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);        
    clReleaseKernel(reductionKernel);

    if (smallBlock) 
      maxThreads = 64;  // number of threads per block
    else
      maxThreads = 128;

    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
    
    shrLog(" %d elements\n", size);
    shrLog(" %d threads (max)\n", maxThreads);

    cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);

#ifdef GPU_PROFILING
    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
        return true;
    }
    else
#endif
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T* h_idata = (T*)malloc(bytes);

        for(int i=0; i<size; i++) 
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;
        shrLog(" %d blocks\n\n", numBlocks);

        // allocate mem for the result on host side
        T* h_odata = (T*)malloc(numBlocks * sizeof(T));

        // allocate device memory and data
        cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
        cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL);
      
        int testIterations = 100;
        double dTotalTime = 0.0;
        T gpu_result = 0;
        gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction, 
                                        cpuFinalThreshold, &dTotalTime,
                                        h_odata, d_idata, d_odata);

#ifdef GPU_PROFILING
        double reduceTime = dTotalTime/(double)testIterations;
        shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
#endif

        // compute reference solution
        shrLog("\nComparing against Host/C++ computation...\n"); 
        T cpu_result = reduceCPU<T>(h_idata, size);
        if (datatype == REDUCE_INT)
        {
            shrLog(" GPU result = %d\n", gpu_result);
            shrLog(" CPU result = %d\n\n", cpu_result);
            shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");
        }
        else
        {
            shrLog(" GPU result = %.9f\n", gpu_result);
            shrLog(" CPU result = %.9f\n\n", cpu_result);

            double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
            double diff = abs((double)gpu_result - (double)cpu_result);
            shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED");
        }
      
        // cleanup
        free(h_idata);
        free(h_odata);
        clReleaseMemObject(d_idata);
        clReleaseMemObject(d_odata);

        return (gpu_result == cpu_result);
    }
}
Пример #21
0
int
BinomialOption::setupCL()
{
    cl_int status = CL_SUCCESS;
    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };
    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    status = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    // Create and initialize memory objects

    // Set Presistent memory only for AMD platform
    cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;
    // if(isAmdPlatform())
    //     inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

    // Create memory object for stock price
    randBuffer = clCreateBuffer(context,
                                inMemFlags,
                                numSamples * sizeof(cl_float4),
                                NULL,
                                &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuffer)");

    // Create memory object for output array
    outBuffer = clCreateBuffer(context,
                               CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                               numSamples * sizeof(cl_float4),
                               NULL,
                               &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outBuffer)");

    // create a CL program using the kernel source 
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("BinomialOption_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string("");
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program,
                            "binomial_options",
                            &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]);
    CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed");

    // If group-size is gerater than maximum supported on kernel
    if((size_t)(numSteps + 1) > kernelInfo.kernelWorkGroupSize)
    {
        if(!quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << (numSteps + 1) << std::endl;
            std::cout << "Max Group Size supported on the kernel : " 
                      << kernelInfo.kernelWorkGroupSize << std::endl;
            std::cout << "Using appropiate group-size." << std::endl;
            std::cout << "-------------------------------------------" << std::endl;
        }
        numSteps = (cl_int)kernelInfo.kernelWorkGroupSize - 2;
    }

    return SDK_SUCCESS;
}
Пример #22
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufY, bufParam;
    cl_event event = NULL;
    int ret = 0;
	int lenX = 1 + (N-1)*abs(incx);
	int lenY = 1 + (N-1)*abs(incy);
	int lenParam = 5;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
    bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasSrotm(N, bufX, 0, incx, bufY, 0, incy, bufParam, 0, 1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSrotm() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                    Y, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                    X, 0, NULL, NULL);

        /* At this point you will get the result of SROTM placed in vector Y. */
        printResult();
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufParam);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Пример #23
0
int main(int argc, char **argv)
{

	cl_int ret;


	/*
	 * Command line
	 */
	char *binary_path;
	if (argc != 2)
	{
		printf("syntax: %s <binary>\n", argv[0]);
		exit(1);
	}
	binary_path = argv[1];


	/*
	 * Platform
	 */

	/* Get platform */
	cl_platform_id platform;
	cl_uint num_platforms;
	ret = clGetPlatformIDs(1, &platform, &num_platforms);
	if (ret != CL_SUCCESS)
	{
		printf("error: second call to 'clGetPlatformIDs' failed\n");
		exit(1);
	}
	printf("Number of platforms: %d\n", num_platforms);

	/* Get platform name */
	char platform_name[100];
	ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetPlatformInfo' failed\n");
		exit(1);
	}
	printf("platform.name='%s'\n", platform_name);
	printf("\n");



	/*
	 * Device
	 */

	/* Get device */
	cl_device_id device;
	cl_uint num_devices;
	ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetDeviceIDs' failed\n");
		exit(1);
	}
	printf("Number of devices: %d\n", num_devices);

	/* Get device name */
	char device_name[100];
	ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetDeviceInfo' failed\n");
		exit(1);
	}
	printf("device.name='%s'\n", device_name);
	printf("\n");



	/*
	 * Context
	 */
	
	/* Create context */
	cl_context context;
	context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateContext' failed\n");
		exit(1);
	}

	

	/*
	 * Command Queue
	 */
	
	/* Create command queue */
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(context, device, 0, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateCommandQueue' failed\n");
		exit(1);
	}
	printf("\n");



	/*
	 * Program
	 */
	
	/* Program binary */
	const unsigned char *binary;
	size_t binary_length;

	/* Read binary */
	binary = read_buffer(binary_path, &binary_length);
	if (!binary)
	{
		printf("error: %s: cannot open binary\n", binary_path);
		exit(1);
	}
	
	/* Create a program */
	cl_program program;
	program = clCreateProgramWithBinary(context, 1, &device, &binary_length,
			&binary, NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateProgramWithSource' failed\n");
		exit(1);
	}

	/* Build program */
	ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	if (ret != CL_SUCCESS )
	{
		size_t size;
		char *log;

		/* Get log size */
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);

		/* Allocate log and print */
		log = malloc(size);
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, log, NULL);
		printf("error: call to 'clBuildProgram' failed:\n%s\n", log);

		/* Free log and exit */
		free(log);
		exit(1);
	}
	printf("program built\n");
	printf("\n");



	/*
	 * Kernel
	 */
	
	/* Create kernel */
	cl_kernel kernel;
	kernel = clCreateKernel(program, "vector_add", &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateKernel' failed\n");
		exit(1);
	}
	printf("\n");


	/*
	 * Buffers
	 */
	
	/* Create and allocate host buffers */
	size_t num_elem = 10;

	cl_int *src1_host_buffer;
	cl_int *src2_host_buffer;
	cl_int *dst_host_buffer;
	src1_host_buffer = malloc(num_elem * sizeof(cl_int));
	src2_host_buffer = malloc(num_elem * sizeof(cl_int));
	dst_host_buffer = malloc(num_elem * sizeof(cl_int));

	/* Initialize host source buffer */
	int i;
	for (i = 0; i < num_elem; i++)
	{
		src1_host_buffer[i] = i;
		src2_host_buffer[i] = 100;
	}
	
	/* Create device source buffers */
	cl_mem src1_device_buffer;
	cl_mem src2_device_buffer;
	src1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL);
	src2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL);
	if (!src1_device_buffer || !src2_device_buffer)
	{
		printf("error: could not create destination buffer\n");
		exit(1);
	}

	/* Create device destination buffer */
	cl_mem dst_device_buffer;
	dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem * sizeof(cl_int), NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: could not create destination buffer\n");
		exit(1);
	}

	/* Copy buffer */
	ret = clEnqueueWriteBuffer(command_queue, src1_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), src1_host_buffer, 0, NULL, NULL);
	ret |= clEnqueueWriteBuffer(command_queue, src2_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), src2_host_buffer, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueWriteBuffer' failed\n");
		exit(1);
	}


	/*
	 * Kernel arguments
	 */
	
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1_device_buffer);
	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src2_device_buffer);
	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clSetKernelArg' failed\n");
		exit(1);
	}
	
	
	/*
	 * Launch Kernel
	 */
	
	size_t global_work_size = num_elem;
	size_t local_work_size = num_elem;

	/* Launch the kernel */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
		&global_work_size, &local_work_size, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
		exit(1);
	}

	/* Wait for it to finish */
	clFinish(command_queue);


	/*
	 * Result
	 */
	
	/* Receive buffer */
	ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), dst_host_buffer, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueReadBuffer' failed\n");
		exit(1);
	}

	/* Print result */
	for (i = 0; i < num_elem; i++)
		printf("dst_host_buffer[%d] = %d\n", i, dst_host_buffer[i]);
	printf("\n");

	return 0;
}
void test_variable_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	int id, devid, ret;
	int factor = *(int *) args;

        cl_int             err;
	cl_kernel          kernel;
	cl_command_queue   queue;
	cl_event           event;

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]);

	cl_context context;
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);
	starpu_opencl_get_context(devid, &context);

	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
		sizeof(int), &variable_config.copy_failed, &err);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"variable_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(fail), &fail);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global = 1;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	err = clEnqueueReadBuffer(queue,
				  fail,
				  CL_TRUE,
				  0, 
				  sizeof(int),
				  &variable_config.copy_failed,
				  0,
				  NULL,
				  NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
	return;
}
int
FastWalshTransform::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;

    if(sampleArgs->deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //sampleArgs->deviceType = "gpu"
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(sampleArgs->isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId,
                               sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed");

    // Display available devices.
    retValue = displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed");


    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    context = clCreateContextFromType(
                  cps,
                  dType,
                  NULL,
                  NULL,
                  &status);
    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = getDevices(context, &devices, sampleArgs->deviceId,
                        sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");

    {
        // The block is to move the declaration of prop closer to its use
        cl_command_queue_properties prop = 0;
        commandQueue = clCreateCommandQueue(
                           context,
                           devices[sampleArgs->deviceId],
                           prop,
                           &status);
        CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed.");
    }

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    inputBuffer = clCreateBuffer(
                      context,
                      CL_MEM_READ_WRITE,
                      sizeof(cl_float) * length,
                      0,
                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)");

    // create a CL program using the kernel source
    buildProgramData buildData;
    buildData.kernelName = std::string("FastWalshTransform_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string("");
    if(sampleArgs->isLoadBinaryEnabled())
    {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }

    if(sampleArgs->isComplierFlagsSpecified())
    {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    retValue = buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed");

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program, "fastWalshTransform", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

    return SDK_SUCCESS;
}
Пример #26
0
int main(int argc, char **argv)
{
    int	start,end;
    unsigned long	p[64], c[64], k[56];
    unsigned long	res;

    build_samples (p, c, k, 0);
    set_low_keys(k);

    cl_platform_id cpPlatform;
    clGetPlatformIDs(1, &cpPlatform, NULL);

    cl_device_id cdDevice;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

    char cBuffer[1024];
    clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DEVICE_NAME:\t\t%s\n", cBuffer);
    clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DRIVER_VERSION:\t%s\n\n", cBuffer);
    cl_uint compute_units;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
    printf("CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units);
    size_t workitem_dims;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
    printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
    size_t workitem_size[3];
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
    printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
    size_t workgroup_size;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
    printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
    cl_uint clock_frequency;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
    printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);

    cl_context GPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, NULL);

    cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL);

    cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                                       CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, p, NULL);
    cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                                       CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, c, NULL);
    cl_mem GPUVector3 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                                       CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 56, k, NULL);

    cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                            sizeof(unsigned long), &res, NULL);

    size_t szKernelLength;
    char* cSourceCL = oclLoadProgSource("ocl_deseval.cl", "", &szKernelLength);
    cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1,
                               (const char **)&cSourceCL, &szKernelLength, NULL);

    if (clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL)!=CL_SUCCESS)
    {
        char cBuffer[2048];
        if(clGetProgramBuildInfo(OpenCLProgram,cdDevice,CL_PROGRAM_BUILD_LOG,sizeof(cBuffer),cBuffer,NULL)==CL_SUCCESS);
        printf("Build error:\n%s\n",cBuffer);
        exit(1);
    }
    cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "keysearch", NULL);

    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
    clSetKernelArg(OpenCLVectorAdd, 3, sizeof(cl_mem), (void*)&GPUVector3);

    size_t WorkSize[1] = {1024};
    start=clock();
    for (int i=0; i<1024; i++) {
        //clEnqueueWriteBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0,
        //									56 * sizeof(unsigned long), k, 0, NULL, NULL);
        clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL,
                               WorkSize, NULL, 0, NULL, NULL);
        //clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0,
        //									sizeof(unsigned long), &res, 0, NULL, NULL);
        if(res!=0) {
            printf("Key found\n");
            //key_found(res,k);
            break;
        }
        increment_key (k);
    }
    end=clock();

    clReleaseKernel(OpenCLVectorAdd);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);

    printf ("Searched %i keys in %.3f seconds\n", 1000000, ((double)(end-start))/CLOCKS_PER_SEC);
    return 0;
}
Пример #27
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufX;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float),
                          NULL, &err);
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
                          NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
                               N * lda * sizeof(cl_float), A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
                               N * sizeof(cl_float), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasStbsv(order, uplo, trans, diag, N, K,
                      bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event);

    if (err != CL_SUCCESS) {
        printf("clblasStbsv() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float),
                                  X, 0, NULL, NULL);

        /* At this point you will get the result of STBSV placed in X array. */
        printResult();
    }

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufA);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Пример #28
0
int main()
{
  size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 };
  cl_int err;
  cl_platform_id platforms[1];
  cl_uint nplatforms;
  cl_device_id devices[1]; // + 1 for duplicate test
  cl_uint num_devices;
  cl_program program = NULL;
  cl_kernel kernelA = NULL;
  cl_kernel kernelB = NULL;
  cl_kernel kernelC= NULL;
  char inputA[] = "A";
  char inputB[] = "B";
  char inputC[] = "C";
  cl_mem inputBufferA = NULL;
  cl_mem inputBufferB = NULL;
  cl_mem inputBufferC = NULL;
  /* command queues */
  cl_command_queue queueA = NULL;
  cl_command_queue queueB = NULL;
  cl_command_queue queueC = NULL;
  /* events */
  cl_event eventA1 = NULL;
  cl_event eventB2 = NULL;
  cl_event eventA3 = NULL;
  cl_event eventB4 = NULL;
  /* event wait lists */
  cl_event B2_wait_list[1];
  cl_event A3_wait_list[1];
  cl_event B4_wait_list[1];
  cl_event C5_wait_list[2];

  err = clGetPlatformIDs(1, platforms, &nplatforms);	
  if (err != CL_SUCCESS && !nplatforms)
    return EXIT_FAILURE;
  
  err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 1,
                       devices, &num_devices);  
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  cl_context context = clCreateContext(NULL, num_devices, devices, NULL, 
                                       NULL, &err);
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  err = clGetContextInfo(context, CL_CONTEXT_DEVICES,
                         sizeof(cl_device_id), devices, NULL);
  if (err != CL_SUCCESS) 
    {
      puts("clGetContextInfo call failed\n");
      goto error;
    }

  queueA = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (!queueA) 
    {
      puts("clCreateCommandQueue call failed\n");
      goto error;
    }

  queueB = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (!queueB) 
    {
      puts("clCreateCommandQueue call failed\n");
      goto error;
    }

  queueC = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (!queueB) 
    {
      puts("clCreateCommandQueue call failed\n");
      goto error;
    }

  inputBufferA = clCreateBuffer(context, 
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                strlen (inputB)+1, (void *) inputA, &err);
  if (inputBufferA == NULL)
    {
      printf("clCreateBuffer call failed err = %d\n", err);
      goto error;
    }

  inputBufferB = clCreateBuffer(context, 
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                strlen (inputA)+1, (void *) inputB, &err);
  if (inputBufferB == NULL)
    {
      printf("clCreateBuffer call failed err = %d\n", err);
      goto error;
    }
  
  inputBufferC = clCreateBuffer(context, 
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                strlen (inputA)+1, (void *) inputC, &err);
  if (inputBufferC == NULL)
    {
      printf("clCreateBuffer call failed err = %d\n", err);
      goto error;
    }
  
  
  size_t kernel_size = strlen (kernelASourceCode);
  char* kernel_buffer = kernelASourceCode;
  
  program = clCreateProgramWithSource (context, 1, 
                                       (const char**)&kernel_buffer, 
                                       &kernel_size, &err);
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  err = clBuildProgram (program, num_devices, devices, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  kernelA = clCreateKernel (program, "test_kernel", NULL); 
  if (!kernelA) 
    {
      puts("clCreateKernel call failed\n");
      goto error;
    }

  kernelB = clCreateKernel (program, "test_kernel", NULL); 
  if (!kernelB) 
    {
      puts("clCreateKernel call failed\n");
      goto error;
    }
  
  kernelC = clCreateKernel (program, "test_kernel", NULL); 
  if (!kernelC) 
    {
      puts("clCreateKernel call failed\n");
      goto error;
    }
  
  err = clSetKernelArg (kernelA, 0, sizeof (cl_mem), &inputBufferA);
  if (err)
    {
      puts("clSetKernelArg failed\n");
      goto error;
    }
 
  err = clSetKernelArg (kernelB, 0, sizeof (cl_mem), &inputBufferB);
  if (err)
    {
      puts("clSetKernelArg failed\n");
      goto error;
    }
  
  err = clSetKernelArg (kernelC, 0, sizeof (cl_mem), &inputBufferC);
  if (err)
    {
      puts("clSetKernelArg failed\n");
      goto error;
    }

    

  /* first enqueue A1*/
  err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, 
                                local_work_size, 0, NULL, &eventA1); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue B2 */
  B2_wait_list[0] = eventA1;
  err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, 
                                local_work_size, 1, B2_wait_list, &eventB2); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue A3 */
  A3_wait_list[0] = eventB2;
  err = clEnqueueNDRangeKernel (queueA, kernelA, 1, NULL, global_work_size, 
                                local_work_size, 1, A3_wait_list, &eventA3); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue B4 */
  B4_wait_list[0] = eventA3;
  err = clEnqueueNDRangeKernel (queueB, kernelB, 1, NULL, global_work_size, 
                                local_work_size, 1, B4_wait_list, &eventB4); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }

  /* enqueue C5 */
  C5_wait_list[0] = eventA3;
  C5_wait_list[1] = eventB4;
  err = clEnqueueNDRangeKernel (queueC, kernelC, 1, NULL, global_work_size, 
                                local_work_size, 2, C5_wait_list, NULL); 
  if (err != CL_SUCCESS) 
    {
      puts("clEnqueueNDRangeKernel call failed\n");
      goto error;
    }


  clFinish(queueC);
  printf("\n");
  return EXIT_SUCCESS;

 error:
  return EXIT_FAILURE;

}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program binary */
        unsigned char *bin;
        size_t bin_len;
        cl_int bin_ret;
        
        /* Read program binary */
        if (argc == 2)
                bin = read_buffer((char *)argv[1], &bin_len);
        else
        {
                printf("error: No binary specified\n");
                exit(1);
        }
        
        /* Create a program */
        cl_program program;
        program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithBinary' failed\n");
                exit(1);
        }
        if (bin_ret != CL_SUCCESS)
        {
                printf("error: Invalid binary for device\n");
                exit(1);
        }
        printf("program=%p\n", program);
        
        /* Free binary */
        free(bin);
        
        printf("program binary loaded\n");
        printf("\n");

        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object*/
        cl_kernel kernel;
        kernel = clCreateKernel(program, "not_equal_ulong16ulong16", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_ulong16 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong16));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_ulong16 *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_ulong16));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_int16 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_int16));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Пример #30
0
int main()
{
	int i,j,k;
	// nb of operations:
	const int dsize = 512;
	int nthreads = 1;
	int nbOfAverages = 1e2;
	int opsMAC = 2; // operations per MAC
	cl_short4 *in, *out;
	cl_half *ck;
	double tops; //total ops

#define NQUEUES 1
	cl_int err;
	cl_platform_id platform = 0;
	cl_device_id device = 0;
	cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
	cl_context ctx = 0;
	cl_command_queue queues[NQUEUES];
	cl_mem bufin, bufck, bufout;
	cl_event event = NULL;
	cl_program program;
	cl_kernel kernel;
	size_t global[2], local[2];
	size_t param[5];
	char version[300];
  
	// allocate matrices
	
	in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in));
	out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out));
	ck = (cl_half *) calloc(9*9, sizeof(*ck));
	in[0].x = 0x3c00;
	in[1].x = 0x4000;
	in[dsize].x = 0x4100;
	ck[0] = 0x3c00;
	ck[1] = 0x4000;
	ck[9] = 0x3000;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    for(i = 0; i < NQUEUES; i++)
    	queues[i] = clCreateCommandQueue( ctx, device, 0, &err );

	// Print some info about the system
	clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
	printf("CL_DEVICE_VERSION=%s\n", version);
	clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL);
	printf("CL_DRIVER_VERSION=%s\n", version);
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]);
	j = param[0];
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_SIZES=");
	for(i = 0; i < j; i++)
		printf("%d ", (int)param[i]);
	printf("\n");
        clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL);
        printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]);
		
		
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	if(!program)
	{
		printf("Error creating program\n");
		return -1;
	}
	err = clBuildProgram(program, 0, 0, 0, 0, 0);
	if(err != CL_SUCCESS)
	{
		char buffer[20000];
		size_t len;
		
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		puts(buffer);
		return -1;
	}
	kernel = clCreateKernel(program, "conv9x9", &err);
	if(!kernel || err != CL_SUCCESS)
	{
		printf("Error creating kernel\n");
		return -1;
	}

    /* Prepare OpenCL memory objects and place matrices inside them. */
	cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT};
	cl_int rc;
	bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
	bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
    bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck),
                          NULL, &err );

	size_t origin[3] = {0,0,0};
	size_t region[3] = {dsize, dsize, 1};
    err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL );
    err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL );
	clSetKernelArg(kernel, 0, sizeof(int), &dsize);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck);
	clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout);
	local[0] = 8;
	local[1] = 8;
	global[0] = global[1] = dsize-32;
    usleep(100000);

	struct timeval start,end;
	gettimeofday(&start, NULL);

	for (k=0; k<nthreads; k++) {
		//printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
		for(i=0;i<nbOfAverages;i++) {
		// do the 2D convolution
			err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL);
			if(err != CL_SUCCESS)
			{
				printf("clEnqueueNDRangeKernel error %d\n", err);
				return -1;
			}
		}
	}

	clFinish(queues[0]);
	gettimeofday(&end, NULL);
	double t = ((double) (end.tv_sec - start.tv_sec))
	+ ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified!

    /* Wait for calculations to be finished. */

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL );
	clFinish(queues[0]);
	
	printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x);

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufin );
    clReleaseMemObject( bufck );
    clReleaseMemObject( bufout );

    /* Release OpenCL working objects. */
    for(i = 0; i < NQUEUES; i++)
    	clReleaseCommandQueue( queues[i] );
    clReleaseContext( ctx );
	
	// report performance:
	tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops
	printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads);
	printf("\nTime in s: %lf:", t);
	printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9);
	printf("\n");
	return(0);
}