Exemple #1
0
void computeMIonGPU(SequenceSet& sequence, Matrix<float>& MI, bool GPU)
{
	// initializes context and kernel and stores them
	OCL ocl(GPU);

	cl_int oclError1, oclError2;

	timeval start, end;

	// memory sizes
	size_t sequenceLength = sequence.getSequenceLength();
	size_t numSequences = sequence.getNumberOfSequences();
	
	// matrix MI is of size numElements
	size_t numElements = sequenceLength * sequenceLength;
	size_t sequenceSize = sequence.getNumberOfSequences() * sequenceLength;
	size_t onePointProbsSize = sequenceLength * NUMPROTEINCHARS;
		
	// host memory
	float * dst = new float[MI.size()];
	memset(dst, 0, MI.size());
	
	// device memory for sequences, one point probablities and resulting matrix
	cl_mem oclDevSrcSequence, oclDevSrcOnePointProbs, oclDevDstMI;
	
	// size for a work group: each workgroup computes one matrix entry, thus computes the correlation
	// one time for each character => 25 work items are sufficient
	size_t localWorkSize[2] = { 5, 5 };
	if (sequenceLength % localWorkSize[0] != 0) throw std::runtime_error("sequence length ^ 2 not divisable by local work size");
	
	// global work size defines the total amount of threads over all work group, thus needs to be a multiple of the local
	// work size in each dimension.
	size_t globalWorkSize[2] = { sequenceLength, sequenceLength };
	
	// create buffer on device, one for each input array
	oclDevSrcSequence = clCreateBuffer(		ocl.oclContext,
											CL_MEM_READ_ONLY,
											sizeof(cl_uchar) * sequenceSize,
											0, &oclError1);

	oclDevSrcOnePointProbs = clCreateBuffer(ocl.oclContext,
											CL_MEM_READ_ONLY,
											sizeof(cl_float) * onePointProbsSize,
											0, &oclError2);
	oclError1 |= oclError2;

	oclDevDstMI = clCreateBuffer(			ocl.oclContext,
											CL_MEM_WRITE_ONLY,
											sizeof(cl_float) * numElements,
											0, &oclError2);
	oclError1 |= oclError2;
	
	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while allocating buffers" << std::endl;
		exit(1);
	}
	
	// set buffer to appropriate kernel arguments
	oclError1 = clSetKernelArg(ocl.oclKernel, 0, sizeof(cl_mem), (void*)&oclDevSrcSequence);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 1, sizeof(cl_mem), (void*)&oclDevSrcOnePointProbs);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 2, sizeof(cl_mem), (void*)&oclDevDstMI);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 3, sizeof(cl_uint), &sequenceLength);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 4, sizeof(cl_uint), &numSequences);

	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while setting arguments: " << ocl.oclErrorString(oclError1) << std::endl;
		exit(1);
	}

	// copy host memory to device, non-blocking copy
	oclError1 = clEnqueueWriteBuffer(	ocl.oclCmdQueue,
						oclDevSrcSequence,
						CL_FALSE,
						0,
						sizeof(cl_uchar) * sequenceSize,
						(const void *) sequence.getData(),
						0, 0, 0);

	oclError1 |= clEnqueueWriteBuffer(	ocl.oclCmdQueue,
						oclDevSrcOnePointProbs,
						CL_FALSE,
						0,
						sizeof(cl_float) * onePointProbsSize,
						(const void *) sequence.getOnePointProbs(),
						0, 0, 0);

	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while writing to device " << ocl.oclErrorString(oclError1) << std::endl;
		exit(1);
	}

	// execute kernel LOOPCOUNT times and measure execution time
	// TODO LOOPCOUNT aendern, um Kernel mehrfach auszufuehren
	gettimeofday(&start, 0);
	for (int i = 0; i < LOOPCOUNT; ++i) {
		oclError1 = clEnqueueNDRangeKernel(	ocl.oclCmdQueue,
							ocl.oclKernel,
							2,			// dimension
							0,
							globalWorkSize,
							localWorkSize,
							0, 0, 0);
	
	
		if (oclError1 != CL_SUCCESS) {
			std::cout << "error while executing kernel: " << ocl.oclErrorString(oclError1) << std::endl;
			exit(1);
		}
	}
	
	// clFinish blocks until all issued commands so far are completed, necessary for computing execution time
	oclError1 = clFinish(ocl.oclCmdQueue);
	gettimeofday(&end, 0);

	// read memory from device, store in temporary array and if no error happend copy to result matrix
	oclError1 = clEnqueueReadBuffer(	ocl.oclCmdQueue,
						oclDevDstMI,
						CL_TRUE,
						0,
						sizeof(cl_float) * numElements,
						dst,
						0, 0, 0);

	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while reading from device: " << ocl.oclErrorString(oclError1) << std::endl;
		exit(1);
	}

	std::cout << "execution time: " 
			<< (end.tv_sec - start.tv_sec ) * 1000 +  ( end.tv_usec - start.tv_usec) / 1000
			<< " milliseconds" << std::endl;
	
	// fill the matrix with the computed results
	MI.copyElements(dst);
	
	// release used memory, can cause really bad crashes otherwise
	clReleaseMemObject(oclDevSrcSequence);
	clReleaseMemObject(oclDevSrcOnePointProbs);
	clReleaseMemObject(oclDevDstMI);
}