/// Computes the mutual-information (MI) matrix for a sequence set on an
/// OpenCL device and stores the result in MI.
///
/// @param sequence  input sequences plus precomputed one-point probabilities
/// @param MI        output matrix (sequenceLength x sequenceLength), filled
///                  with the values read back from the device
/// @param GPU       true = select a GPU device, false = CPU (passed to OCL)
/// @throws std::runtime_error if the sequence length is not divisible by the
///                  local work-group size
///
/// NOTE(review): on OpenCL errors this function prints a message and calls
/// exit(1), matching the surrounding code's error-handling style.
void computeMIonGPU(SequenceSet& sequence, Matrix<float>& MI, bool GPU) {
    // Initializes OpenCL context, command queue and kernel (owned by ocl).
    OCL ocl(GPU);
    cl_int oclError1, oclError2;
    timeval start, end;

    // Memory sizes.
    size_t sequenceLength = sequence.getSequenceLength();
    size_t numSequences = sequence.getNumberOfSequences();
    // Matrix MI is of size numElements.
    size_t numElements = sequenceLength * sequenceLength;
    size_t sequenceSize = sequence.getNumberOfSequences() * sequenceLength;
    size_t onePointProbsSize = sequenceLength * NUMPROTEINCHARS;

    // Host memory for the device result.
    // FIX: memset previously zeroed only MI.size() *bytes*; the buffer holds
    // MI.size() floats, so the size must be scaled by sizeof(float).
    float* dst = new float[MI.size()];
    memset(dst, 0, MI.size() * sizeof(float));

    // Device memory for sequences, one-point probabilities and result matrix.
    cl_mem oclDevSrcSequence, oclDevSrcOnePointProbs, oclDevDstMI;

    // Size for a work group: each work group computes one matrix entry, thus
    // computes the correlation one time for each character => 25 work items
    // are sufficient.
    size_t localWorkSize[2] = { 5, 5 };
    // FIX: guard both dimensions of the 2-D dispatch (the global size below
    // must be a multiple of the local size in each dimension).
    if (sequenceLength % localWorkSize[0] != 0 ||
        sequenceLength % localWorkSize[1] != 0)
        throw std::runtime_error("sequence length ^ 2 not divisable by local work size");

    // Global work size defines the total amount of threads over all work
    // groups, thus needs to be a multiple of the local work size in each
    // dimension.
    size_t globalWorkSize[2] = { sequenceLength, sequenceLength };

    // Create buffers on the device, one for each input array.
    oclDevSrcSequence = clCreateBuffer(
        ocl.oclContext, CL_MEM_READ_ONLY, sizeof(cl_uchar) * sequenceSize, 0, &oclError1);
    oclDevSrcOnePointProbs = clCreateBuffer(
        ocl.oclContext, CL_MEM_READ_ONLY, sizeof(cl_float) * onePointProbsSize, 0, &oclError2);
    oclError1 |= oclError2;
    oclDevDstMI = clCreateBuffer(
        ocl.oclContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * numElements, 0, &oclError2);
    oclError1 |= oclError2;
    if (oclError1 != CL_SUCCESS) {
        std::cout << "error while allocating buffers" << std::endl;
        exit(1);
    }

    // FIX: the kernel expects cl_uint scalars, but the host variables are
    // size_t (8 bytes on LP64). Passing &sequenceLength with sizeof(cl_uint)
    // hands OpenCL only the first 4 bytes of a size_t, which is
    // endianness-dependent and wrong on big-endian hosts. Copy into real
    // cl_uint temporaries and pass those instead.
    cl_uint clSequenceLength = (cl_uint) sequenceLength;
    cl_uint clNumSequences = (cl_uint) numSequences;

    // Bind buffers and scalars to the kernel arguments.
    oclError1 = clSetKernelArg(ocl.oclKernel, 0, sizeof(cl_mem), (void*)&oclDevSrcSequence);
    oclError1 |= clSetKernelArg(ocl.oclKernel, 1, sizeof(cl_mem), (void*)&oclDevSrcOnePointProbs);
    oclError1 |= clSetKernelArg(ocl.oclKernel, 2, sizeof(cl_mem), (void*)&oclDevDstMI);
    oclError1 |= clSetKernelArg(ocl.oclKernel, 3, sizeof(cl_uint), &clSequenceLength);
    oclError1 |= clSetKernelArg(ocl.oclKernel, 4, sizeof(cl_uint), &clNumSequences);
    if (oclError1 != CL_SUCCESS) {
        std::cout << "error while setting arguments: " << ocl.oclErrorString(oclError1) << std::endl;
        exit(1);
    }

    // Copy host memory to device, non-blocking copy.
    oclError1 = clEnqueueWriteBuffer(
        ocl.oclCmdQueue, oclDevSrcSequence, CL_FALSE, 0,
        sizeof(cl_uchar) * sequenceSize, (const void*) sequence.getData(), 0, 0, 0);
    oclError1 |= clEnqueueWriteBuffer(
        ocl.oclCmdQueue, oclDevSrcOnePointProbs, CL_FALSE, 0,
        sizeof(cl_float) * onePointProbsSize, (const void*) sequence.getOnePointProbs(), 0, 0, 0);
    if (oclError1 != CL_SUCCESS) {
        std::cout << "error while writing to device " << ocl.oclErrorString(oclError1) << std::endl;
        exit(1);
    }

    // Execute kernel LOOPCOUNT times and measure execution time.
    // TODO change LOOPCOUNT to run the kernel multiple times.
    gettimeofday(&start, 0);
    for (int i = 0; i < LOOPCOUNT; ++i) {
        oclError1 = clEnqueueNDRangeKernel(
            ocl.oclCmdQueue, ocl.oclKernel,
            2, // dimension
            0, globalWorkSize, localWorkSize, 0, 0, 0);
        if (oclError1 != CL_SUCCESS) {
            std::cout << "error while executing kernel: " << ocl.oclErrorString(oclError1) << std::endl;
            exit(1);
        }
    }

    // clFinish blocks until all issued commands so far are completed,
    // necessary for computing the execution time.
    // FIX: the result of clFinish was stored but never checked.
    oclError1 = clFinish(ocl.oclCmdQueue);
    if (oclError1 != CL_SUCCESS) {
        std::cout << "error while waiting for queue: " << ocl.oclErrorString(oclError1) << std::endl;
        exit(1);
    }
    gettimeofday(&end, 0);

    // Read memory from device (blocking), store in temporary array and, if no
    // error happened, copy into the result matrix.
    oclError1 = clEnqueueReadBuffer(
        ocl.oclCmdQueue, oclDevDstMI, CL_TRUE, 0,
        sizeof(cl_float) * numElements, dst, 0, 0, 0);
    if (oclError1 != CL_SUCCESS) {
        std::cout << "error while reading from device: " << ocl.oclErrorString(oclError1) << std::endl;
        exit(1);
    }

    std::cout << "execution time: "
              << (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) / 1000
              << " milliseconds" << std::endl;

    // Fill the matrix with the computed results.
    MI.copyElements(dst);

    // Release used memory, can cause really bad crashes otherwise.
    clReleaseMemObject(oclDevSrcSequence);
    clReleaseMemObject(oclDevSrcOnePointProbs);
    clReleaseMemObject(oclDevDstMI);

    // FIX: dst was leaked on the success path; release the host buffer too.
    delete[] dst;
}