void Convolve::start(Signal &signal, Kernel &kernel) {

  write(signal.getUnpaddedSize(),RA_SIZE);
  write(0,RA_IN_START_ADDR);
  write(0,RA_OUT_START_ADDR);

  write(signal.getSignal(), signal.getSize(), 0, MEM_SRAM_0);
  write(kernel.getKernel(), kernel.getSize(), RA_KERNEL);
  write(1,RA_GO); 
}
Example #2
0
void Benchmarker::run(int runTimes)
{
	OCLContext * ocl = OCLContext::getInstance();
	int totalSize = inputImage.getWidth() * inputImage.getHeight() * 4;

	// error code
	cl_int ciErrNum;
	
	// events and profiling vars
	cl_event eventGlobal;
	cl_int errcode_ret;
	cl_ulong end, start;
	
	size_t globalWorkSize[] = { inputImage.getWidth(), inputImage.getHeight() };
	
	for (int i = 0; i < kernels.size(); i++)
	{
		Kernel * kernel = kernels[i];
		kernel->clear();
		
		// get local work size
		size_t localWorkSize[] = { kernel->getLocalWorkSizeX(),  kernel->getLocalWorkSizeY() };
		
		cout << kernel->filename << "\t\t\t" << flush;
		
		bool failure = false;
		bool skipped = SKIP_EXCESS_LOCAL_MEM &&
			kernel->usesLocalMem() && kernel->getLocalMemSize() > MAX_LOCAL_MEM_SIZE ? true : false;
		if (skipped)
		{
			goto skipSim;
		}
		for (int t = 0; t < runTimes; t++)
		{
			// run all kernels
			ciErrNum = clEnqueueNDRangeKernel(
				ocl->getQueue(), kernel->getKernel(), 
				2, NULL, globalWorkSize, kernel->usesLocalMem() ? localWorkSize : NULL, 0, 0, &eventGlobal );
			if (ciErrNum != CL_SUCCESS)
			{
				failure = true;
				break;
			}
	
			// lets do some profiling
			errcode_ret = clWaitForEvents(1, &eventGlobal);
			oclCheckError(errcode_ret, CL_SUCCESS);
			errcode_ret = clGetEventProfilingInfo(eventGlobal, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, 0);
			errcode_ret |= clGetEventProfilingInfo(eventGlobal, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, 0);
			
			kernel->runTimes.push_back( (end-start)*1.0e-6f );
		}
		
	skipSim:
		if (failure || skipped)
		{
			/*
			memset( kernel->results, 0, totalSize * sizeof(float) );
			kernel->makeOpenGLTexture();
			*/
			cout << (skipped ? "exceeds local mem" : "FAILED !");
			cout << endl;
			
		}
		else
		{
			// read alien / milirary data
				
			ciErrNum = clEnqueueReadBuffer(ocl->getQueue(), output_cl, true, 0, 
				totalSize * sizeof(float), kernel->results, 0, 0, 0);
			shrCheckErrorEX(ciErrNum, CL_SUCCESS, &OCLContext::Cleanup);
			kernel->makeOpenGLTexture();

			// average run times
			kernel->averageRunTimes();
			cout  << fixed << setprecision( 1 ) << kernel->avgRunTime << endl;
			cout << flush;
		}
	}
}
Example #3
0
Benchmarker::Benchmarker(int _filterSize)
{
	OCLContext * ocl = OCLContext::getInstance();
	
	filterSize = _filterSize;
	generateFilter();

	// load input image
	if (!inputImage.isLoaded())
	{
		inputImage.readFile(getInputFile(), 4);
	}
	//cout << "\nLoaded image: " << inputImage.getWidth() << " x " << inputImage.getHeight() << endl;

	// allocate memory
	int totalSize = inputImage.getWidth() * inputImage.getHeight() * 4;	
	float * inputBuffer = new float[ totalSize ];
	
	// set pointers to beginning of buffers
	const unsigned char * p = inputImage.getPixel(0, 0);
	float * b = inputBuffer;
	
	for (int r = 0; r < inputImage.getHeight(); r++)
	{
		for (int c = 0; c < inputImage.getWidth(); c++, p += 4, b += 4)
		{
			b[0] = (float) p[0];
			b[1] = (float) p[1];
			b[2] = (float) p[2];
			b[3] = (float) p[3];
		}
	}
	
	// set compiler options
	// (two predefined macros specifying size of image and filter)
	sprintf(compilerOptions, "-D IMAGE_W=%d -D IMAGE_H=%d -D FILTER_SIZE=%d -D HALF_FILTER_SIZE=%d -D TWICE_HALF_FILTER_SIZE=%d -D HALF_FILTER_SIZE_IMAGE_W=%d",
		inputImage.getWidth(),
		inputImage.getHeight(),
		filterSize,
		filterSize/2,
		(filterSize/2) * 2,
		(filterSize/2) * inputImage.getWidth()
	);
	ocl->setCompilerOptions(compilerOptions);
	
	// input buffer
	cl_int ciErrNum;
	input_cl = clCreateBuffer(ocl->getGPUContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		totalSize * sizeof(float), inputBuffer, &ciErrNum);
	shrCheckErrorEX(ciErrNum, CL_SUCCESS, &OCLContext::Cleanup);
	
	// output buffer
	output_cl = clCreateBuffer(ocl->getGPUContext(), CL_MEM_WRITE_ONLY,
		totalSize * sizeof(float), NULL, &ciErrNum);
	shrCheckErrorEX(ciErrNum, CL_SUCCESS, &OCLContext::Cleanup);

	// filter
	filter_cl = clCreateBuffer(ocl->getGPUContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		filterSize * filterSize * sizeof(float) * 4, filter, &ciErrNum);
	shrCheckErrorEX(ciErrNum, CL_SUCCESS, &OCLContext::Cleanup);
	
	// cleanup
	delete [] inputBuffer;
	
	// load all kernels
	// --------------------------
	vector<string> clDir;
	if (! listDir("cl", clDir) )
	{
		cerr << "Error opening 'cl/'" << endl;
		return;
	}
	
	cout << "Loading kernels for filter size " << filterSize << " x " << filterSize << " ..." << endl;
	for (int i = 0; i < clDir.size(); i++)
	{
		string clName = getFileName(clDir[i]);
		string filename = string("cl/") + clDir[i];
		cout << "\t--> " << filename;

		// determine if kernel uses local memory (name should have the world 'local')
		bool kernelOk = true;
		bool localMem = false;
		int localWorkSizeX = 0, localWorkSizeY = 0;
		int localMemSize = 0;
		
		if (clName.find("local") != string::npos)
		{
			localMem = true;
			kernelOk = false;
			
			vector<string> * tokens = split(getFileName( clName ), "-");
			if (tokens->size() != 2)
			{
				localMem = false;
				cout << " uses local mem, but no local work size defined.";
			}
			else
			{				
				// find the X
				size_t pos = tokens->at(1).find("x");
				if (pos == string::npos)
				{
					localMem = false;
					cout << " uses local mem, but no local work size defined.";
					goto skip;
				}
				
				if (
					!from_string(localWorkSizeX, tokens->at(1).substr(0, pos)) ||
					!from_string(localWorkSizeY, tokens->at(1).substr(pos+1))
				)
				{
					localMem = false;
					cout << " uses local mem, but no local work size defined.";
					goto skip;
					
				}
			
				kernelOk = true;
					
				// calculate local memory size
				localMemSize = 
					( localWorkSizeX + 2 * (filterSize / 2) ) *
					( localWorkSizeY + 2 * (filterSize / 2) );
				localMemSize *= 4 * sizeof(float);
				
				//kernel->localMemSize = localMemSize;
				cout << " LOCAL work: " << localWorkSizeX << " x " << localWorkSizeY << ", LOCAL mem: " << localMemSize << endl << "\t\t\t";
			}
		skip:
			delete tokens;
		}
		
		// load the file
		char * sourceFile = readTextFile(filename.c_str());
		string modifiedSource = sourceFile;
		
		if (localMem)
		{
			if (!unrollLoop_local(filterSize, localWorkSizeX, sourceFile, modifiedSource))
			{
				modifiedSource = sourceFile;
			}
		}
		else
		{
			if (!unrollLoop(filterSize, sourceFile, modifiedSource))
			{
				modifiedSource = sourceFile;
			}
		}
		delete [] sourceFile;
				
		// load the kernel
		Kernel * kernel = new Kernel( 
			inputImage.getWidth(), inputImage.getHeight(),
			filename.c_str(), "convolute", modifiedSource.c_str()
		);
		kernel->filename = clName;
		kernel->localMem = localMem;
		if (localMem)
		{
			kernel->localMemSize = localMemSize;
			kernel->localWorkSizeX = localWorkSizeX;
			kernel->localWorkSizeY = localWorkSizeY;
		}
		
		if (kernelOk)
		{
			// setup arguments for kernel
			ciErrNum  = clSetKernelArg(kernel->getKernel(), 0, sizeof(cl_mem), (void *) &input_cl);
			ciErrNum |= clSetKernelArg(kernel->getKernel(), 1, sizeof(cl_mem), (void *) &output_cl);
			ciErrNum |= clSetKernelArg(kernel->getKernel(), 2, sizeof(cl_mem), (void *) &filter_cl);
			
			if (kernel->usesLocalMem())
			{
				ciErrNum |= clSetKernelArg(kernel->getKernel(), 3, kernel->localMemSize, 0 );
			}
			
			// check error
			shrCheckErrorEX(ciErrNum, CL_SUCCESS, &OCLContext::Cleanup);
		
			cout << "\t\tOK" << endl;
		
			// add to list of kernel
			kernels.push_back(kernel);
		}
		else
		{
			delete kernel;
			cout << "\t\tFailed." << endl;
		}
	}
}