Example #1
0
/**
 * Creates a tightly packed 3D matrix (the row pitch equals the row width in bytes),
 * which is useful for FFT libraries such as CUFFT that expect contiguous data.
*/
DeviceMatrixCL3D::Ptr makeDeviceMatrixCL3DPacked(size_t dim_t, size_t dim_y,
	size_t dim_x)
{
	DeviceMatrixCL3D* mat = new DeviceMatrixCL3D();
	mat->dim_x = dim_x;
	mat->dim_y = dim_y;
	mat->dim_t = dim_t;

	TheContext * tc = new TheContext();

	cl_context GPUContext = tc->getMyContext()->getContextCL();

	// Packed layout: the row pitch (in bytes) is exactly one row of floats,
	// so rows and slices are stored back to back with no padding.
	mat->pitch_y = dim_x * sizeof(float);
	mat->pitch_t = dim_y * mat->pitch_y;

	const size_t mem_size = mat->dim_t * mat->pitch_t;

	cl_int err;

	mat->dataMatrix = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, mem_size, NULL, &err);
	if (err != CL_SUCCESS)
	{
		printf("Error Code create buffer: %d\n", err);
	}

	return DeviceMatrixCL3D::Ptr(mat, deleteDeviceMatrixCL3D);	
}
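A minimal usage sketch for the packed allocator above (a hypothetical call site; the 4 x 8 x 16 dimensions are illustrative and the checks assume <cassert> is included):

// Allocate a packed 4 x 8 x 16 float volume and verify that the byte pitches
// leave no padding between rows or slices.
DeviceMatrixCL3D::Ptr vol = makeDeviceMatrixCL3DPacked(4, 8, 16);   // dim_t, dim_y, dim_x
assert(vol->pitch_y == 16 * sizeof(float));
assert(vol->pitch_t == 8 * vol->pitch_y);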
Example #2
0
void DeviceMatrixCL_copyFromDevice(const DeviceMatrixCL& self, float* dst)
{
	if ((self.width > 0) && (self.height > 0)) {
		TheContext * tc = new TheContext();

		// Origins are zero; region is {row width in bytes, rows, slices}.
		size_t buffer_origin[3] = {0, 0, 0};
		size_t host_origin[3] = {0, 0, 0};
		size_t region[3] = {self.width * sizeof(float),
			self.height,
			1};

		cl_int err =
			clEnqueueReadBufferRect(
			tc->getMyContext()->cqCommandQueue,
			self.dataMatrix, CL_TRUE,
			buffer_origin, host_origin, region,
			self.pitch, 0,                    // buffer row pitch (bytes), slice pitch
			self.width * sizeof(float), 0,    // host row pitch (bytes), slice pitch
			dst,
			0, NULL, NULL);

		if (err != CL_SUCCESS) {
			std::cout << "Error in copyFromDevice (CODE: " << err << ")" << std::endl;
		}
	}
}
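A minimal host-side round trip (a sketch, assuming makeDeviceMatrixCL and DeviceMatrixCL_copyToDevice from the other examples in this file, plus <vector> and <cassert>; the 4 x 3 size is illustrative):

// Upload a small host array, read it back through the pitched device buffer,
// and check that every value survives the round trip.
std::vector<float> src(4 * 3), back(4 * 3);
for (size_t i = 0; i < src.size(); ++i) src[i] = float(i);

DeviceMatrixCL::Ptr m = makeDeviceMatrixCL(4, 3);   // height, width
DeviceMatrixCL_copyToDevice(*m, src.data());
DeviceMatrixCL_copyFromDevice(*m, back.data());

for (size_t i = 0; i < src.size(); ++i)
	assert(src[i] == back[i]);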
Example #3
0
void DeviceMatrixCL3D_copyToDevice(DeviceMatrixCL3D& self, const float* data)
{
    if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) {
		TheContext * tc = new TheContext();

		// region is {row width in bytes, rows per slice, number of slices}.
		size_t buffer_origin[3] = {0, 0, 0};
		size_t host_origin[3] = {0, 0, 0};
		size_t region[3] = {
			self.dim_x * sizeof(float),
			self.dim_y,
			self.dim_t};

		cl_int err = clEnqueueWriteBufferRect(
			tc->getMyContext()->cqCommandQueue,
			self.dataMatrix, CL_TRUE,
			buffer_origin, host_origin, region,
			self.pitch_y, 0,                  // buffer row pitch (bytes), slice pitch
			sizeof(float) * self.dim_x, 0,    // host row pitch (bytes), slice pitch
			data,
			0, NULL, NULL);

		if (err != CL_SUCCESS) {
			std::cout << "Error in copyToDevice (CODE: " << err << ")" << std::endl;
		}
    }
}
Example #4
0
void cell_histogram_dense_device_cl(DeviceMatrixCL3D* histogram,
                                 const DeviceMatrixCL* assignments,
                                 const DeviceMatrixCL* weights,
                                 const int max_bin,
                                 const int cell_size,
                                 const int start_y,
                                 const int start_x)
{
	
	histogram->zero();
	
	const size_t local_work_size[2] = {BLOCK_SIZE, BLOCK_SIZE};

	// The work-group grid covers the (dim_t, dim_y) extent at half resolution
	// (rounded up); the global work size must be a multiple of the local work size.
	const size_t grid_ry = (histogram->dim_t + 1) / 2;
	const size_t grid_cx = (histogram->dim_y + 1) / 2;

	const size_t global_work_size[2] = {grid_ry * local_work_size[0],
	                                    grid_cx * local_work_size[1]};

	assert(histogram->dim_x == max_bin);
	
	
	TheContext* tc = new TheContext();
	cl_context GPUContext = tc->getMyContext()->getContextCL();
	cl_device_id cdDevice = tc->getMyContext()->getDeviceCL();
	
	MyKernels *kernels = new MyKernels(GPUContext, cdDevice);

	cl_kernel theKernel = kernels->getCellHistogramKernel2();

	cl_int err = 0;
	
    err =  parameters_histogram_dense(theKernel, histogram, assignments,
												weights,max_bin,cell_size,start_y,
												start_x);	
  	
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments 3! %d\n", err);
        exit(1);
    }
	
	err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, 
								 theKernel, 2, NULL, 
								 global_work_size, local_work_size, 0, NULL, NULL);
	clFinish(tc->getMyContext()->cqCommandQueue);// to make sure the kernel completed

    if (err) {
        printf("Error: Failed to execute kernel! %d\n", err);
        exit(1);
    }
}
Example #5
0
DeviceMatrixCL3D::Ptr makeDeviceMatrixCL3D(size_t dim_t, size_t dim_y, 
	size_t dim_x){
		DeviceMatrixCL3D* mat = new DeviceMatrixCL3D();
		mat->dim_x = dim_x;
		mat->dim_y = dim_y;
		mat->dim_t = dim_t;

		TheContext * tc = new TheContext();

		cl_context GPUContext = tc->getMyContext()->getContextCL();
		cl_device_id cdDevice = tc->getMyContext()->getDeviceCL();

		/* The optimal pitch is computed by (1) getting the base address alignment
		   preference for your card (the CL_DEVICE_MEM_BASE_ADDR_ALIGN property from
		   clGetDeviceInfo; note that the returned value is in bits, so divide by 8
		   to get bytes); */

		cl_uint align_bits = 0;
		clGetDeviceInfo(cdDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
			sizeof(align_bits), &align_bits, NULL);
		const size_t alignment = align_bits / 8;  // bytes

		const size_t naturalPitch = sizeof(float) * mat->dim_x;

		/* Let's call this value "base". (2) Find the smallest multiple of base
		   that is no less than the natural data pitch (sizeof(type) times the
		   number of columns); */

		const size_t devicepitch = ((naturalPitch + alignment - 1) / alignment) * alignment;

		// Note: this version keeps the natural (unpadded) pitch; a padded variant
		// would use devicepitch instead.
		mat->pitch_y = naturalPitch;
		mat->pitch_t = dim_y * mat->pitch_y;

		// (3) Allocate pitch_t times the number of slices (in bytes) and pass the
		// pitch information to the kernels.
		const size_t mem_size = mat->dim_t * mat->pitch_t;

		//std::cout << "Mem size: " << mem_size << std::endl;

		int err;

		mat->dataMatrix = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, mem_size, NULL, &err);
		if(err!=0)
		{
			printf("Error Code create buffer: %d\n",err);
		}

		return DeviceMatrixCL3D::Ptr(mat, deleteDeviceMatrixCL3D);
}
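A worked sketch of the pitch computation described in the comments above, with illustrative numbers (the helper name is hypothetical; a 4096-bit alignment preference is common but device-dependent):

#include <cstdio>

// Round a natural row pitch up to the device's base-address alignment.
static size_t aligned_pitch(size_t natural_pitch_bytes, size_t align_bits)
{
	const size_t align_bytes = align_bits / 8;
	return ((natural_pitch_bytes + align_bytes - 1) / align_bytes) * align_bytes;
}

int main()
{
	// 100 float columns give a 400-byte natural pitch; a 4096-bit (512-byte)
	// alignment preference rounds that up to one 512-byte row.
	std::printf("%zu\n", aligned_pitch(100 * sizeof(float), 4096));   // prints 512
	return 0;
}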
Example #6
0
void min_cl_local(const DeviceMatrixCL* matrix, DeviceMatrixCL* output)
{
	
	TheContext* tc = new TheContext();
	
    cl_context GPUContext = tc->getMyContext()->getContextCL();
    cl_device_id cdDevice = tc->getMyContext()->getDeviceCL();
	
    // Creates the program
    // Uses NVIDIA helper functions to get the code string and its size (in bytes)
  	
	MyKernels *kernels = new MyKernels(GPUContext,cdDevice);
	
	cl_kernel theKernel= kernels->getMinKernel();
	
	
	cl_int err = parameters_minmax_local(theKernel, matrix, output);
  	
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments 3! %d\n", err);
        exit(1);
    }
	
	
	const size_t local_work_size[2] = {256, 1};

	// Round the height up to a multiple of the work-group size so the global
	// work size is an exact multiple of the local work size.
	const size_t n_blocks_x = ((matrix->height - 1) / local_work_size[0] + 1) * local_work_size[0];
	const size_t n_blocks_y = 1;

	const size_t global_work_size[2] = {n_blocks_x, n_blocks_y};
	
	err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, 
								 theKernel, 2, NULL, 
								 global_work_size, local_work_size, 0, NULL, NULL);
	
    if (err) {
        printf("Error: Failed to execute kernel! %d\n", err);
        exit(1);
    }
}
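The ((n - 1) / block + 1) * block round-up idiom used for the launch size above (and in the pairwise-distance examples below) can be read as a standalone helper; a minimal sketch with a hypothetical name:

#include <cstddef>

// Smallest multiple of block_size that is >= n (assumes n > 0), used to pad
// a global work size up to an exact multiple of the local work size.
inline size_t round_up_to_block(size_t n, size_t block_size)
{
	return ((n - 1) / block_size + 1) * block_size;
}

// e.g. round_up_to_block(1000, 256) == 1024 and round_up_to_block(1024, 256) == 1024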
Example #7
0
void DeviceMatrixCL3D_copyFromDevice(const DeviceMatrixCL3D& self, float* dst)
{
    if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) {
	
		TheContext * tc = new TheContext();

		// region is {row width in bytes, rows per slice, number of slices}.
		size_t buffer_origin[3] = {0, 0, 0};
		size_t host_origin[3] = {0, 0, 0};
		size_t region[3] = {self.dim_x * sizeof(float),
			self.dim_y,
			self.dim_t};

		cl_int err =
			clEnqueueReadBufferRect(
				tc->getMyContext()->cqCommandQueue,
				self.dataMatrix, CL_TRUE,
				buffer_origin, host_origin, region,
				self.pitch_y, 0,                  // buffer row pitch (bytes), slice pitch
				self.dim_x * sizeof(float), 0,    // host row pitch (bytes), slice pitch
				dst,
				0, NULL, NULL);

		if (err != CL_SUCCESS) {
			std::cout << "Error in copyFromDevice (CODE: " << err << ")" << std::endl;
		}
	}
}
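The same round trip for the 3D case (a sketch, assuming makeDeviceMatrixCL3D and DeviceMatrixCL3D_copyToDevice from the examples above, plus <vector> and <cassert>; the 2 x 3 x 4 volume is illustrative):

// Write a small volume through the pitched 3D buffer and read it back.
std::vector<float> src(2 * 3 * 4), back(2 * 3 * 4);
for (size_t i = 0; i < src.size(); ++i) src[i] = float(i);

DeviceMatrixCL3D::Ptr vol = makeDeviceMatrixCL3D(2, 3, 4);   // dim_t, dim_y, dim_x
DeviceMatrixCL3D_copyToDevice(*vol, src.data());
DeviceMatrixCL3D_copyFromDevice(*vol, back.data());

for (size_t i = 0; i < src.size(); ++i)
	assert(src[i] == back[i]);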
Example #8
0
void pwdist_eucCL(const DeviceMatrixCL* features_train,
        const DeviceMatrixCL* features_test,
        DeviceMatrixCL* output) {

    TheContext* tc = new TheContext();

    cl_context GPUContext = tc->getMyContext()->getContextCL();
    cl_device_id cdDevice = tc->getMyContext()->getDeviceCL();

    // Creates the program
    // Uses NVIDIA helper functions to get the code string and its size (in bytes)
  	
	MyKernels *kernels = new MyKernels(GPUContext,cdDevice);
	
	cl_kernel theKernel= kernels->getPairwiseDistanceKernel();
	cl_int err = 0;
	/*
	size_t local;
	err = clGetKernelWorkGroupInfo(theKernel, cdDevice, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
	printf("simultaneous threads num:%d\n", local);
	*/

	// Pitches are passed to this kernel in elements (floats), not bytes.
	int f_pitch = 0;
    err |= clSetKernelArg(theKernel, 0, sizeof (cl_mem), &features_train->dataMatrix);
    err |= clSetKernelArg(theKernel, 1, sizeof (int), &features_train->width);
   // err |= clSetKernelArg(theKernel, 2, sizeof (int), &features_train->height);
	f_pitch = features_train->pitch/sizeof(float);
    err |= clSetKernelArg(theKernel, 2, sizeof (int), &f_pitch);
	
	
    err |= clSetKernelArg(theKernel, 3, sizeof (cl_mem), &features_test->dataMatrix);
  //  err |= clSetKernelArg(theKernel, 5, sizeof (int), &features_test->width);
  //  err |= clSetKernelArg(theKernel, 6, sizeof (int), &features_test->height);
	f_pitch = features_test->pitch/sizeof(float);
    err |= clSetKernelArg(theKernel, 4, sizeof (int), &f_pitch);
	
    err |= clSetKernelArg(theKernel, 5, sizeof (cl_mem), &output->dataMatrix);
 //   err |= clSetKernelArg(theKernel, 9, sizeof (int), &output->width);
 //   err |= clSetKernelArg(theKernel, 10, sizeof (int), &output->height);
	f_pitch = output->pitch/sizeof(float);
    err |= clSetKernelArg(theKernel, 6, sizeof (int), &f_pitch);
	
   // err |= clSetKernelArg(theKernel, 12, sizeof (int), &type);
 //   err |= clSetKernelArg(theKernel, 12, sizeof (int), &BLOCK_SIZE);
//	printf("params: %d %d %d,  %d %d %d, %d %d %d-- %d\n",features_train->width,
//		features_train->height,features_train->pitch, features_test->width, features_test->height,
//		features_test->pitch, output->width, output->height, output->pitch, BLOCK_SIZE);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments 3! %d\n", err);
        exit(1);
    }
	

    // Round each dimension up to a multiple of BLOCK_SIZE so the global work
    // size is an exact multiple of the local work size.
    const size_t n_blocks_x = ((features_train->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE;
    const size_t n_blocks_y = ((features_test->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE;

    const size_t local_work_size[2] = {BLOCK_SIZE, BLOCK_SIZE};
    const size_t global_work_size[2] = {n_blocks_x, n_blocks_y};

  //  std::cout << "Threads: " << local_work_size[0] << ", " << local_work_size[1] << std::endl;
  //  std::cout << "Blocks: " << global_work_size[0] << ", " << global_work_size[1] << std::endl;

    err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue,
            theKernel, 2, NULL,
            global_work_size, local_work_size, 0, NULL, NULL);
    clFinish(tc->getMyContext()->cqCommandQueue); // make sure the kernel completed


    if (err) {
        printf("Error: Failed to execute kernel! %d\n", err);
        exit(1);
    }
}
Example #9
0
void pwdist_genericCL(const DeviceMatrixCL* features_train,
        const DeviceMatrixCL* features_test,
        DeviceMatrixCL* output,
        int type) {

    TheContext* tc = new TheContext();

    cl_context GPUContext = tc->getMyContext()->getContextCL();
    cl_device_id cdDevice = tc->getMyContext()->getDeviceCL();

    // Creates the program
    // Uses NVIDIA helper functions to get the code string and its size (in bytes)
  	
	MyKernels *kernels = new MyKernels(GPUContext,cdDevice);
	
	cl_kernel theKernel= kernels->getPairwiseDistanceKernel();
	cl_int err = 0;
	
    err |= clSetKernelArg(theKernel, 0, sizeof (cl_mem), &features_train->dataMatrix);
    err |= clSetKernelArg(theKernel, 1, sizeof (int), &features_train->width);
    err |= clSetKernelArg(theKernel, 2, sizeof (int), &features_train->height);
    err |= clSetKernelArg(theKernel, 3, sizeof (int), &features_train->pitch);
	
	
    err |= clSetKernelArg(theKernel, 4, sizeof (cl_mem), &features_test->dataMatrix);
    err |= clSetKernelArg(theKernel, 5, sizeof (int), &features_test->width);
    err |= clSetKernelArg(theKernel, 6, sizeof (int), &features_test->height);
    err |= clSetKernelArg(theKernel, 7, sizeof (int), &features_test->pitch);
	
    err |= clSetKernelArg(theKernel, 8, sizeof (cl_mem), &output->dataMatrix);
    err |= clSetKernelArg(theKernel, 9, sizeof (int), &output->width);
    err |= clSetKernelArg(theKernel, 10, sizeof (int), &output->height);
    err |= clSetKernelArg(theKernel, 11, sizeof (int), &output->pitch);
	
    err |= clSetKernelArg(theKernel, 12, sizeof (int), &type);
    err |= clSetKernelArg(theKernel, 13, sizeof (int), &BLOCK_SIZE);
	
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments 3! %d\n", err);
        exit(1);
    }
	

    // Round each dimension up to a multiple of BLOCK_SIZE so the global work
    // size is an exact multiple of the local work size.
    const size_t n_blocks_x = ((features_train->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE;
    const size_t n_blocks_y = ((features_test->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE;

    const size_t local_work_size[2] = {BLOCK_SIZE, BLOCK_SIZE};
    const size_t global_work_size[2] = {n_blocks_x, n_blocks_y};

    //std::cout << "Threads: " << local_work_size[0] << ", " << local_work_size[1] << std::endl;
    //std::cout << "Blocks: " << global_work_size[0] << ", " << global_work_size[1] << std::endl;

    err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, 
            theKernel, 2, NULL, 
            global_work_size, local_work_size, 0, NULL, NULL);

    if (err) {
        printf("Error: Failed to execute kernel! %d\n", err);
        exit(1);
    }
}
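For sanity-checking the pairwise-distance kernels, a naive host-side reference can be useful. This is a sketch under assumptions: row-major float matrices, one row per feature vector, and squared Euclidean distance (whether the kernel takes the square root is not shown in these examples, so adjust to match):

#include <cstddef>
#include <vector>

// dist(i, j) = squared Euclidean distance between row i of train (n_train x dim)
// and row j of test (n_test x dim); output is row-major n_train x n_test.
std::vector<float> pwdist_euc_reference(const std::vector<float>& train, size_t n_train,
                                        const std::vector<float>& test, size_t n_test,
                                        size_t dim)
{
	std::vector<float> out(n_train * n_test, 0.0f);
	for (size_t i = 0; i < n_train; ++i) {
		for (size_t j = 0; j < n_test; ++j) {
			float acc = 0.0f;
			for (size_t k = 0; k < dim; ++k) {
				const float d = train[i * dim + k] - test[j * dim + k];
				acc += d * d;
			}
			out[i * n_test + j] = acc;
		}
	}
	return out;
}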
Example #10
0
void cell_histogram_dense_device_cl(DeviceMatrixCL3D* histogram,
                                 const DeviceMatrixCL* assignments,
                                 const DeviceMatrixCL* weights,
                                 const int max_bin,
                                 const int cell_size,
                                 const int start_y,
                                 const int start_x)
{
  histogram->zero();
	
  const size_t local_work_size[2] = {static_cast<size_t>(cell_size),
                                     static_cast<size_t>(cell_size)};

  // One work-group per histogram cell over the (dim_t, dim_y) grid; the global
  // work size must be a multiple of the local work size.
  const size_t grid_ry = histogram->dim_t;
  const size_t grid_cx = histogram->dim_y;

  const size_t global_work_size[2] = {grid_ry * local_work_size[0],
                                      grid_cx * local_work_size[1]};

  assert(histogram->dim_x == max_bin);
	
	
	TheContext* tc = new TheContext();
	cl_context GPUContext = tc->getMyContext()->getContextCL();
	cl_device_id cdDevice = tc->getMyContext()->getDeviceCL();
	
	MyKernels *kernels = new MyKernels(GPUContext,cdDevice);
	
	cl_kernel theKernel= kernels->getCellHistogramKernel1();
	
	cl_int err = 0;
	
    err =  parameters_histogram_dense(theKernel, histogram, assignments,
									  weights,max_bin,cell_size,start_y,
									  start_x);	
  	
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments 3! %d\n", err);
        exit(1);
    }
	
	
	err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, 
								 theKernel, 2, NULL, 
								 global_work_size, local_work_size, 0, NULL, NULL);
	
    if (err) {
        printf("Error: Failed to execute kernel! %d\n", err);
        exit(1);
    }
/*	
    dim3 dimBlock(cell_size, cell_size);
    
    int grid_ry = histogram->dim_t;
    int grid_cx = histogram->dim_y;
    
    dim3 dimGrid(grid_cx, grid_ry);
	
    assert(histogram->dim_x == max_bin);
	
    cellHistogramKernel<<<dimGrid, dimBlock>>>(
											   *histogram, *assignments, *weights,
											   start_y, start_x,
											   max_bin);
  */
}
Example #11
0
int main(int argc, char* argv[])
{
	device_use = 0;
	if (argc > 1)
		device_use = atoi(argv[1]);

	static const char* exampleImagePath = "..\\..\\..\\media\\kewell1.jpg";

	//create a random filterbank
	const int num_filters = 256;

	//number of pipeline passes
	const int num_iters = 125;

	const int filter_dim = 3;

	FilterBank fb(filter_dim, num_filters);
	fb.set_on_device();

	Classifier clf(128, 64, 8, 2, num_filters);

	// load the image in grayscale on the host
	cv::Mat exampleImage = cv::imread(exampleImagePath, 0);
	// convert to float
	exampleImage.convertTo(exampleImage, CV_32FC1);

	// resize hook (currently a no-op: the target size equals the source size)
	cv::resize(exampleImage, exampleImage, cv::Size(exampleImage.cols, exampleImage.rows));

	if(device_use==0)
		std::cout << "running on CPU" <<std::endl;
	else
		std::cout << "running on GPU" <<std::endl;
	std::cout << "Image dimensions:" << exampleImage.size().height <<" "<< exampleImage.size().width <<std::endl;
	
	// pull the raw float data and upload it to the device
	float* f_imData = (float*) exampleImage.data;
	DeviceMatrixCL::Ptr dmpCL = makeDeviceMatrixCL(exampleImage.size().height, exampleImage.size().width);
	DeviceMatrixCL_copyToDevice(*dmpCL, f_imData);


/*	for(int i=0; i<20; i++)
	{

	DeviceMatrixCL3D::Ptr ff_im = fb.apply_cl(dmpCL);
//	tic1= omp_get_wtime();

	DeviceMatrixCL::Ptr block_histogram = cell_histogram_dense_cl(
		ff_im, num_filters, 8, 0, 0, 
		exampleImage.size().height, exampleImage.size().width);
//	tic2= omp_get_wtime();

	DeviceMatrixCL::Ptr result = clf.apply(block_histogram);
	}
	*/
	double tic0, tic1, tic2, tic3;
	double tim1 = 0.0;
	double tim2 = 0.0;
	double tim3 = 0.0;

	for (int i = 0; i < num_iters; i++)
	{
		tic0 = omp_get_wtime();
		DeviceMatrixCL3D::Ptr ff_im = fb.apply_cl(dmpCL);
		tic1 = omp_get_wtime();
		tim1 += tic1 - tic0;

		DeviceMatrixCL::Ptr block_histogram = cell_histogram_dense_cl(
			ff_im, num_filters, 8, 0, 0,
			exampleImage.size().height, exampleImage.size().width);
		tic2 = omp_get_wtime();
		tim2 += tic2 - tic1;

		DeviceMatrixCL::Ptr result = clf.apply(block_histogram);

		// drain the command queue so the classifier timing includes kernel completion
		TheContext* tc = new TheContext();
		clFinish(tc->getMyContext()->cqCommandQueue);
		tic3 = omp_get_wtime();
		tim3 += tic3 - tic2;
	}
	
	std::cout << "full pipeline time: " << tim1 + tim2 + tim3 << std::endl;
	std::cout << "filter pipeline time: " << tim1 << std::endl;
	std::cout << "histogram pipeline time: " << tim2 << std::endl;
	std::cout << "classifier pipeline time: " << tim3 << std::endl;

	return 0;
}