/** * This function is useful for generating matrices for use with CUFFT. */ DeviceMatrixCL3D::Ptr makeDeviceMatrixCL3DPacked(size_t dim_t, size_t dim_y, size_t dim_x) { DeviceMatrixCL3D* mat = new DeviceMatrixCL3D(); mat->dim_x = dim_x; mat->dim_y = dim_y; mat->dim_t = dim_t; size_t pitch; TheContext * tc = new TheContext(); cl_context GPUContext = tc->getMyContext()->getContextCL(); cl_device_id cdDevice = tc->getMyContext()->getDeviceCL(); mat->pitch_y = dim_x; mat->pitch_t = dim_y*mat->pitch_y; const int mem_size = mat->dim_t * mat->pitch_t; int err; mat->dataMatrix = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, mem_size, NULL, &err); if(err!=0) { printf("Error Code create buffer: %d\n",err); } return DeviceMatrixCL3D::Ptr(mat, deleteDeviceMatrixCL3D); }
void cell_histogram_dense_device_cl(DeviceMatrixCL3D* histogram, const DeviceMatrixCL* assignments, const DeviceMatrixCL* weights, const int max_bin, const int cell_size, const int start_y, const int start_x) { histogram->zero(); const size_t local_work_size[2] = {BLOCK_SIZE, BLOCK_SIZE}; int grid_ry = (histogram->dim_t + 1) / 2; int grid_cx = (histogram->dim_y + 1) / 2; const int n_blocks_x = grid_ry* local_work_size[0]; const int n_blocks_y = grid_cx* local_work_size[1]; const size_t global_work_size[2] = {n_blocks_x, n_blocks_y}; assert(histogram->dim_x == max_bin); TheContext* tc = new TheContext(); cl_context GPUContext = tc->getMyContext()->getContextCL(); cl_device_id cdDevice = tc->getMyContext()->getDeviceCL(); MyKernels *kernels = new MyKernels(GPUContext,cdDevice); cl_kernel theKernel= kernels->getCellHistogramKernel2(); cl_int err; err=0; err = parameters_histogram_dense(theKernel, histogram, assignments, weights,max_bin,cell_size,start_y, start_x); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments 3! %d\n", err); exit(1); } err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, theKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); clFinish(tc->getMyContext()->cqCommandQueue);// to make sure the kernel completed if (err) { printf("Error: Failed to execute kernel! %d\n", err); exit(1); } }
DeviceMatrixCL3D::Ptr makeDeviceMatrixCL3D(size_t dim_t, size_t dim_y, size_t dim_x){ DeviceMatrixCL3D* mat = new DeviceMatrixCL3D(); mat->dim_x = dim_x; mat->dim_y = dim_y; mat->dim_t = dim_t; //printf("%d x %d x %d\n",dim_x,dim_y,dim_t); size_t pitch; TheContext * tc = new TheContext(); cl_context GPUContext = tc->getMyContext()->getContextCL(); cl_device_id cdDevice = tc->getMyContext()->getDeviceCL(); /*The optimal pitch is computed by (1) getting the base address alignment preference for your card (CL_DEVICE_MEM_BASE_ADDR_ALIGN property with clGetDeviceInfo: note that the returned value is in bits, so you have to divide by 8 to get it in bytes);*/ int buffer; cl_int ierr = clGetDeviceInfo(cdDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN , sizeof(buffer), &buffer, NULL); buffer /= 8; int naturalPitch = sizeof(float) * mat->dim_x; /*let's call this base (2) find the largest multiple of base that is no less than your natural data pitch (sizeof(type) times number of columns);*/ int devicepitch = ceil(float(naturalPitch)/buffer) * buffer; //printf("Pitch: %d, DevicePitch: %d, Buffer: %d\n", naturalPitch, devicepitch, buffer); mat->pitch_y = naturalPitch;//devicepitch; mat->pitch_t = dim_y*mat->pitch_y; //You then allocate pitch times number of rows bytes, and pass the pitch information to kernels. const int mem_size = mat->dim_t*mat->pitch_t; //std::cout << "Mem size: " << mem_size << std::endl; int err; mat->dataMatrix = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, mem_size, NULL, &err); if(err!=0) { printf("Error Code create buffer: %d\n",err); } return DeviceMatrixCL3D::Ptr(mat, deleteDeviceMatrixCL3D); }
void DeviceMatrixCL_copyFromDevice(const DeviceMatrixCL& self, float* dst) { if ((self.width > 0) && (self.height > 0)) { const int mem_size = self.height * self.pitch; TheContext * tc = new TheContext(); size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {self.width * sizeof(float), self.height, 1}; cl_int err = clEnqueueReadBufferRect( tc->getMyContext()->cqCommandQueue, self.dataMatrix, CL_TRUE, buffer_origin, host_origin, region, self.pitch, 0, self.width * sizeof(float), 0, dst, 0, NULL, NULL); if (err != 0){ std::cout << "Error in copyFromDevice (CODE: " << err << ")" << std::endl; } } }
void DeviceMatrixCL3D_copyToDevice(DeviceMatrixCL3D& self, const float* data) { if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) { const int mem_size = self.dim_y *self.dim_t * self.pitch_y; TheContext * tc = new TheContext(); size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = { self.dim_x * sizeof(float), self.dim_y, self.dim_t}; int err = clEnqueueWriteBufferRect( tc->getMyContext()->cqCommandQueue, self.dataMatrix, CL_TRUE, buffer_origin, host_origin, region, self.pitch_y, 0, sizeof(float) * self.dim_x, 0, data, 0, NULL, NULL); if (err != 0){ std::cout << "Error in copyToDevice (CODE: " << err << ")" << std::endl; } } }
void min_cl_local(const DeviceMatrixCL* matrix, DeviceMatrixCL* output) { TheContext* tc = new TheContext(); cl_context GPUContext = tc->getMyContext()->getContextCL(); cl_device_id cdDevice = tc->getMyContext()->getDeviceCL(); // Creates the program // Uses NVIDIA helper functions to get the code string and it's size (in bytes) MyKernels *kernels = new MyKernels(GPUContext,cdDevice); cl_kernel theKernel= kernels->getMinKernel(); cl_int err; err=0; err = parameters_minmax_local(theKernel, matrix, output); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments 3! %d\n", err); exit(1); } const size_t local_work_size[2] = {256, 1}; const int n_blocks_x = ((matrix->height-1) / local_work_size[0] + 1)* local_work_size[0]; const int n_blocks_y = 1; const size_t global_work_size[2] = {n_blocks_x, n_blocks_y}; err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, theKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); exit(1); } }
void DeviceMatrixCL3D_copyFromDevice(const DeviceMatrixCL3D& self, float* dst) { if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) { const int mem_size = self.dim_y *self.dim_t * self.pitch_y; TheContext * tc = new TheContext(); // printf("%d x %d\n",self.pitch_y,self.pitch_t); //printf("--->%d x %d x %d\n",self.dim_x,self.dim_y,self.dim_t); size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {self.dim_x * sizeof(float), self.dim_y, self.dim_t}; float prueba[5][2][3]; //PyArray_DATA(retval.ptr()); cl_int err = clEnqueueReadBufferRect( tc->getMyContext()->cqCommandQueue, self.dataMatrix, CL_TRUE, buffer_origin, host_origin, region, //self.pitch_y, self.dim_x * self.dim_y * sizeof(float), //self.pitch_y, 0, self.pitch_y, 0, self.dim_x * sizeof(float), 0, dst, 0, NULL, NULL); //std::cout<<prueba[2][2][2]<<" "<<prueba[0][0][2]<<endl; if (err != 0){ std::cout << "Error in copyFromDevice (CODE: " << err << ")" << std::endl; } } }
void pwdist_eucCL(const DeviceMatrixCL* features_train, const DeviceMatrixCL* features_test, DeviceMatrixCL* output) { TheContext* tc = new TheContext(); cl_context GPUContext = tc->getMyContext()->getContextCL(); cl_device_id cdDevice = tc->getMyContext()->getDeviceCL(); // Creates the program // Uses NVIDIA helper functions to get the code string and it's size (in bytes) MyKernels *kernels = new MyKernels(GPUContext,cdDevice); cl_kernel theKernel= kernels->getPairwiseDistanceKernel(); cl_int err; err=0; /* size_t local; err = clGetKernelWorkGroupInfo(theKernel, cdDevice, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } printf("simultaneous threads num:%d\n", local); */ int f_pitch=0; err |= clSetKernelArg(theKernel, 0, sizeof (cl_mem), &features_train->dataMatrix); err |= clSetKernelArg(theKernel, 1, sizeof (int), &features_train->width); // err |= clSetKernelArg(theKernel, 2, sizeof (int), &features_train->height); f_pitch = features_train->pitch/sizeof(float); err |= clSetKernelArg(theKernel, 2, sizeof (int), &f_pitch); err |= clSetKernelArg(theKernel, 3, sizeof (cl_mem), &features_test->dataMatrix); // err |= clSetKernelArg(theKernel, 5, sizeof (int), &features_test->width); // err |= clSetKernelArg(theKernel, 6, sizeof (int), &features_test->height); f_pitch = features_test->pitch/sizeof(float); err |= clSetKernelArg(theKernel, 4, sizeof (int), &f_pitch); err |= clSetKernelArg(theKernel, 5, sizeof (cl_mem), &output->dataMatrix); // err |= clSetKernelArg(theKernel, 9, sizeof (int), &output->width); // err |= clSetKernelArg(theKernel, 10, sizeof (int), &output->height); f_pitch = output->pitch/sizeof(float); err |= clSetKernelArg(theKernel, 6, sizeof (int), &f_pitch); // err |= clSetKernelArg(theKernel, 12, sizeof (int), &type); // err |= clSetKernelArg(theKernel, 12, sizeof (int), &BLOCK_SIZE); // printf("params: %d %d %d, %d %d %d, %d %d %d-- %d\n",features_train->width, // features_train->height,features_train->pitch, features_test->width, features_test->height, // features_test->pitch, output->width, output->height, output->pitch, BLOCK_SIZE); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments 3! %d\n", err); exit(1); } const int n_blocks_x = ((features_train->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE; const int n_blocks_y = ((features_test->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE; const size_t local_work_size[2] = {BLOCK_SIZE, BLOCK_SIZE}; const size_t global_work_size[2] = {n_blocks_x, n_blocks_y}; // std::cout << "Threads: " << local_work_size[0] << ", " << local_work_size[1] << std::endl; // std::cout << "Blocks: " << global_work_size[0] << ", " << global_work_size[1] << std::endl; // double tic = omp_get_wtime(); // for(int i=0; i<6000; i++) { err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, theKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); clFinish(tc->getMyContext()->cqCommandQueue);// to make sure the kernel completed } // double toc = omp_get_wtime(); // std::cout << "OpenCL time: " << toc - tic << std::endl; if (err) { printf("Error: Failed to execute kernel! %d\n", err); exit(1); } }
void pwdist_genericCL(const DeviceMatrixCL* features_train, const DeviceMatrixCL* features_test, DeviceMatrixCL* output, int type) { TheContext* tc = new TheContext(); cl_context GPUContext = tc->getMyContext()->getContextCL(); cl_device_id cdDevice = tc->getMyContext()->getDeviceCL(); // Creates the program // Uses NVIDIA helper functions to get the code string and it's size (in bytes) MyKernels *kernels = new MyKernels(GPUContext,cdDevice); cl_kernel theKernel= kernels->getPairwiseDistanceKernel(); cl_int err; err=0; err |= clSetKernelArg(theKernel, 0, sizeof (cl_mem), &features_train->dataMatrix); err |= clSetKernelArg(theKernel, 1, sizeof (int), &features_train->width); err |= clSetKernelArg(theKernel, 2, sizeof (int), &features_train->height); err |= clSetKernelArg(theKernel, 3, sizeof (int), &features_train->pitch); err |= clSetKernelArg(theKernel, 4, sizeof (cl_mem), &features_test->dataMatrix); err |= clSetKernelArg(theKernel, 5, sizeof (int), &features_test->width); err |= clSetKernelArg(theKernel, 6, sizeof (int), &features_test->height); err |= clSetKernelArg(theKernel, 7, sizeof (int), &features_test->pitch); err |= clSetKernelArg(theKernel, 8, sizeof (cl_mem), &output->dataMatrix); err |= clSetKernelArg(theKernel, 9, sizeof (int), &output->width); err |= clSetKernelArg(theKernel, 10, sizeof (int), &output->height); err |= clSetKernelArg(theKernel, 11, sizeof (int), &output->pitch); err |= clSetKernelArg(theKernel, 12, sizeof (int), &type); err |= clSetKernelArg(theKernel, 13, sizeof (int), &BLOCK_SIZE); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments 3! %d\n", err); exit(1); } const int n_blocks_x = ((features_train->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE; const int n_blocks_y = ((features_test->height - 1) / BLOCK_SIZE + 1) * BLOCK_SIZE; const size_t local_work_size[2] = {BLOCK_SIZE, BLOCK_SIZE}; const size_t global_work_size[2] = {n_blocks_x, n_blocks_y}; //std::cout << "Threads: " << local_work_size[0] << ", " << local_work_size[1] << std::endl; //std::cout << "Blocks: " << global_work_size[0] << ", " << global_work_size[1] << std::endl; err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, theKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); exit(1); } }
void cell_histogram_dense_device_cl(DeviceMatrixCL3D* histogram, const DeviceMatrixCL* assignments, const DeviceMatrixCL* weights, const int max_bin, const int cell_size, const int start_y, const int start_x) { histogram->zero(); const size_t local_work_size[2] = {cell_size, cell_size}; int grid_ry = histogram->dim_t; int grid_cx = histogram->dim_y; const int n_blocks_x = grid_ry* local_work_size[0]; const int n_blocks_y = grid_cx* local_work_size[1]; const size_t global_work_size[2] = {n_blocks_x, n_blocks_y}; assert(histogram->dim_x == max_bin); TheContext* tc = new TheContext(); cl_context GPUContext = tc->getMyContext()->getContextCL(); cl_device_id cdDevice = tc->getMyContext()->getDeviceCL(); MyKernels *kernels = new MyKernels(GPUContext,cdDevice); cl_kernel theKernel= kernels->getCellHistogramKernel1(); cl_int err; err=0; err = parameters_histogram_dense(theKernel, histogram, assignments, weights,max_bin,cell_size,start_y, start_x); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments 3! %d\n", err); exit(1); } err = clEnqueueNDRangeKernel(tc->getMyContext()->cqCommandQueue, theKernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); exit(1); } /* dim3 dimBlock(cell_size, cell_size); int grid_ry = histogram->dim_t; int grid_cx = histogram->dim_y; dim3 dimGrid(grid_cx, grid_ry); assert(histogram->dim_x == max_bin); cellHistogramKernel<<<dimGrid, dimBlock>>>( *histogram, *assignments, *weights, start_y, start_x, max_bin); */ }
int main(int argc, char* argv[]) { device_use = 0; if(argc>1) device_use = atoi(argv[1]); static char* exampleImagePath = "..\\..\\..\\media\\kewell1.jpg"; //create a random filterbank const int num_filters = 256; //number of pipeline passes const int num_iters = 125; const int filter_dim = 3; FilterBank fb(filter_dim, num_filters); fb.set_on_device(); Classifier clf(128, 64, 8, 2, num_filters); //load the image on device cv::Mat exampleImage = cv::imread(exampleImagePath, 0); //convert to float exampleImage.convertTo(exampleImage, CV_32FC1); cv::resize(exampleImage, exampleImage, cv::Size(exampleImage.cols, exampleImage.rows)); if(device_use==0) std::cout << "running on CPU" <<std::endl; else std::cout << "running on GPU" <<std::endl; std::cout << "Image dimensions:" << exampleImage.size().height <<" "<< exampleImage.size().width <<std::endl; //pull the data float* f_imData = (float*) exampleImage.data; DeviceMatrixCL::Ptr dmpCL = makeDeviceMatrixCL(exampleImage.size().height, exampleImage.size().width); DeviceMatrixCL_copyToDevice(*dmpCL, f_imData); /* for(int i=0; i<20; i++) { DeviceMatrixCL3D::Ptr ff_im = fb.apply_cl(dmpCL); // tic1= omp_get_wtime(); DeviceMatrixCL::Ptr block_histogram = cell_histogram_dense_cl( ff_im, num_filters, 8, 0, 0, exampleImage.size().height, exampleImage.size().width); // tic2= omp_get_wtime(); DeviceMatrixCL::Ptr result = clf.apply(block_histogram); } */ double tic0, tic1, tic2, tic3; double tim1 = 0.0; double tim2 = 0.0; double tim3 = 0.0; for(int i=0; i<num_iters; i++) { tic0= omp_get_wtime(); DeviceMatrixCL3D::Ptr ff_im = fb.apply_cl(dmpCL); tic1= omp_get_wtime(); tim1 += tic1 - tic0; DeviceMatrixCL::Ptr block_histogram = cell_histogram_dense_cl( ff_im, num_filters, 8, 0, 0, exampleImage.size().height, exampleImage.size().width); tic2= omp_get_wtime(); tim2 += tic2 - tic1; DeviceMatrixCL::Ptr result = clf.apply(block_histogram); TheContext* tc = new TheContext(); clFinish(tc->getMyContext()->cqCommandQueue); tic3 = omp_get_wtime(); tim3 += tic3 - tic2; } std::cout << "full pipeline time: " << tim1 + tim2 + tim3 << std::endl; std::cout << "filter pipeline time: " << tim1 << std::endl; std::cout << "histogram pipeline time: " << tim2 << std::endl; std::cout << "classifier pipeline time: " << tim3 << std::endl; return 0; }