/*
 * Thin wrapper around convolve_c(): every argument is passed through
 * unchanged and in the same order, followed by the constant 8 as the final
 * argument.
 *
 * NOTE(review): the trailing 8 is presumably the filter tap count -- confirm
 * against the definition of convolve_c(), which is not visible here.
 */
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h)
{
  convolve_c(src, src_stride,
             dst, dst_stride,
             filter_x, x_step_q4,
             filter_y, y_step_q4,
             w, h,
             8);
}
int main(int argc, char * argv[]) { unsigned int nxl, nyl; PV::Timer timer; int status = 0; int argid = 0; int query = 1; int device = DEVICE; if (argc > 1) { device = atoi(argv[1]); } PV::CLDevice * cld = new PV::CLDevice(device); // query and print information about the devices found // if (query) cld->query_device_info(); PV::CLKernel * kernel; if (device == 1) { printf("running on gpu, I hope\n"); nxl = NXL; nyl = NYL; kernel = cld->createKernel("convolve.cl", "convolve"); } else { nxl = 1; nyl = 1; kernel = cld->createKernel("convolve_cpu.cl", "convolve_cpu"); } size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation //cl_mem input; // device memory used for the input array //cl_mem output; // device memory used for the output array PV::CLBuffer * input; // device memory used for the input array PV::CLBuffer * output; // device memory used for the output array //const unsigned int size_ex = SIZE_EX; //const unsigned int size_img = SIZE_IMG; size_t size_ex = SIZE_EX * sizeof(float); size_t size_img = SIZE_IMG * sizeof(float); const unsigned int nxGlobal = NXGLOBAL; const unsigned int nyGlobal = NYGLOBAL; const unsigned int nPad = NPAD; const unsigned int nPad2 = NPAD2; const unsigned int sx = 1; const unsigned int sy = nxGlobal + nPad2; // float * data = (float *) malloc(size_ex * sizeof(float)); // original data set given to device // float * results_d = (float *) malloc(size_img * sizeof(float)); // results returned from device // float * results_l = (float *) malloc(size_img * sizeof(float)); // results returned from local thread // unsigned char * activity = (unsigned char *) malloc(size_ex * sizeof(unsigned char)); float * data = (float *) malloc(size_ex); // original data set given to device float * results_d = (float *) malloc(size_img); // results returned from device float * results_l = (float *) malloc(size_img); // results returned from local thread //unsigned char * activity = (unsigned char *) 
malloc(size_ex * sizeof(unsigned char)); assert(data != NULL); assert(results_d != NULL); assert(results_l != NULL); //assert(activity != NULL); bzero(data, size_ex); bzero(results_d, size_img); bzero(results_l, size_img); // bzero(data, size_ex*sizeof(float)); // bzero(results_d, size_img*sizeof(float)); // bzero(results_l, size_img*sizeof(float)); //bzero(activity, size_ex*sizeof(unsigned char)); size_t local_size_ex = (nxl + nPad2) * (nyl + nPad2) * sizeof(float); // padded image patch //init_random_data(data, nxGlobal, nyGlobal, nPad); init_test_data(data, nxGlobal, nyGlobal, nPad); // time running kernel locally // timer.start(); convolve_c(data, results_l, nxGlobal, nyGlobal, nPad); timer.stop(); printf("Executing on local: "); timer.elapsed_time(); #ifdef USE_ACTIVITY_BYTES input = cld->addReadBuffer (argid++, activity, size_ex*sizeof(unsigned char)); #else //input = cld->addReadBuffer (argid++, data, size_ex*sizeof(float)); input = cld->createReadBuffer (size_ex, data); input->copyToDevice(); status |= kernel->setKernelArg(argid++, input); #endif //output = cld->addWriteBuffer(argid++, size_img*sizeof(float)); output = cld->createWriteBuffer(size_img, results_d); status |= kernel->setKernelArg(argid++, output); // status = cld->addKernelArg (argid++, nxGlobal); // status = cld->addKernelArg (argid++, nyGlobal); // status = cld->addKernelArg (argid++, nPad); // status = cld->addLocalArg (argid++, local_size_ex); status |= kernel->setKernelArg (argid++, (int)nxGlobal); status |= kernel->setKernelArg (argid++, (int)nyGlobal); status |= kernel->setKernelArg (argid++, (int)nPad); status |= kernel->setLocalArg (argid++, local_size_ex); timer.start(); #ifdef USE_ACTIVITY_BYTES cld->run(nxGlobal/4, nyGlobal, nxl, nyl); #else //cld->run(nxGlobal, nyGlobal, nxl, nyl); printf("starting run...\n"); kernel->run((size_t)nxGlobal, (size_t)nyGlobal, nxl, nyl); #endif timer.stop(); printf("Executing on device: "); timer.elapsed_time(); printf("Elapsed time on device: device 
time == %f \n", ((float)kernel->get_execution_time())/1.0e6); //cld->copyResultsBuffer(output, results_d, size_img*sizeof(float)); output->copyFromDevice(); // Check results for accuracy // check_results(results_d, results_l, nxGlobal, nyGlobal, nPad); //validate_results(results_d, results_l, nxGlobal, nyGlobal, nPad); // Shutdown and cleanup // //clReleaseMemObject(input); //clReleaseMemObject(output); delete input; delete output; delete cld; printf("Finished...\n"); return status; }