// Forwards all arguments unchanged to convolve_c(), appending a constant 8
// as the final argument.
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  // NOTE(review): presumably the filter tap count -- confirm against
  // convolve_c's declaration.
  const int trailing_arg = 8;

  convolve_c(src, src_stride,
             dst, dst_stride,
             filter_x, x_step_q4,
             filter_y, y_step_q4,
             w, h,
             trailing_arg);
}
// Example no. 2
// 0
/**
 * Test driver: runs convolve_c() on the host, runs the "convolve" /
 * "convolve_cpu" kernel through the PV::CLDevice / PV::CLKernel wrappers,
 * and hands both result buffers to check_results().
 *
 * @param argc  number of command-line arguments
 * @param argv  argv[1], when present, is parsed with atoi() and used as the
 *              device id (otherwise DEVICE is used)
 * @return the OR-ed status of the kernel-argument setup calls, or
 *         EXIT_FAILURE when a host buffer could not be allocated
 */
int main(int argc, char * argv[])
{
   PV::Timer timer;

   int status = 0;
   int argid  = 0;
   const int query = 1;

   int device = DEVICE;
   if (argc > 1) {
      device = atoi(argv[1]);
   }

   PV::CLDevice * cld = new PV::CLDevice(device);

   // query and print information about the devices found
   //
   if (query) cld->query_device_info();

   // Local work-group extents and the kernel to run; device 1 uses the
   // NXL x NYL kernel, every other device id uses the 1 x 1 "cpu" kernel.
   unsigned int nxl;
   unsigned int nyl;
   PV::CLKernel * kernel;

   if (device == 1) {
      printf("running on gpu, I hope\n");
      nxl = NXL;
      nyl = NYL;
      kernel = cld->createKernel("convolve.cl", "convolve");
   }
   else {
      nxl = 1;
      nyl = 1;
      kernel = cld->createKernel("convolve_cpu.cl", "convolve_cpu");
   }

   PV::CLBuffer * input;    // device memory used for the input array
   PV::CLBuffer * output;   // device memory used for the output array

   // Buffer sizes in bytes
   const size_t size_ex  = SIZE_EX * sizeof(float);
   const size_t size_img = SIZE_IMG * sizeof(float);

   const unsigned int nxGlobal = NXGLOBAL;
   const unsigned int nyGlobal = NYGLOBAL;

   const unsigned int nPad  = NPAD;
   const unsigned int nPad2 = NPAD2;

   // Host buffers. calloc zero-fills them, so no separate clearing pass is
   // needed.
   float * data      = static_cast<float *>(calloc(1, size_ex));   // original data set given to device
   float * results_d = static_cast<float *>(calloc(1, size_img));  // results returned from device
   float * results_l = static_cast<float *>(calloc(1, size_img));  // results returned from local thread

   // Checked explicitly (rather than with assert) so the guard survives
   // NDEBUG builds. free(NULL) is a no-op, so all three can be released
   // unconditionally.
   if (data == NULL || results_d == NULL || results_l == NULL) {
      fprintf(stderr, "unable to allocate host buffers\n");
      free(data);
      free(results_d);
      free(results_l);
      delete cld;
      return EXIT_FAILURE;
   }

   // bytes passed to setLocalArg: one padded image patch per work-group
   const size_t local_size_ex = (nxl + nPad2) * (nyl + nPad2) * sizeof(float);

   //init_random_data(data, nxGlobal, nyGlobal, nPad);
   init_test_data(data, nxGlobal, nyGlobal, nPad);

   // time running kernel locally
   //
   timer.start();
   convolve_c(data, results_l, nxGlobal, nyGlobal, nPad);
   timer.stop();
   printf("Executing on local:  "); timer.elapsed_time();

   // NOTE(review): the USE_ACTIVITY_BYTES branches below reference
   // `activity`, which is not declared in this function, so they will not
   // compile as written if the macro is defined.
#ifdef USE_ACTIVITY_BYTES
   input  = cld->addReadBuffer (argid++, activity, size_ex*sizeof(unsigned char));
#else
   input  = cld->createReadBuffer (size_ex, data);
   input->copyToDevice();
   status |= kernel->setKernelArg(argid++, input);
#endif
   output = cld->createWriteBuffer(size_img, results_d);
   status |= kernel->setKernelArg(argid++, output);
   status |= kernel->setKernelArg(argid++, static_cast<int>(nxGlobal));
   status |= kernel->setKernelArg(argid++, static_cast<int>(nyGlobal));
   status |= kernel->setKernelArg(argid++, static_cast<int>(nPad));
   status |= kernel->setLocalArg (argid++, local_size_ex);

   if (status != 0) {
      // Do not launch a kernel whose arguments were not all accepted; the
      // non-zero status is still returned to the caller below.
      fprintf(stderr, "failed to set kernel arguments (status == %d)\n", status);
   }
   else {
      timer.start();
#ifdef USE_ACTIVITY_BYTES
      cld->run(nxGlobal/4, nyGlobal, nxl, nyl);
#else
      printf("starting run...\n");
      kernel->run((size_t)nxGlobal, (size_t)nyGlobal, nxl, nyl);
#endif
      timer.stop();
      printf("Executing on device: "); timer.elapsed_time();
      printf("Elapsed time on device:            device time == %f \n", static_cast<float>(kernel->get_execution_time())/1.0e6);

      output->copyFromDevice();

      // Check results for accuracy
      //
      check_results(results_d, results_l, nxGlobal, nyGlobal, nPad);
      //validate_results(results_d, results_l, nxGlobal, nyGlobal, nPad);
   }

   // Shutdown and cleanup
   //
   // NOTE(review): `kernel` is not deleted here, matching the original code;
   // confirm whether PV::CLDevice owns the kernels it creates.
   delete input;
   delete output;
   delete cld;

   // Host buffers are released only after the device buffers that were
   // created with pointers to them have been destroyed.
   free(data);
   free(results_d);
   free(results_l);

   printf("Finished...\n");

   return status;
}