DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), AbsoluteDifferenceS16Dataset),
               shape, data_type0, data_type1, output_data_type)
{
    // Create tensors
    CLTensor ref_src1 = create_tensor<CLTensor>(shape, data_type0);
    CLTensor ref_src2 = create_tensor<CLTensor>(shape, data_type1);
    CLTensor dst      = create_tensor<CLTensor>(shape, output_data_type);

    // Create and Configure function
    CLAbsoluteDifference abs_diff;
    abs_diff.configure(&ref_src1, &ref_src2, &dst);

    // Validate valid region
    const ValidRegion valid_region = shape_to_valid_region(shape);
    validate(dst.info()->valid_region(), valid_region);

    // Validate padding
    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
    validate(ref_src1.info()->padding(), padding);
    validate(ref_src2.info()->padding(), padding);
    validate(dst.info()->padding(), padding);
}
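
The test above only exercises the configure() step of the CLAbsoluteDifference function: it checks the valid region the function reports and the padding implied by processing 16 elements per iteration, but it never launches the kernel. A minimal sketch of the follow-up steps inside the same test body (an assumption, not part of the original test) would allocate the tensors and run the function:

    // Allocate backing CL buffers; this must happen after configure(), so the
    // padding the function requested is taken into account.
    ref_src1.allocator()->allocate();
    ref_src2.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill ref_src1 / ref_src2 (e.g. via map()/unmap() and an Iterator) ...

    // Execute on the OpenCL device and wait for the queue to drain.
    abs_diff.run();
    CLScheduler::get().sync();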
Example #2

The second example is a stand-alone driver: it fills a CLTensor with synthetic data, derives the stride and offset arguments the kernel expects from the tensor's Window, and uses CLTune to sweep the GROUP_SIZE parameter of the softmax_layer_max OpenCL kernel.
// Headers assumed for the ACL and CLTune calls used below.
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "cltune.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

using namespace arm_compute;

int main(int argc, char *argv[])
{
  // Data init (to be replaced with PPM images); defaults can be overridden via CK_* env vars.
  unsigned int width  = 16;
  unsigned int height = 16;
  unsigned int bsize  = 2;
  unsigned int seed   = 12;

  if (getenv("CK_WIDTH")  != NULL) width  = atol(getenv("CK_WIDTH"));
  if (getenv("CK_HEIGHT") != NULL) height = atol(getenv("CK_HEIGHT"));
  if (getenv("CK_BSIZE")  != NULL) bsize  = atol(getenv("CK_BSIZE"));
  if (getenv("CK_SEED")   != NULL) seed   = atol(getenv("CK_SEED"));
  std::cout << "WIDTH, HEIGHT, BSIZE " << width << " " << height << " " << bsize << "\n";
  std::cout << "CK_SEED " << seed << "\n";
  srand(seed);
  

  auto *src_data = new float[width * height * bsize];
  auto *dst_data = new float[width * height * bsize];
  for(unsigned int b = 0; b < bsize; b++){
    for(unsigned int h = 0; h < height; h++){
      for(unsigned int w = 0; w < width; w++){
        // float r = static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / MYMAX));
        src_data[b * (width * height) + h * width + w] = static_cast<float>(100 * b + 10 * h + w); // replace with random values from the seed above
        dst_data[b * (width * height) + h * width + w] = 0;
      }
    }
  }
  //tuner.AddArgumentInput(src_data);
  //tuner.AddArgumentInput(dst_data);
  //tuner.AddArgumentScalar(static_cast<int>(width));
  //tuner.SetNumRuns(2);
  //tuner.Tune();
  //tuner.PrintToScreen();
// OpenCL init
  CLScheduler::get().default_init();

  CLTensor ATensor; // use arm_compute::Tensor for the NEON back end
  CLTensor OTensor;

  TensorShape shape(width, height, bsize);
  ATensor.allocator()->init(TensorInfo(shape,  Format::F32));
  OTensor.allocator()->init(TensorInfo(shape,  Format::F32));
  
  // Fill the tensors: the easiest way is to map them and walk a Window with an Iterator.
  Window input_window, output_window;
  input_window.use_tensor_dimensions(ATensor.info());
  output_window.use_tensor_dimensions(OTensor.info());
  //Data in/out
  ATensor.allocator()->allocate();
  OTensor.allocator()->allocate();

  ATensor.map();
  Iterator input_it(&ATensor, input_window);
  execute_window_loop(input_window, [&](const Coordinates & id){
    *reinterpret_cast<float *>(input_it.ptr()) = src_data[id.z() * (width * height) + id.y() * width + id.x()];
  },input_it);
  
  ATensor.unmap();

  const ITensorInfo *Ainfo    = ATensor.info();
  const ITensorInfo *Oinfo    = OTensor.info();
  const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(ATensor.info()->dimension(0), 16);
  printf("[KERNEL SET-UP] num_elems_processed_per_iteration=%u\n", num_elems_processed_per_iteration);
  if (ATensor.info()->dimension(0) % 16 != 0){
     setenv("CLTUNE_BUILD_OPTION", "-DNON_MULTIPLE_OF_16", true);
     printf("SET CLTUNE BUILD\n");
  }
  printf("READ %s\n", getenv("CLTUNE_BUILD_OPTION") != NULL ? getenv("CLTUNE_BUILD_OPTION") : "(unset)");
  auto kernel_file = std::vector<std::string>{"/home/flavio/CK_REPOS/ck-math/program/acl-softmax-opencl-tuner/softmax_layer2.cl"};

  cltune::Tuner tuner(size_t{0}, size_t{0});
  unsigned int gws_x = (input_window.x().end() - input_window.x().start()) / 16; // not used by the max kernel below
  unsigned int gws_y = (input_window.y().end() - input_window.y().start()) / 1;

  const auto id = tuner.AddKernel(kernel_file, "softmax_layer_max", {1, gws_y}, {1,1});  
  tuner.AddParameter(id, "GROUP_SIZE", {1, 2, 4, 8, 16, 32});
  tuner.MulLocalSize(id, {"GROUP_SIZE"});
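  // Note (assumption about CLTune behaviour): AddParameter registers the
  // GROUP_SIZE values the tuner will sweep, and MulLocalSize multiplies the
  // base local size {1, 1} by the current GROUP_SIZE candidate, so each
  // configuration is launched with a different local work-group size.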

  const Strides &Astrides              = Ainfo->strides_in_bytes();
  unsigned int   Aoffset_first_element = Ainfo->offset_first_element_in_bytes();

  for(unsigned int n = 0; n < Ainfo->num_dimensions(); ++n){
    Aoffset_first_element += input_window[n].start() * Astrides[n];
  }

  const Strides &Ostrides              = Oinfo->strides_in_bytes();
  unsigned int   Ooffset_first_element = Oinfo->offset_first_element_in_bytes();

  for(unsigned int n = 0; n < Oinfo->num_dimensions(); ++n){
    Ooffset_first_element += output_window[n].start() * Ostrides[n];
  }

  // Build host-side vectors covering the full buffers (sizeof on the raw
  // pointers would only give the pointer size, not the element count).
  const unsigned int num_elems = width * height * bsize;
  std::vector<float> v(src_data, src_data + num_elems);
  std::vector<float> vout(dst_data, dst_data + num_elems);

  unsigned int step_y = 1;
/*
  tuner.AddArgumentInput(v);
  tuner.AddArgumentScalar(static_cast<unsigned int>(Astrides[0])); 
  tuner.AddArgumentScalar(static_cast<unsigned int>(width)); 
  tuner.AddArgumentScalar(static_cast<unsigned int>(Astrides[1])); 
  tuner.AddArgumentScalar(static_cast<unsigned int>(step_y)); 
  tuner.AddArgumentScalar(static_cast<unsigned int>(Aoffset_first_element));
  tuner.AddArgumentScalar(static_cast<unsigned int>(Ostrides[0])); 
  tuner.AddArgumentInput(vout);
  tuner.AddArgumentScalar(static_cast<unsigned int>(Ostrides[1])); 
  tuner.AddArgumentScalar(static_cast<unsigned int>(step_y)); 
  tuner.AddArgumentScalar(static_cast<unsigned int>(Ooffset_first_element));
  tuner.AddArgumentScalar(static_cast<unsigned  int>(width));

  tuner.SetNumRuns(10);
  tuner.Tune();
*/
  tuner.PrintToScreen();
//Get output
/*  // output_window is already declared and configured above; reuse it here.

  OTensor.map();
  Iterator output_it(&OTensor, output_window);
  execute_window_loop(output_window, [&](const Coordinates & id){
#ifdef PRINT
    std::cout << "Copying one row starting from [" << id.x() << "," << id.y() << "," << id.z() << "]\n";
#endif
    // Copy one whole row:
    memcpy(dst_data + id.z() * (width * height) + id.y() * width, output_it.ptr(), width * sizeof(float));
  }, output_it);
  OTensor.unmap();
*/


// Compute time
//  secs += (after.tv_sec - before.tv_sec) + (after.tv_usec - before.tv_usec)/1000000.0;
//  std::cout << "Softmax[time]= " << secs; 
//  double avg_time = secs;
    
  delete[] src_data;
  delete[] dst_data;
  return 0;
}
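
Once the commented-out Tune() and read-back blocks are re-enabled, the result can be sanity-checked on the host. The sketch below is an addition, meant to sit before the delete[] calls, and assumes the softmax_layer_max kernel writes the row-wise maximum of its input (the behaviour of ACL's CLLogits1DMaxKernel):

  // Hypothetical check: compare the first element of each output row against a
  // CPU-computed row maximum (assumes dst_data was filled by the read-back loop).
  for(unsigned int b = 0; b < bsize; b++){
    for(unsigned int h = 0; h < height; h++){
      float ref_max = src_data[b * (width * height) + h * width];
      for(unsigned int w = 1; w < width; w++){
        const float val = src_data[b * (width * height) + h * width + w];
        if(val > ref_max) ref_max = val;
      }
      const float gpu_max = dst_data[b * (width * height) + h * width];
      const float diff    = gpu_max - ref_max;
      if(diff > 1e-6f || diff < -1e-6f){
        std::cout << "Mismatch at row (" << b << "," << h << "): " << gpu_max << " vs " << ref_max << "\n";
      }
    }
  }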