// Drives the whole sample: performs one-time initialization and buffer
// creation, then runs the same setup / time / finish sequence first on
// sourceKernel_ and afterwards on binaryKernel_, printing a short timing
// report after each of the two runs.
void OCLSample::run() {
  // Horizontal rule printed above and below each section title.
  const char* const rule = "------------------------------\n";

  // Handed to timeKernel(), which is expected to fill them in before they
  // are printed.
  double elapsed;
  double average;

  initialize();
  createMemoryBuffers();

  // Section 1: sourceKernel_.
  std::cout << rule << "* Source Kernel\n" << rule;
  setupKernel(sourceKernel_);
  timeKernel(sourceKernel_, elapsed, average);
  finishKernel(sourceKernel_);

  std::cout << "Number of Iterations: " << numIterations_ << "\n"
            << "Total Time:           " << elapsed << " sec\n"
            << "Average Time:         " << average << " sec\n";

  // Section 2: binaryKernel_ (elapsed/average are reused for this run).
  std::cout << rule << "* Binary Kernel\n" << rule;
  setupKernel(binaryKernel_);
  timeKernel(binaryKernel_, elapsed, average);
  finishKernel(binaryKernel_);

  std::cout << "Number of Iterations: " << numIterations_ << "\n"
            << "Total Time:           " << elapsed << " sec\n"
            << "Average Time:         " << average << " sec\n";
}
// Sets up the "motion_detection" kernel, calls runMotion() ITERATIONS times
// and prints the accumulated timing counters averaged over those runs.
//
// tempo: output; receives cumulative_processing_time divided by
//        (ITERATIONS * 1000000.0), i.e. the same value printed as
//        "Processing time".
//
// Note: the fixed flag and precision set on std::cout are left in place
// when the function returns.
void MotionDetector::motion_detection(float &tempo)
{
	// Clear every accumulator before starting a new measurement series.
	cumulative_cmd_time = 0;
	cumulative_processing_time = 0;
	cumulative_total_time = 0;
	setup_cumulative = 0;

	cumulative_time = 0;
	setupKernel("motion_detection");
	for (int pass = 0; pass < ITERATIONS; ++pass) {
		runMotion();
	}

	// Print with the fixed flag set and five digits of precision.
	std::cout.setf(std::ios::fixed);
	std::cout.precision(5);

	// Common divisor for the OpenCL counters: averages over ITERATIONS and
	// scales by 1e6 (presumably nanoseconds -> milliseconds; confirm against
	// the code that fills the counters).
	const auto per_run_scale = ITERATIONS * 1000000.0;
	const auto avg_processing = cumulative_processing_time / per_run_scale;

	std::cout << "\nSet-up time: " << cumulative_cmd_time / per_run_scale << std::endl;
	std::cout << "\nProcessing time: " << avg_processing << std::endl;
	std::cout << "\nTotal time OpenCL: " << cumulative_total_time / per_run_scale << std::endl;
	// Host-side figure: setup_cumulative is divided by CLOCKS_PER_SEC and
	// multiplied by 1000, then averaged over ITERATIONS.
	std::cout << "\nTotal time C++: "
	          << 1000 * static_cast<float>(setup_cumulative) / (CLOCKS_PER_SEC * ITERATIONS)
	          << std::endl;

	tempo = avg_processing;
}
Exemple #3
0
/*
 * Entry point of the totient host program.
 *
 * Usage: <program> <lower> <upper> <work-group-size>
 *
 * Parses the three integer arguments, loads the kernel source from
 * "totient_kernel.cl", launches the "totient" kernel with a global size of
 * upper*(upper-1)/2 and a local size of <work-group-size>, then sums the
 * <work-group-size> partial results written into the host buffer and prints
 * the total.
 *
 * Returns 0 on success and 1 on any argument, allocation or file error.
 */
int main(int argc, char ** argv)
{
  long lower, upper;
  int WGS;

  /* Program name plus exactly three user arguments. */
  if (argc != 4) {
    printf("not 3 arguments\n");
    return 1;
  }
  /* Reject non-numeric input instead of continuing with uninitialized values. */
  if (sscanf(argv[1], "%ld", &lower) != 1 ||
      sscanf(argv[2], "%ld", &upper) != 1 ||
      sscanf(argv[3], "%d", &WGS) != 1) {
    fprintf(stderr, "Arguments must be integers: <lower> <upper> <work-group-size>\n");
    return 1;
  }
  /* WGS sizes the result buffer and the local work size, so it must be > 0. */
  if (WGS <= 0) {
    fprintf(stderr, "Work-group size must be positive.\n");
    return 1;
  }

  /* NOTE(review): only `upper` enters the global size; `lower` is used solely
     in the final report below. */
  long results_size = (upper*(upper-1))/2;

  /* One zero-initialized accumulator slot per work-item of a group. */
  long* results = (long *) calloc((size_t)WGS, sizeof(long));
  if (!results) {
    fprintf(stderr, "Failed to allocate result buffer.\n");
    return 1;
  }

  printf("%ld\n", results_size);
  FILE *fp;
  char *KernelSource;
  size_t source_len;
  cl_kernel kernel;
  fp = fopen("totient_kernel.cl", "r");
  if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    free(results);
    exit(1);
  }
  /* One extra byte so the source can always be NUL-terminated. */
  KernelSource = (char*)malloc((size_t)MAX_SOURCE_SIZE + 1);
  if (!KernelSource) {
    fprintf(stderr, "Failed to allocate kernel source buffer.\n");
    fclose( fp );
    free(results);
    return 1;
  }
  source_len = fread( KernelSource, 1, MAX_SOURCE_SIZE, fp);
  if (ferror(fp)) {
    fprintf(stderr, "Failed to load kernel.\n");
    fclose( fp );
    free(KernelSource);
    free(results);
    return 1;
  }
  /* fread() does not terminate the buffer; the source is passed on as a string. */
  KernelSource[source_len] = '\0';
  fclose( fp );

  size_t local[1];
  size_t global[1];
  local[0] = WGS;
  global[0] = results_size;

  initGPU();

  /* Build the "totient" kernel with two arguments: the result array of WGS
     longs and the integer constant WGS. */
  kernel = setupKernel( KernelSource, "totient", 2,
                                  LongArr, WGS, results,
                                  IntConst, WGS);

  /* Launch over a 1-dimensional range. */
  runKernel( kernel, 1, global, local);

  /* Reduce the per-slot partial sums on the host. */
  long tot = 0;
  int l;
  for(l = 0; l < WGS; l ++) tot += results[l];

  printf("C: Sum of Totients between [%ld..%ld] is %ld\n",
    lower, upper, tot);

  free(KernelSource);
  free(results);
  return 0;
}
Exemple #4
0
/**
 * Reproducer driver for the AderDg::local kernel.
 *
 * Sets up the DG data, matrix kernels, star matrices, flux solvers, DOF /
 * time-integrated tensors, scratch memory and a pseudo mesh; then calls
 * edge::elastic::solvers::AderDg::local i_nSteps times, with the elements
 * split into contiguous per-thread ranges; finally prints the elapsed time
 * and the derived GFLOPS figure, optionally dumps the tensors
 * (PP_REPRODUCER_DUMP) and releases everything that was set up.
 *
 * @param i_nSteps number of times the kernel is invoked on every range.
 * @param i_nElements total number of elements distributed over the threads.
 * @return always 0.
 **/
int edge::reproducers::local( unsigned int const i_nSteps, unsigned int const i_nElements ) {

  double                                  l_dT = 0.000001;
  unsigned int                            l_nSteps = i_nSteps;
  unsigned int                            l_nElements = i_nElements;
  t_elementChars                        * l_elChars; /* not initialized here; passed by address to setupPseudoMesh below */
  t_dg                                    l_dg;
  t_matStar                            (* l_starM)[N_DIM];
  t_fluxSolver                         (* l_fluxSolvers)[ C_ENT[T_SDISC.ELEMENT].N_FACES ];
  real_base                            (* l_dofs)[N_QUANTITIES][N_ELEMENT_MODES][N_CRUNS];
  real_base                            (* l_tInt)[N_QUANTITIES][N_ELEMENT_MODES][N_CRUNS];
  edge::io::Receivers                     l_recvs;
  edge::data::MmXsmmFused< real_base >    l_mm;
  // placeholders for kernel arguments this reproducer does not provide values for
  unsigned int                            l_dummyUInt;
  double                                  l_dummyDouble;


  // 1. Set up structures
  setupDg( l_dg );
  setupKernel( l_mm );

  setupStarM( l_nElements, &l_starM );
  setupFluxSolv( l_nElements, &l_fluxSolvers );

  setupTensor( l_nElements, &l_dofs, &l_tInt );
  // with OpenMP: every thread executes the scratch-memory setup, one thread at a time
#ifdef PP_USE_OMP
  #pragma omp parallel
  #pragma omp critical
#endif
  setupScratchMem( &(edge::parallel::g_scratchMem) );

  setupPseudoMesh( edge::reproducers::C_MODE_LOCAL, l_nElements,
                   &l_elChars,
                   nullptr, nullptr, nullptr, nullptr, nullptr );


  // 2. Run solvers
  std::cout << "Runing solvers" << std::endl;
  unsigned long long l_start = libxsmm_timer_tick();
#ifdef PP_USE_OMP
  #pragma omp parallel firstprivate( l_nSteps, l_nElements, l_dT )  \
                       firstprivate( l_elChars )                    \
                       firstprivate( l_dg, l_starM, l_fluxSolvers ) \
                       firstprivate( l_dofs, l_tInt  )              \
                       firstprivate( l_mm )                         \
                       private( l_recvs, l_dummyUInt, l_dummyDouble )
#endif
  {
    const unsigned int l_nThreads = omp_get_num_threads();
    const unsigned int l_tid = omp_get_thread_num();
    // contiguous range [l_firstEl, l_lastEl) of ceil(l_nElements / l_nThreads) elements per thread
    unsigned int l_firstEl = (unsigned int)((l_nElements + l_nThreads - 1) / l_nThreads) * l_tid;
    unsigned int l_lastEl = (unsigned int)((l_nElements + l_nThreads - 1) / l_nThreads) * (l_tid + 1);
    // clamp BOTH ends: with rounded-up chunks the start of a trailing thread's range can lie
    // beyond l_nElements, and clamping only the end would underflow the unsigned element count
    l_firstEl = std::min(l_firstEl, l_nElements);
    l_lastEl = std::min(l_lastEl, l_nElements);
    unsigned int l_numEl = l_lastEl - l_firstEl;

    for ( unsigned int l_step = 0; l_step < l_nSteps; l_step++ ) {
      edge::elastic::solvers::AderDg::local< unsigned int,
                                             real_base,
                                             edge::data::MmXsmmFused< real_base > >
                                           ( l_firstEl,
                                             l_numEl,
                                             l_dummyDouble,
                                             l_dT,
                                             l_dummyUInt,
                                             l_dummyUInt,
                                             nullptr,
                                             nullptr,
                                             l_elChars,
                                             l_dg,
                                             l_starM,
                                             l_fluxSolvers,
                                             l_dofs,
                                             l_tInt,
                                             nullptr,
                                             l_recvs,
                                             l_mm           );
      // all threads finish a step before any thread starts the next one
#ifdef PP_USE_OMP
      #pragma omp barrier
#endif
    }
  }
  unsigned long long l_end = libxsmm_timer_tick();

  // 3. Print statistics
  double l_time = libxsmm_timer_duration(l_start, l_end);
  // table indexed by ORDER-1; presumably the flop count per element, step and fused run
  unsigned int l_local_flops[] =
  {
    792, 3564, 11412, 31500, 77184, 173538, 360522
  };
  unsigned long long l_flops = (unsigned long long)l_local_flops[ORDER-1] * PP_N_CRUNS * \
                               l_nElements * l_nSteps;
  double l_gflops = (double)l_flops / (l_time * 1000000000);
  std::cout << "Elapsed time: " << l_time << " s" << std::endl;
  std::cout << "Performance:  " << l_gflops << " GFLOPS" << std::endl;
  std::cout << std::endl;

  // optional dump: one value per line, iterating element, quantity, mode, fused run (innermost)
#ifdef PP_REPRODUCER_DUMP
  std::string l_dumpFileName1 = "./local_o"+std::to_string(ORDER)+"_"
                                "f"+std::to_string(PP_PRECISION)+"_"
                                "el"+std::to_string(l_nElements)+"_"
                                "stp"+std::to_string(l_nSteps)+"_dofs.log";
  std::string l_dumpFileName2 = "./local_o"+std::to_string(ORDER)+"_"
                                "f"+std::to_string(PP_PRECISION)+"_"
                                "el"+std::to_string(l_nElements)+"_"
                                "stp"+std::to_string(l_nSteps)+"_tInt.log";
  std::ofstream l_fp1( l_dumpFileName1 );
  std::ofstream l_fp2( l_dumpFileName2 );
  for ( unsigned int l_el = 0; l_el < l_nElements; l_el++ ) {
    for ( unsigned int l_qt = 0; l_qt < N_QUANTITIES; l_qt++ ) {
      for ( unsigned int l_md = 0; l_md < N_ELEMENT_MODES; l_md++ ) {
        for ( unsigned int l_cfr = 0; l_cfr < N_CRUNS; l_cfr++ ) {
          l_fp1 << l_dofs[l_el][l_qt][l_md][l_cfr] << "\n";
          l_fp2 << l_tInt[l_el][l_qt][l_md][l_cfr] << "\n";
        }
      }
    }
  }
#endif


  // 4. Clean up
  cleanupDg( l_dg );
  cleanupStarM( l_starM );
  cleanupFluxSolv( l_fluxSolvers );
  cleanupTensor( l_dofs, l_tInt );

  // mirrors the scratch-memory setup: executed by every thread, one at a time
#ifdef PP_USE_OMP
  #pragma omp parallel
  #pragma omp critical
#endif
  cleanupScratchMem( edge::parallel::g_scratchMem );

  cleanupPseudoMesh( edge::reproducers::C_MODE_LOCAL,
                     l_elChars,
                     nullptr, nullptr, nullptr, nullptr, nullptr );

  return 0;
}