Code Example #1
File: res_seqkernel.cpp, Project: OP2/OP2-Common
// host stub function
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3){

  int nargs = 4;
  op_arg args[4];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map1idx = arg1.map_data[n * arg1.map->dim + 1];
      int map2idx = arg1.map_data[n * arg1.map->dim + 0];


      res(
        &((double*)arg0.data)[1 * n],
        &((double*)arg1.data)[1 * map1idx],
        &((double*)arg2.data)[1 * map2idx],
        (double*)arg3.data);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
  OP_kernels[0].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg3.size;
  OP_kernels[0].transfer += (float)set->size * arg1.map->dim * 4.0f;
}
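The host stub above only orchestrates halo exchanges, indirection indexing and timing; the per-element work is done by a user-written res kernel that is not part of this listing. A minimal sketch consistent with the call signature in the stub (one direct value per element, two indirectly mapped values, one global coefficient) could look like the following; the body is an assumption, not code quoted from the listed project.

// Hypothetical elemental kernel (a sketch, not from the source): accumulate
// a Jacobi-style contribution into the indirectly accessed value du.
inline void res(const double *A, const double *u, double *du,
                const double *beta) {
  *du += (*beta) * (*A) * (*u);
}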
Code Example #2
// host stub function
void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1,
                        op_arg arg2, op_arg arg3, op_arg arg4) {

  double *arg4h = (double *)arg4.data;
  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(4);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[4].name = name;
  OP_kernels[4].count += 1;

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection:  update");
  }

  op_mpi_halo_exchanges_cuda(set, nargs, args);

  double arg4_l = arg4h[0];

  if (set->size > 0) {

    // Set up typed device pointers for OpenACC

    double *data0 = (double *)arg0.data_d;
    double *data1 = (double *)arg1.data_d;
    double *data2 = (double *)arg2.data_d;
    double *data3 = (double *)arg3.data_d;
#pragma acc parallel loop independent deviceptr(data0, data1, data2,           \
                                                data3) reduction(+ : arg4_l)
    for (int n = 0; n < set->size; n++) {
      update(&data0[4 * n], &data1[4 * n], &data2[4 * n], &data3[1 * n],
             &arg4_l);
    }
  }

  // combine reduction data
  arg4h[0] = arg4_l;
  op_mpi_reduce_double(&arg4, arg4h);
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[4].time += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size;
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}
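Host stubs such as op_par_loop_update are generated from a user-level op_par_loop call. A call with the matching argument pattern (four per-element datasets plus one globally reduced scalar) might look like the sketch below; the set and dataset names (cells, p_qold, p_q, p_res, p_adt, rms) are illustrative assumptions rather than names taken from the listing.

// Hypothetical user-level loop (names assumed) that the code generator would
// translate into a host stub like op_par_loop_update above.
double rms = 0.0;
op_par_loop(update, "update", cells,
            op_arg_dat(p_qold, -1, OP_ID, 4, "double", OP_READ),
            op_arg_dat(p_q,    -1, OP_ID, 4, "double", OP_WRITE),
            op_arg_dat(p_res,  -1, OP_ID, 4, "double", OP_RW),
            op_arg_dat(p_adt,  -1, OP_ID, 1, "double", OP_READ),
            op_arg_gbl(&rms,    1, "double", OP_INC));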
Code Example #3
void op_par_loop_save_soln(char const *name, op_set set,      
  op_arg arg0,                                                
  op_arg arg1 ){                                              
                                                              
  int ninds   = 0;    
  int nargs   = 2;
  op_arg args[2] = {arg0,arg1};

  if (OP_diags>2) {                                           
    printf(" kernel routine w/o indirection:  save_soln \n"); 
  }                                                           
                                                              
  // initialise timers                                        
                                                              
  double cpu_t1, cpu_t2, wall_t1, wall_t2;                    
  op_timers_core(&cpu_t1, &wall_t1);                               
                                                              
  // set number of threads                                    
                                                              
#ifdef _OPENMP                                                
  int nthreads = omp_get_max_threads( );                      
#else                                                         
  int nthreads = 1;                                           
#endif                                                        
                                                              
  // execute plan                                             
                                                              
#pragma omp parallel for                                      
  for (int thr=0; thr<nthreads; thr++) {                      
    int start  = (set->size* thr   )/nthreads;                
    int finish = (set->size*(thr+1))/nthreads;                
    op_x86_save_soln( (double *) arg0.data,                    
                      (double *) arg1.data,                    
                      start, finish );                        
  }                                                           
                           
  //set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  
  
  // update kernel record                                     
                                                              
  op_timers_core(&cpu_t2, &wall_t2);                               
  op_timing_realloc(0);                                       
  OP_kernels[0].name      = name;                             
  OP_kernels[0].count    += 1;                                
  OP_kernels[0].time     += wall_t2 - wall_t1;                
  OP_kernels[0].transfer += (double)set->size * arg0.size;     
  OP_kernels[0].transfer += (double)set->size * arg1.size;     
}                                                             
Code Example #4
// host stub function
void op_par_loop_save_soln_cpu(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1){

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);


  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  save_soln");
  }

  op_mpi_halo_exchanges(set, nargs, args);
  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  if (set->size >0) {

    // execute plan
    #pragma omp parallel for
    for ( int thr=0; thr<nthreads; thr++ ){
      int start  = (set->size* thr)/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      for ( int n=start; n<finish; n++ ){
        save_soln(
          &((double*)arg0.data)[4*n],
          &((double*)arg1.data)[4*n]);
      }
    }
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
}
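The save_soln stub calls a user-defined elemental kernel on four-component state vectors (note the 4*n strides). A plausible kernel consistent with that signature is sketched below; the body (a plain copy of the state) is an assumption, not code quoted from the project.

// Hypothetical elemental kernel (a sketch): copy the 4-component solution
// vector q into qold, matching the strides used by the host stub.
inline void save_soln(const double *q, double *qold) {
  for (int n = 0; n < 4; n++)
    qold[n] = q[n];
}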
Code Example #5
File: update_seqkernel.cpp, Project: OP2/OP2-Common
// host stub function
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4){

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);


  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  update");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      update(
        &((float*)arg0.data)[1*n],
        &((float*)arg1.data)[1*n],
        &((float*)arg2.data)[1*n],
        (float*)arg3.data,
        (float*)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce_float(&arg3,(float*)arg3.data);
  op_mpi_reduce_float(&arg4,(float*)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
  OP_kernels[1].transfer += (float)set->size * arg0.size;
  OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f;
}
Code Example #6
File: res_calc_kernel.cpp, Project: xyuan/OP2-Common
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9 ){


  int    nargs   = 13;
  op_arg args[13];

  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 4; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for (int v = 1; v < 4; v++) {
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for (int v = 1; v < 4; v++) {
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  int    ninds   = 3;
  int    inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {


    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
         (double *)arg0.data,
         (double *)arg4.data,
         (double *)arg9.data,
         Plan->ind_map,
         Plan->loc_map,
         (double *)arg8.data,
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);

      block_offset += nblocks;
    }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Code Example #7
// host stub function
void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1,
                        op_arg arg2, op_arg arg3, op_arg arg4) {

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  // create aligned pointers for dats
  ALIGNED_double const double *__restrict__ ptr0 = (double *)arg0.data;
  __assume_aligned(ptr0, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr1 = (double *)arg1.data;
  __assume_aligned(ptr1, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr2 = (double *)arg2.data;
  __assume_aligned(ptr2, double_ALIGN);
  ALIGNED_double const double *__restrict__ ptr3 = (double *)arg3.data;
  __assume_aligned(ptr3, double_ALIGN);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(4);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection:  update");
  }

  int exec_size = op_mpi_halo_exchanges(set, nargs, args);

  if (exec_size > 0) {

#ifdef VECTORIZE
#pragma novector
    for (int n = 0; n < (exec_size / SIMD_VEC) * SIMD_VEC; n += SIMD_VEC) {
      double dat4[SIMD_VEC] = {0.0};
#pragma simd
      for (int i = 0; i < SIMD_VEC; i++) {
        update(&(ptr0)[4 * (n + i)], &(ptr1)[4 * (n + i)], &(ptr2)[4 * (n + i)],
               &(ptr3)[1 * (n + i)], &dat4[i]);
      }
      for (int i = 0; i < SIMD_VEC; i++) {
        *(double *)arg4.data += dat4[i];
      }
    }
    // remainder
    for (int n = (exec_size / SIMD_VEC) * SIMD_VEC; n < exec_size; n++) {
#else
    for (int n = 0; n < exec_size; n++) {
#endif
      update(&(ptr0)[4 * n], &(ptr1)[4 * n], &(ptr2)[4 * n], &(ptr3)[1 * n],
             (double *)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce(&arg4, (double *)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[4].name = name;
  OP_kernels[4].count += 1;
  OP_kernels[4].time += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}
Code Example #8
File: update_mpi_kernel.cpp, Project: ioz9/OP2-Common
void op_par_loop_update(char const *name, op_set set,           
  op_arg arg0,                                                  
  op_arg arg1,                                                  
  op_arg arg2,                                                  
  op_arg arg3,                                                  
  op_arg arg4 ){                                                
   
  int ninds   = 0;    
  int nargs   = 5; 
  op_arg args[5] = {arg0,arg1,arg2,arg3,arg4};
  
  double *arg4h = (double *)arg4.data;                            
                                                                
  if (OP_diags>2) {                                             
    printf(" kernel routine w/o indirection:  update \n");      
  }                                                             
                                                                
  // initialise timers                                          
                                                                
  double cpu_t1, cpu_t2, wall_t1, wall_t2;                      
  op_timers_core(&cpu_t1, &wall_t1);                                 
                                                                
  // set number of threads                                      
                                                                
#ifdef _OPENMP                                                  
  int nthreads = omp_get_max_threads( );                        
#else                                                           
  int nthreads = 1;                                             
#endif                                                          
                                                                
  // allocate and initialise arrays for global reduction        
                                                                
  double arg4_l[1+64*64];                                        
  for (int thr=0; thr<nthreads; thr++)                          
    for (int d=0; d<1; d++) arg4_l[d+thr*64]=ZERO_double;        
                                                                
  // execute plan                                               
                                                                
#pragma omp parallel for                                        
  for (int thr=0; thr<nthreads; thr++) {                        
    int start  = (set->size* thr   )/nthreads;                  
    int finish = (set->size*(thr+1))/nthreads;                  
    op_x86_update( (double *) arg0.data,                         
                   (double *) arg1.data,                         
                   (double *) arg2.data,                         
                   (double *) arg3.data,                         
                   arg4_l + thr*64,                             
                   start, finish );                             
  }                                                             
                                                                
  // combine reduction data                                     
                                                                
  for (int thr=0; thr<nthreads; thr++)                          
    for(int d=0; d<1; d++) arg4h[d] += arg4_l[d+thr*64];        
     
  //set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_GBL)
      global_reduce(&args[i]);
  


  // update kernel record                                       
                                                                
  op_timers_core(&cpu_t2, &wall_t2);                                 
  op_timing_realloc(4);                                         
  OP_kernels[4].name      = name;                               
  OP_kernels[4].count    += 1;                                  
  OP_kernels[4].time     += wall_t2 - wall_t1;                  
  OP_kernels[4].transfer += (double)set->size * arg0.size;       
  OP_kernels[4].transfer += (double)set->size * arg1.size;       
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;       
}                                                               
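Example #8 partitions the set across OpenMP threads and gives each thread a private reduction slot (arg4_l + thr*64, padded to limit false sharing), which is summed into arg4h after the parallel region. The worker op_x86_update it calls is not shown in the listing; a minimal sketch consistent with how it is invoked, assuming 4-component state vectors and one scalar per element as in the other update examples, might be:

// Hypothetical worker (a sketch, not from the source): apply the elemental
// update kernel to the [start, finish) slice owned by one thread and
// accumulate into that thread's private reduction slot.
void op_x86_update(double *arg0, double *arg1, double *arg2, double *arg3,
                   double *arg4_l, int start, int finish) {
  for (int n = start; n < finish; n++)
    update(arg0 + 4 * n, arg1 + 4 * n, arg2 + 4 * n, arg3 + n, arg4_l);
}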
Code Example #9
// host stub function
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9,
  op_arg arg13){

  int nargs = 17;
  op_arg args[17];

  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }

  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }


  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  int  ninds   = 4;
  int  inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_0
    int nthread = OP_BLOCK_SIZE_0;
  #else
    int nthread = OP_block_size;
  #endif


  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
     int map0size = arg0.map->dim * set_size1;

    double* data8 = (double*)arg8.data_d;
    int dat8size = (arg8.opt?1:0) * getSetSizeFromOpArg(&arg8) * arg8.dat->dim;
    double *data0 = (double *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
    double *data4 = (double *)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    double *data9 = (double *)arg9.data_d;
    int dat9size =
        (arg9.opt ? 1 : 0) * getSetSizeFromOpArg(&arg9) * arg9.dat->dim;
    double *data13 = (double *)arg13.data_d;
    int dat13size =
        (arg13.opt ? 1 : 0) * getSetSizeFromOpArg(&arg13) * arg13.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      res_calc_omp4_kernel(
        map0,
        map0size,
        data8,
        dat8size,
        data0,
        dat0size,
        data4,
        dat4size,
        data9,
        dat9size,
        data13,
        dat13size,
        col_reord,
        set_size1,
        start,
        end,
        part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread,
        nthread,
        opDat0_res_calc_stride_OP2CONSTANT,
        direct_res_calc_stride_OP2CONSTANT);

    }
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Code Example #10
void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){


  int    nargs   = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  EvolveValuesRK2_1\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  if (set->size >0) {

    op_timers_core(&cpu_t1, &wall_t1);

  // execute plan

#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_EvolveValuesRK2_1( (float *) arg0.data,
                              (float *) arg1.data,
                              (float *) arg2.data,
                              (float *) arg3.data,
                              (float *) arg4.data,
                              start, finish );
  }

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg2.size;
  OP_kernels[0].transfer += (float)set->size * arg3.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
}
Code Example #11
File: res_calc_kernel.cpp, Project: ioz9/OP2-Common
void op_par_loop_res_calc(char const *name, op_set set,                 
  op_arg arg0,                                                          
  op_arg arg1,                                                          
  op_arg arg2,                                                          
  op_arg arg3,                                                          
  op_arg arg4,                                                          
  op_arg arg5,                                                          
  op_arg arg6,                                                          
  op_arg arg7 ){                                                        
                                                                        
                                                                        
  int    nargs   = 8;                                                   
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};           
                                                                        
  int    ninds   = 4;                                                   
  int    inds[8] = {0,0,1,1,2,2,3,3};                                   
                                                                        
  if (OP_diags>2) {                                                     
    printf(" kernel routine with indirection: res_calc \n");            
  }                                                                     
                                                                        
  // get plan                                                           
                                                                        
  #ifdef OP_PART_SIZE_2                                                 
    int part_size = OP_PART_SIZE_2;                                     
  #else                                                                 
    int part_size = OP_part_size;                                       
  #endif                                                                
                                                                        
  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
                                                                        
  // initialise timers                                                  
                                                                        
  double cpu_t1, cpu_t2, wall_t1, wall_t2;                              
  op_timers_core(&cpu_t1, &wall_t1);                                         
                                                                        
  // set number of threads                                              
                                                                        
#ifdef _OPENMP                                                          
  int nthreads = omp_get_max_threads( );                                
#else                                                                   
  int nthreads = 1;                                                     
#endif                                                                  
                                                                        
  // execute plan                                                       
                                                                        
  int block_offset = 0;                                                 
                                                                        
  for (int col=0; col < Plan->ncolors; col++) {                         
    int nblocks = Plan->ncolblk[col];                                   
                                                                        
#pragma omp parallel for                                                
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)                  
     op_x86_res_calc( blockIdx,                                         
       (double *)arg0.data, Plan->ind_maps[0],                          
       (double *)arg2.data, Plan->ind_maps[1],                          
       (double *)arg4.data, Plan->ind_maps[2],                          
       (double *)arg6.data, Plan->ind_maps[3],                          
       Plan->loc_maps[0],                                               
       Plan->loc_maps[1],                                               
       Plan->loc_maps[2],                                               
       Plan->loc_maps[3],                                               
       Plan->loc_maps[4],                                               
       Plan->loc_maps[5],                                               
       Plan->loc_maps[6],                                               
       Plan->loc_maps[7],                                               
       Plan->ind_sizes,                                                 
       Plan->ind_offs,                                                  
       block_offset,                                                    
       Plan->blkmap,                                                    
       Plan->offset,                                                    
       Plan->nelems,                                                    
       Plan->nthrcol,                                                   
       Plan->thrcol);                                                   
                                                                        
    block_offset += nblocks;                                            
  }                                                                     
                                                                        
  // combine reduction data                                             
                                                                        
  // update kernel record                                               
                                                                        
  op_timers_core(&cpu_t2, &wall_t2);                                         
  op_timing_realloc(2);                                                 
  OP_kernels[2].name      = name;                                       
  OP_kernels[2].count    += 1;                                          
  OP_kernels[2].time     += wall_t2 - wall_t1;                          
  OP_kernels[2].transfer  += Plan->transfer;                            
  OP_kernels[2].transfer2 += Plan->transfer2;                           
}                                                                       
Code Example #12
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;

  int  ninds   = 1;
  int  inds[6] = {0,0,0,0,-1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_1
    int nthread = OP_BLOCK_SIZE_1;
  #else
    int nthread = OP_block_size;
  #endif


  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
     int map0size = arg0.map->dim * set_size1;

    float* data4 = (float*)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    float* data5 = (float*)arg5.data_d;
    int dat5size = getSetSizeFromOpArg(&arg5) * arg5.dat->dim;
    float *data0 = (float *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      adt_calc_omp4_kernel(map0, map0size, data4, dat4size, data5, dat5size,
                           data0, dat0size, col_reord, set_size1, start, end,
                           part_size != 0 ? (end - start - 1) / part_size + 1
                                          : (end - start - 1) / nthread,
                           nthread);
    }
    OP_kernels[1].transfer  += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time     += wall_t2 - wall_t1;
}
Code Example #13
File: op_cuda_decl.c, Project: ioz9/OP2-Common
void op_timers(double * cpu, double * et)
{
  op_timers_core(cpu,et);
}
Code Example #14
File: op_mpi_decl.c, Project: ioz9/OP2-Common
void op_timers(double * cpu, double * et)
{
  MPI_Barrier(MPI_COMM_WORLD);
  op_timers_core(cpu,et);
}
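Examples #13 and #14 show the backend-specific op_timers wrappers around op_timers_core; the MPI build adds a barrier so every rank starts timing from the same point. A typical application-level use is to bracket a region of op_par_loop calls, as in the sketch below (the loop body and the niter variable are assumed for illustration).

// Sketch of application-level timing with op_timers (loop body assumed).
double cpu_t1, wall_t1, cpu_t2, wall_t2;
op_timers(&cpu_t1, &wall_t1);   // barrier + timestamp on the MPI backend

for (int iter = 0; iter < niter; iter++) {
  // ... op_par_loop calls for one iteration ...
}

op_timers(&cpu_t2, &wall_t2);
printf("elapsed wall time: %f s\n", wall_t2 - wall_t1);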
Code Example #15
File: update_omp4kernel.cpp, Project: OP2/OP2-Common
// host stub function
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3){

  double*arg3h = (double *)arg3.data;
  int nargs = 4;
  op_arg args[4];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(8);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[8].name      = name;
  OP_kernels[8].count    += 1;


  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  update");
  }

  op_mpi_halo_exchanges_cuda(set, nargs, args);

  #ifdef OP_PART_SIZE_8
    int part_size = OP_PART_SIZE_8;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_8
    int nthread = OP_BLOCK_SIZE_8;
  #else
    int nthread = OP_block_size;
  #endif

  double arg3_l = arg3h[0];

  if (set->size >0) {

    //Set up typed device pointers for OpenMP

    double* data0 = (double*)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
    double* data1 = (double*)arg1.data_d;
    int dat1size = getSetSizeFromOpArg(&arg1) * arg1.dat->dim;
    double* data2 = (double*)arg2.data_d;
    int dat2size = getSetSizeFromOpArg(&arg2) * arg2.dat->dim;
    update_omp4_kernel(
      data0,
      dat0size,
      data1,
      dat1size,
      data2,
      dat2size,
      &arg3_l,
      set->size,
      part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread,
      nthread);

  }

  // combine reduction data
  arg3h[0] = arg3_l;
  op_mpi_reduce_double(&arg3,arg3h);
  op_mpi_set_dirtybit_cuda(nargs, args);

  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[8].time     += wall_t2 - wall_t1;
  OP_kernels[8].transfer += (float)set->size * arg0.size * 2.0f;
  OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[8].transfer += (float)set->size * arg2.size;
}
Code Example #16
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int *arg1h = (int *)arg1.data;

  int    nargs   = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  int    ninds   = 1;
  int    inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction

  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_int;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
         (double *)arg0.data,
         Plan->ind_map,
         Plan->loc_map,
         &arg1_l[64*omp_get_thread_num()],
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);


  // combine reduction data
    if (col == Plan->ncolors_owned-1) {
      for (int thr=0; thr<nthreads; thr++)
        for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];
    }

      block_offset += nblocks;
    }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_reduce(&arg1,arg1h);

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Code Example #17
File: res_kernel.cpp, Project: xyuan/OP2-Common
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  float *arg3h = (float *)arg3.data;

  int    nargs   = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  int    ninds   = 2;
  int    inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {


  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
  // execute plan

  int block_offset = 0;

  for (int col=0; col < Plan->ncolors; col++) {
    if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

    int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res( blockIdx,
         (float *)arg1.data,
         (float *)arg2.data,
         Plan->ind_map,
         Plan->loc_map,
         (float *)arg0.data,
         (float *)arg3.data,
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);

    block_offset += nblocks;
  }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Code Example #18
File: res_calc_seqkernel.cpp, Project: OP2/OP2-Common
// host stub function
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9){

  int nargs = 13;
  op_arg args[13];

  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }


  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      const double* arg0_vec[] = {
         &((double*)arg0.data)[2 * map0idx],
         &((double*)arg0.data)[2 * map1idx],
         &((double*)arg0.data)[2 * map2idx],
         &((double*)arg0.data)[2 * map3idx]};
      const double* arg4_vec[] = {
         &((double*)arg4.data)[1 * map0idx],
         &((double*)arg4.data)[1 * map1idx],
         &((double*)arg4.data)[1 * map2idx],
         &((double*)arg4.data)[1 * map3idx]};
      double* arg9_vec[] = {
         &((double*)arg9.data)[1 * map0idx],
         &((double*)arg9.data)[1 * map1idx],
         &((double*)arg9.data)[1 * map2idx],
         &((double*)arg9.data)[1 * map3idx]};

      res_calc(
        arg0_vec,
        arg4_vec,
        &((double*)arg8.data)[16 * n],
        arg9_vec);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
  OP_kernels[0].transfer += (float)set->size * arg9.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg8.size;
  OP_kernels[0].transfer += (float)set->size * arg0.map->dim * 4.0f;
}
Code Example #19
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4,
                          op_arg arg5) {

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;

  int ninds = 1;
  int inds[6] = {0, 0, 0, 0, -1, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

// get plan
#ifdef OP_PART_SIZE_1
  int part_size = OP_PART_SIZE_1;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;

  if (set->size > 0) {

    // Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    float *data4 = (float *)arg4.data_d;
    float *data5 = (float *)arg5.data_d;
    float *data0 = (float *)arg0.data_d;

    op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args, ninds,
                                      inds, OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == 1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col + 1];

#pragma acc parallel loop independent deviceptr(col_reord, map0, data4, data5, \
                                                data0)
      for (int e = start; e < end; e++) {
        int n = col_reord[e];
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        adt_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data0[2 * map2idx],
                 &data0[2 * map3idx], &data4[4 * n], &data5[1 * n]);
      }
    }
    OP_kernels[1].transfer += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time += wall_t2 - wall_t1;
}
Code Example #20
File: res_calc_kernel.cpp, Project: xyuan/OP2-Common
void op_par_loop_res_calc(char const *name, op_set set,
                          op_arg arg0,
                          op_arg arg1,
                          op_arg arg2,
                          op_arg arg3,
                          op_arg arg4,
                          op_arg arg5,
                          op_arg arg6,
                          op_arg arg7 ) {


    int    nargs   = 8;
    op_arg args[8];

    args[0] = arg0;
    args[1] = arg1;
    args[2] = arg2;
    args[3] = arg3;
    args[4] = arg4;
    args[5] = arg5;
    args[6] = arg6;
    args[7] = arg7;

    int    ninds   = 4;
    int    inds[8] = {0,0,1,1,2,2,3,3};

    if (OP_diags>2) {
        printf(" kernel routine with indirection: res_calc\n");
    }

    // get plan

#ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
#else
    int part_size = OP_part_size;
#endif

    int set_size = op_mpi_halo_exchanges(set, nargs, args);

    // initialise timers

    double cpu_t1, cpu_t2, wall_t1, wall_t2;
    op_timers_core(&cpu_t1, &wall_t1);

    if (set->size >0) {


        op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
        // execute plan

        int block_offset = 0;

        for (int col=0; col < Plan->ncolors; col++) {
            if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

            int nblocks = Plan->ncolblk[col];

            #pragma omp parallel for
            for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
                op_x86_res_calc( blockIdx,
                                 (double *)arg0.data,
                                 (double *)arg2.data,
                                 (double *)arg4.data,
                                 (double *)arg6.data,
                                 Plan->ind_map,
                                 Plan->loc_map,
                                 Plan->ind_sizes,
                                 Plan->ind_offs,
                                 block_offset,
                                 Plan->blkmap,
                                 Plan->offset,
                                 Plan->nelems,
                                 Plan->nthrcol,
                                 Plan->thrcol,
                                 set_size);

            block_offset += nblocks;
        }

        op_timing_realloc(2);
        OP_kernels[2].transfer  += Plan->transfer;
        OP_kernels[2].transfer2 += Plan->transfer2;

    }


    // combine reduction data

    op_mpi_set_dirtybit(nargs, args);

    // update kernel record

    op_timers_core(&cpu_t2, &wall_t2);
    op_timing_realloc(2);
    OP_kernels[2].name      = name;
    OP_kernels[2].count    += 1;
    OP_kernels[2].time     += wall_t2 - wall_t1;
}
Code Example #21
File: res_kernel.cpp, Project: OP2/OP2-Common
// host stub function
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3){

  int nargs = 4;
  op_arg args[4];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  int  ninds   = 2;
  int  inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map1idx = arg1.map_data[n * arg1.map->dim + 1];
          int map2idx = arg1.map_data[n * arg1.map->dim + 0];


          res(
            &((float*)arg0.data)[1 * n],
            &((float*)arg1.data)[1 * map1idx],
            &((float*)arg2.data)[1 * map2idx],
            (float*)arg3.data);
        }
      }

      block_offset += nblocks;
    }
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Code Example #22
void op_par_loop_SpaceDiscretization(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6 ){


  int    nargs   = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  arg6.idx = 0;
  args[6] = arg6;
  for (int v = 1; v < 2; v++) {
    args[6 + v] = op_arg_dat(arg6.dat, v, arg6.map, 1, "float", OP_READ);
  }

  int    ninds   = 2;
  int    inds[8] = {0,0,-1,-1,-1,-1,1,1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: SpaceDiscretization\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_18
    int part_size = OP_PART_SIZE_18;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(18);
  OP_kernels[18].name      = name;
  OP_kernels[18].count    += 1;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_SpaceDiscretization( blockIdx,
         (float *)arg0.data,
         (float *)arg6.data,
         Plan->ind_map,
         Plan->loc_map,
         (float *)arg2.data,
         (float *)arg3.data,
         (float *)arg4.data,
         (int *)arg5.data,
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);

      block_offset += nblocks;
    }

  op_timing_realloc(18);
  OP_kernels[18].transfer  += Plan->transfer;
  OP_kernels[18].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[18].time     += wall_t2 - wall_t1;
}
Code Example #23
void op_par_loop_res_calc(char const *name, op_set set,                 
  op_arg arg0,                                                          
  op_arg arg1,                                                          
  op_arg arg2,                                                          
  op_arg arg3,                                                          
  op_arg arg4,                                                          
  op_arg arg5,                                                          
  op_arg arg6,                                                          
  op_arg arg7 ){     

  int    nargs   = 8;                                                   
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};  

  int    ninds   = 4;                                                   
  int    inds[8] = {0,0,1,1,2,2,3,3}; 
  
  int sent[8] = {0,0,0,0,0,0,0,0}; // flags recording whether a halo exchange was issued for each argument
  if(ninds > 0) //indirect loop
  {
      for(int i = 0; i<nargs; i++)
      {
      	  if(args[i].argtype == OP_ARG_DAT)
      	  {
      	      if (OP_diags==1) reset_halo(args[i]);
      	      sent[i] = exchange_halo(args[i]); 
      	      //if(sent[i] == 1)wait_all(args[i]);
      	  }
      }
  }
                                                                        
  if (OP_diags>2) {                                                     
    printf(" kernel routine with indirection: res_calc \n");            
  }                                                                     
                                                                        
  // get plan             
  int block_offset;  
  op_plan *Plan;
                                                                        
  #ifdef OP_PART_SIZE_2                                                 
    int part_size = OP_PART_SIZE_2;                                     
  #else                                                                 
    int part_size = OP_part_size;                                       
  #endif                                                                
     
  //get offsets
  int core_len = core_num[set->index];
  int noncore_len = set->size + OP_import_exec_list[set->index]->size - core_len;
  
  double cpu_t1, cpu_t2, wall_t1, wall_t2;    

  //process core set
  if (core_len>0) {
    if (OP_latency_sets[set->index].core_set == NULL) {
      op_set core_set = (op_set)malloc(sizeof(op_set_core));
      core_set->index = set->index;
      core_set->name = set->name;
      core_set->size = core_len;
      core_set->exec_size = 0;
      core_set->nonexec_size = 0;
      OP_latency_sets[set->index].core_set = core_set;
    }
    Plan = op_plan_get_offset(name, OP_latency_sets[set->index].core_set,
        0, part_size, nargs, args, ninds, inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // set number of threads
    #ifdef _OPENMP
      int nthreads = omp_get_max_threads();
    #else
      int nthreads = 1;
    #endif

    // execute plan
    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
            (double *)arg0.data, Plan->ind_maps[0],
            (double *)arg2.data, Plan->ind_maps[1],
            (double *)arg4.data, Plan->ind_maps[2],
            (double *)arg6.data, Plan->ind_maps[3],
            Plan->loc_maps[0],
            Plan->loc_maps[1],
            Plan->loc_maps[2],
            Plan->loc_maps[3],
            Plan->loc_maps[4],
            Plan->loc_maps[5],
            Plan->loc_maps[6],
            Plan->loc_maps[7],
            Plan->ind_sizes,
            Plan->ind_offs,
            block_offset,
            Plan->blkmap,
            Plan->offset,
            Plan->nelems,
            Plan->nthrcol,
            Plan->thrcol);

      block_offset += nblocks;
    }
    op_timers_core(&cpu_t2, &wall_t2);
    OP_kernels[2].time      += wall_t2 - wall_t1;
    OP_kernels[2].transfer  += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  if(ninds > 0) //indirect loop
  {
      for(int i = 0; i<nargs; i++)
      {
      	  if(args[i].argtype == OP_ARG_DAT)
      	  {
      	      if(sent[i] == 1)wait_all(args[i]);
      	  }
      }
  }
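  // all halo exchanges issued above have now completed, so the non-core elements
  // (which may read imported halo data) can be processed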

  if (noncore_len>0) {
    if (OP_latency_sets[set->index].noncore_set == NULL) {
      op_set noncore_set = (op_set)malloc(sizeof(op_set_core));
      noncore_set->size = noncore_len;
      noncore_set->name = set->name;
      noncore_set->index = set->index;
      noncore_set->exec_size = 0;
      noncore_set->nonexec_size = 0;
      OP_latency_sets[set->index].noncore_set = noncore_set;
    }
    Plan = op_plan_get_offset(name, OP_latency_sets[set->index].noncore_set, core_len,
        part_size, nargs, args, ninds, inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // set number of threads
    #ifdef _OPENMP
      int nthreads = omp_get_max_threads();
    #else
      int nthreads = 1;
    #endif

    // execute plan
    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
            (double *)arg0.data, Plan->ind_maps[0],
            (double *)arg2.data, Plan->ind_maps[1],
            (double *)arg4.data, Plan->ind_maps[2],
            (double *)arg6.data, Plan->ind_maps[3],
            Plan->loc_maps[0],
            Plan->loc_maps[1],
            Plan->loc_maps[2],
            Plan->loc_maps[3],
            Plan->loc_maps[4],
            Plan->loc_maps[5],
            Plan->loc_maps[6],
            Plan->loc_maps[7],
            Plan->ind_sizes,
            Plan->ind_offs,
            block_offset,
            Plan->blkmap,
            Plan->offset,
            Plan->nelems,
            Plan->nthrcol,
            Plan->thrcol);

      block_offset += nblocks;
    }
    op_timers_core(&cpu_t2, &wall_t2);
    OP_kernels[2].time      += wall_t2 - wall_t1;
    OP_kernels[2].transfer  += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }
  
  
  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i<nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  op_timing_realloc(2);
  OP_kernels[2].name      = name;
  OP_kernels[2].count    += 1;
}
Code example #24
void op_par_loop_dotR(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  double *arg1h = (double *)arg1.data;

  int    nargs   = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  dotR\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(6);
  OP_kernels[6].name      = name;
  OP_kernels[6].count    += 1;

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction

  double arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_double;
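  // each thread gets its own 64-double slot in arg1_l, presumably so the
  // per-thread partial sums sit on different cache lines and avoid false sharing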

  if (set->size >0) {

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    #pragma omp parallel for
    for (int thr=0; thr<nthreads; thr++) {
      int start  = (set->size* thr   )/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      op_x86_dotR( (double *) arg0.data,
                   arg1_l + thr*64,
                   start, finish );
    }
  }


  // combine reduction data

  for (int thr=0; thr<nthreads; thr++)
    for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];

  op_mpi_reduce(&arg1,arg1h);

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[6].time     += wall_t2 - wall_t1;
  OP_kernels[6].transfer += (float)set->size * arg0.size;
}
Code example #25
op_plan *op_plan_core(char const *name, op_set set, int part_size, int nargs,
                      op_arg *args, int ninds, int *inds, int staging) {
  // set exec length
  int exec_length = set->size;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_READ) {
      exec_length += set->exec_size;
      break;
    }
  }
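  // if any indirect argument is written (acc != OP_READ), the loop must also be
  // executed redundantly over the imported "exec" halo elements, so they are
  // included in the plan's execution length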

  /* first look for an existing execution plan */

  int ip = 0, match = 0;

  while (match == 0 && ip < OP_plan_index) {
    if ((strcmp(name, OP_plans[ip].name) == 0) && (set == OP_plans[ip].set) &&
        (nargs == OP_plans[ip].nargs) && (ninds == OP_plans[ip].ninds) &&
        (part_size == OP_plans[ip].part_size)) {
      match = 1;
      for (int m = 0; m < nargs; m++) {
        if (args[m].dat != NULL && OP_plans[ip].dats[m] != NULL)
          match = match && (args[m].dat->size == OP_plans[ip].dats[m]->size) &&
                  (args[m].dat->dim == OP_plans[ip].dats[m]->dim) &&
                  (args[m].map == OP_plans[ip].maps[m]) &&
                  (args[m].idx == OP_plans[ip].idxs[m]) &&
                  (args[m].acc == OP_plans[ip].accs[m]);
        else
          match = match && (args[m].dat == OP_plans[ip].dats[m]) &&
                  (args[m].map == OP_plans[ip].maps[m]) &&
                  (args[m].idx == OP_plans[ip].idxs[m]) &&
                  (args[m].acc == OP_plans[ip].accs[m]);
      }
    }
    ip++;
  }

  if (match) {
    ip--;
    if (OP_diags > 3)
      printf(" old execution plan #%d\n", ip);
    OP_plans[ip].count++;
    return &(OP_plans[ip]);
  } else {
    if (OP_diags > 1)
      printf(" new execution plan #%d for kernel %s\n", ip, name);
  }
  double wall_t1, wall_t2, cpu_t1, cpu_t2;
  op_timers_core(&cpu_t1, &wall_t1);
  /* work out worst case shared memory requirement per element */

  int halo_exchange = 0;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_WRITE &&
        args[i].acc != OP_INC) {
      halo_exchange = 1;
      break;
    }
  }

  int maxbytes = 0;
  for (int m = 0; m < nargs; m++) {
    if (args[m].opt && inds[m] >= 0) {
      if ((staging == OP_STAGE_INC && args[m].acc == OP_INC) ||
          (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))
        maxbytes += args[m].dat->size;
    }
  }
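  // maxbytes is the worst-case number of bytes staged in shared memory per set
  // element, summed over the dats that the chosen staging mode keeps on-chip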

  /* set blocksize and number of blocks; adaptive size based on 48kB of shared
   * memory */

  int bsize = part_size; // blocksize
  if (bsize == 0 && maxbytes > 0)
    bsize = MAX((24 * 1024 / (64 * maxbytes)) * 64,
                256); // 48kB exactly is too much, make it 24
  else if (bsize == 0 && maxbytes == 0)
    bsize = 256;

  // If we do 1 level of coloring, do it in one go
  if (staging == OP_COLOR2)
    bsize = exec_length;

  int nblocks = 0;

  int indirect_reduce = 0;
  for (int m = 0; m < nargs; m++) {
    indirect_reduce |=
        (args[m].acc != OP_READ && args[m].argtype == OP_ARG_GBL);
  }
  indirect_reduce &= (ninds > 0);
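  // an indirect loop that also performs a global reduction needs to know how many
  // block colors cover only owned elements (ncolors_owned, set further down), so
  // the reduction can be completed before the redundant halo blocks are executed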

  /* Work out indirection arrays for OP_INCs */
  int ninds_staged = 0; // number of distinct (unique dat) indirect incs
  int *inds_staged = (int *)op_malloc(nargs * sizeof(int));
  int *inds_to_inds_staged = (int *)op_malloc(ninds * sizeof(int));

  for (int i = 0; i < nargs; i++)
    inds_staged[i] = -1;
  for (int i = 0; i < ninds; i++)
    inds_to_inds_staged[i] = -1;
  for (int i = 0; i < nargs; i++) {
    if (inds[i] >= 0 &&
        ((staging == OP_STAGE_INC && args[i].acc == OP_INC) ||
         (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))) {
      if (inds_to_inds_staged[inds[i]] == -1) {
        inds_to_inds_staged[inds[i]] = ninds_staged;
        inds_staged[i] = ninds_staged;
        ninds_staged++;
      } else {
        inds_staged[i] = inds_to_inds_staged[inds[i]];
      }
    }
  }
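  // inds_staged renumbers only the indirect arguments whose data are actually
  // staged under the chosen mode; arguments sharing the same indirection
  // (same inds[] entry) share one staged index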

  int *invinds_staged = (int *)op_malloc(ninds_staged * sizeof(int));
  for (int i = 0; i < ninds_staged; i++)
    invinds_staged[i] = -1;
  for (int i = 0; i < nargs; i++)
    if (inds[i] >= 0 &&
        ((staging == OP_STAGE_INC && args[i].acc == OP_INC) ||
         (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE)) &&
        invinds_staged[inds_staged[i]] == -1)
      invinds_staged[inds_staged[i]] = i;

  int prev_offset = 0;
  int next_offset = 0;

  while (next_offset < exec_length) {
    prev_offset = next_offset;
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
               indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length &&
               prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;
    }
    nblocks++;
  }
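  // block boundaries are forced to coincide with the end of the core section
  // (and, when an indirect reduction is present, with the end of the owned
  // section) so that whole blocks line up with MPI overlap and reduction handling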

  // If we do 1 level of coloring, we have a single "block"
  if (staging == OP_COLOR2) {
    nblocks = 1;
    prev_offset = 0;
    next_offset = exec_length;
  };

  /* enlarge OP_plans array if needed */

  if (ip == OP_plan_max) {
    // printf("allocating more memory for OP_plans %d\n", OP_plan_max);
    OP_plan_max += 10;
    OP_plans = (op_plan *)op_realloc(OP_plans, OP_plan_max * sizeof(op_plan));
    if (OP_plans == NULL) {
      printf(" op_plan error -- error reallocating memory for OP_plans\n");
      exit(-1);
    }
  }

  /* allocate memory for new execution plan and store input arguments */

  OP_plans[ip].dats = (op_dat *)op_malloc(nargs * sizeof(op_dat));
  OP_plans[ip].idxs = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].optflags = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].maps = (op_map *)op_malloc(nargs * sizeof(op_map));
  OP_plans[ip].accs = (op_access *)op_malloc(nargs * sizeof(op_access));
  OP_plans[ip].inds_staged =
      (int *)op_malloc(ninds_staged * sizeof(int));

  OP_plans[ip].nthrcol = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].thrcol = (int *)op_malloc(exec_length * sizeof(int));
  OP_plans[ip].col_reord = (int *)op_malloc((exec_length + 16) * sizeof(int));
  OP_plans[ip].col_offsets = NULL;
  OP_plans[ip].offset = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ind_maps = (int **)op_malloc(ninds_staged * sizeof(int *));
  OP_plans[ip].ind_offs =
      (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].ind_sizes =
      (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].nindirect = (int *)op_calloc(ninds, sizeof(int));
  OP_plans[ip].loc_maps = (short **)op_malloc(nargs * sizeof(short *));
  OP_plans[ip].nelems = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ncolblk =
      (int *)op_calloc(exec_length, sizeof(int)); /* max possibly needed */
  OP_plans[ip].blkmap = (int *)op_calloc(nblocks, sizeof(int));

  int *offsets = (int *)op_malloc((ninds_staged + 1) * sizeof(int));
  offsets[0] = 0;
  for (int m = 0; m < ninds_staged; m++) {
    int count = 0;
    for (int m2 = 0; m2 < nargs; m2++)
      if (inds_staged[m2] == m)
        count++;
    offsets[m + 1] = offsets[m] + count;
  }
  OP_plans[ip].ind_map =
      (int *)op_malloc(offsets[ninds_staged] * exec_length * sizeof(int));
  for (int m = 0; m < ninds_staged; m++) {
    OP_plans[ip].ind_maps[m] = &OP_plans[ip].ind_map[exec_length * offsets[m]];
  }
  free(offsets);

  int counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0)
      counter++;
    else
      OP_plans[ip].loc_maps[m] = NULL;

    OP_plans[ip].dats[m] = args[m].dat;
    OP_plans[ip].idxs[m] = args[m].idx;
    OP_plans[ip].optflags[m] = args[m].opt;
    OP_plans[ip].maps[m] = args[m].map;
    OP_plans[ip].accs[m] = args[m].acc;
  }

  OP_plans[ip].loc_map =
      (short *)op_malloc(counter * exec_length * sizeof(short));
  counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0) {
      OP_plans[ip].loc_maps[m] = &OP_plans[ip].loc_map[exec_length * (counter)];
      counter++;
    }
  }

  OP_plans[ip].name = name;
  OP_plans[ip].set = set;
  OP_plans[ip].nargs = nargs;
  OP_plans[ip].ninds = ninds;
  OP_plans[ip].ninds_staged = ninds_staged;
  OP_plans[ip].part_size = part_size;
  OP_plans[ip].nblocks = nblocks;
  OP_plans[ip].ncolors_core = 0;
  OP_plans[ip].ncolors_owned = 0;
  OP_plans[ip].count = 1;
  OP_plans[ip].inds_staged = inds_staged;

  OP_plan_index++;

  /* define aliases */

  op_dat *dats = OP_plans[ip].dats;
  int *idxs = OP_plans[ip].idxs;
  op_map *maps = OP_plans[ip].maps;
  op_access *accs = OP_plans[ip].accs;

  int *offset = OP_plans[ip].offset;
  int *nelems = OP_plans[ip].nelems;
  int **ind_maps = OP_plans[ip].ind_maps;
  int *ind_offs = OP_plans[ip].ind_offs;
  int *ind_sizes = OP_plans[ip].ind_sizes;
  int *nindirect = OP_plans[ip].nindirect;

  /* allocate working arrays */
  uint **work;
  work = (uint **)op_malloc(ninds * sizeof(uint *));

  for (int m = 0; m < ninds; m++) {
    int m2 = 0;
    while (inds[m2] != m)
      m2++;
    if (args[m2].opt == 0) {
      work[m] = NULL;
      continue;
    }

    int to_size = (maps[m2]->to)->exec_size + (maps[m2]->to)->nonexec_size +
                  (maps[m2]->to)->size;
    work[m] = (uint *)op_malloc(to_size * sizeof(uint));
  }
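  /* work[m] is scratch indexed by the elements of the mapping's target set; it is
     reused below both for the local renumbering of staged elements and as a
     per-element color bit mask */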

  int *work2;
  work2 =
      (int *)op_malloc(nargs * bsize * sizeof(int)); /* max possibly needed */

  /* process set one block at a time */

  float total_colors = 0;

  prev_offset = 0;
  next_offset = 0;
  for (int b = 0; b < nblocks; b++) {
    prev_offset = next_offset;
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
               indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length &&
               prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;
    }

    if (staging == OP_COLOR2) {
      prev_offset = 0;
      next_offset = exec_length;
    };
    int bs = next_offset - prev_offset;

    offset[b] = prev_offset; /* offset for block */
    nelems[b] = bs;          /* size of block */

    /* loop over indirection sets */
    for (int m = 0; m < ninds; m++) {
      int m2 = 0;
      while (inds[m2] != m)
        m2++;
      int m3 = inds_staged[m2];
      if (m3 < 0)
        continue;
      if (args[m2].opt == 0) {
        if (b == 0) {
          ind_offs[m3 + b * ninds_staged] = 0;
          ind_sizes[m3 + b * ninds_staged] = 0;
        } else {
          ind_offs[m3 + b * ninds_staged] =
              ind_offs[m3 + (b - 1) * ninds_staged];
          ind_sizes[m3 + b * ninds_staged] = 0;
        }
        continue;
      }
      /* build the list of elements indirectly referenced in this block */

      int ne = 0; /* number of elements */
      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            work2[ne++] = maps[m2]->map[idxs[m2] + e * maps[m2]->dim];
        }
      }

      /* sort them, then eliminate duplicates */

      qsort(work2, ne, sizeof(int), comp);

      int nde = 0;
      int p = 0;
      while (p < ne) {
        work2[nde] = work2[p];
        while (p < ne && work2[p] == work2[nde])
          p++;
        nde++;
      }
      ne = nde; /* number of distinct elements */

      /* if (OP_diags > 5) {
           printf(" indirection set %d: ", m);
           for (int e = 0; e < ne; e++) printf(" %d", work2[e]);
           printf(" \n");
         } */

      /* store mapping and renumbered mappings in execution plan */

      for (int e = 0; e < ne; e++) {
        ind_maps[m3][nindirect[m]++] = work2[e];
        work[m][work2[e]] = e; // inverse mapping
      }
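      /* ind_maps[m3] accumulates, block by block, the global indices of the
         staged elements, while work[m] maps each global index back to its local
         position so the per-element loc_maps can be stored as shorts */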

      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            OP_plans[ip].loc_maps[m2][e] =
                (short)(work[m][maps[m2]->map[idxs[m2] + e * maps[m2]->dim]]);
        }
      }

      if (b == 0) {
        ind_offs[m3 + b * ninds_staged] = 0;
        ind_sizes[m3 + b * ninds_staged] = nindirect[m];
      } else {
        ind_offs[m3 + b * ninds_staged] =
            ind_offs[m3 + (b - 1) * ninds_staged] +
            ind_sizes[m3 + (b - 1) * ninds_staged];
        ind_sizes[m3 + b * ninds_staged] =
            nindirect[m] - ind_offs[m3 + b * ninds_staged];
      }
    }

    /* now colour main set elements */

    for (int e = prev_offset; e < next_offset; e++)
      OP_plans[ip].thrcol[e] = -1;

    int repeat = 1;
    int ncolor = 0;
    int ncolors = 0;

    while (repeat) {
      repeat = 0;

      for (int m = 0; m < nargs; m++) {
        if (inds[m] >= 0 && args[m].opt)
          for (int e = prev_offset; e < next_offset; e++)
            work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] =
                0; /* zero out color array */
      }

      for (int e = prev_offset; e < next_offset; e++) {
        if (OP_plans[ip].thrcol[e] == -1) {
          int mask = 0;
          if (staging == OP_COLOR2 && halo_exchange && e >= set->core_size &&
              ncolor == 0)
            mask = 1;
          for (int m = 0; m < nargs; m++)
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
                args[m].opt)
              mask |=
                  work[inds[m]]
                      [maps[m]->map[idxs[m] +
                                    e * maps[m]->dim]]; /* set bits of mask */

          int color = ffs(~mask) - 1; /* find first bit not set */
          if (color == -1) {          /* run out of colors on this pass */
            repeat = 1;
          } else {
            OP_plans[ip].thrcol[e] = ncolor + color;
            mask = 1 << color;
            ncolors = MAX(ncolors, ncolor + color + 1);

            for (int m = 0; m < nargs; m++)
              if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
                  args[m].opt)
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |=
                    mask; /* set color bit */
          }
        }
      }

      ncolor += 32; /* increment base level */
    }
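    /* every element of the block now has a thread color: the greedy pass above
       handles 32 colors at a time using bit masks, ffs(~mask) picks the lowest
       color not used by any conflicting element, and elements left uncolored
       are retried with the base color raised by 32 */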

    OP_plans[ip].nthrcol[b] =
        ncolors; /* number of thread colors in this block */
    total_colors += ncolors;

    // if(ncolors>1) printf(" number of colors in this block = %d \n",ncolors);
  }

  /* create element permutation by color */
  if (staging == OP_STAGE_PERMUTE || staging == OP_COLOR2) {
    int size_of_col_offsets = 0;
    for (int b = 0; b < nblocks; b++) {
      size_of_col_offsets += OP_plans[ip].nthrcol[b] + 1;
    }
    // allocate
    OP_plans[ip].col_offsets = (int **)op_malloc(nblocks * sizeof(int *));
    int *col_offsets = (int *)op_malloc(size_of_col_offsets * sizeof(int));

    size_of_col_offsets = 0;
    op_keyvalue *kv = (op_keyvalue *)op_malloc(bsize * sizeof(op_keyvalue));
    for (int b = 0; b < nblocks; b++) {
      int ncolor = OP_plans[ip].nthrcol[b];
      for (int e = 0; e < nelems[b]; e++) {
        kv[e].key = OP_plans[ip].thrcol[offset[b] + e];
        kv[e].value = e;
      }
      qsort(kv, nelems[b], sizeof(op_keyvalue), comp2);
      OP_plans[ip].col_offsets[b] = col_offsets + size_of_col_offsets;
      OP_plans[ip].col_offsets[b][0] = 0;
      size_of_col_offsets += (ncolor + 1);

      // Set up permutation and pointers to beginning of each color
      ncolor = 0;
      for (int e = 0; e < nelems[b]; e++) {
        OP_plans[ip].thrcol[offset[b] + e] = kv[e].key;
        OP_plans[ip].col_reord[offset[b] + e] = kv[e].value;
        if (e > 0)
          if (kv[e].key > kv[e - 1].key) {
            ncolor++;
            OP_plans[ip].col_offsets[b][ncolor] = e;
          }
      }
      OP_plans[ip].col_offsets[b][ncolor + 1] = nelems[b];
    }
    for (int i = exec_length; i < exec_length + 16; i++)
      OP_plans[ip].col_reord[i] = 0;
  }
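  /* col_reord now permutes the elements of each block so that equal thread colors
     are contiguous, and col_offsets[b][c] gives the start of color c within block
     b; each color can then be executed as an independent parallel range */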

  /* color the blocks, after initialising block colors to -1 */

  int *blk_col;

  blk_col = (int *)op_malloc(nblocks * sizeof(int));
  for (int b = 0; b < nblocks; b++)
    blk_col[b] = -1;

  int repeat = 1;
  int ncolor = 0;
  int ncolors = 0;

  while (repeat) {
    repeat = 0;

    for (int m = 0; m < nargs; m++) {
      if (inds[m] >= 0 && args[m].opt) {
        int to_size = (maps[m]->to)->exec_size + (maps[m]->to)->nonexec_size +
                      (maps[m]->to)->size;
        for (int e = 0; e < to_size; e++)
          work[inds[m]][e] = 0; // zero out color arrays
      }
    }
    prev_offset = 0;
    next_offset = 0;
    for (int b = 0; b < nblocks; b++) {
      prev_offset = next_offset;

      if (prev_offset + bsize >= set->core_size &&
          prev_offset < set->core_size) {
        next_offset = set->core_size;
      } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
                 indirect_reduce) {
        next_offset = set->size;
      } else if (prev_offset + bsize >= exec_length &&
                 prev_offset < exec_length) {
        next_offset = exec_length;
      } else {
        next_offset = prev_offset + bsize;
      }
      if (blk_col[b] == -1) { // color not yet assigned to block
        uint mask = 0;
        if (next_offset > set->core_size) { // should not use block colors from
                                            // the core set when doing the
                                            // non_core ones
          if (prev_offset <= set->core_size)
            OP_plans[ip].ncolors_core = ncolors;
          for (int shifter = 0; shifter < OP_plans[ip].ncolors_core; shifter++)
            mask |= 1 << shifter;
          if (prev_offset == set->size && indirect_reduce)
            OP_plans[ip].ncolors_owned = ncolors;
          for (int shifter = OP_plans[ip].ncolors_core;
               indirect_reduce && shifter < OP_plans[ip].ncolors_owned;
               shifter++)
            mask |= 1 << shifter;
        }

        for (int m = 0; m < nargs; m++) {
          if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
              args[m].opt)
            for (int e = prev_offset; e < next_offset; e++)
              mask |= work[inds[m]]
                          [maps[m]->map[idxs[m] + e * maps[m]->dim]]; // set
                                                                      // bits of
                                                                      // mask
        }

        int color = ffs(~mask) - 1; // find first bit not set
        if (color == -1) {          // run out of colors on this pass
          repeat = 1;
        } else {
          blk_col[b] = ncolor + color;
          mask = 1 << color;
          ncolors = MAX(ncolors, ncolor + color + 1);

          for (int m = 0; m < nargs; m++) {
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
                args[m].opt)
              for (int e = prev_offset; e < next_offset; e++)
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |= mask;
          }
        }
      }
    }

    ncolor += 32; // increment base level
  }

  /* store block mapping and number of blocks per color */

  if (indirect_reduce && OP_plans[ip].ncolors_owned == 0)
    OP_plans[ip].ncolors_owned =
        ncolors; // no MPI, so get the reduction arrays after everything is done
  OP_plans[ip].ncolors = ncolors;
  if (staging == OP_COLOR2)
    OP_plans[ip].ncolors = OP_plans[ip].nthrcol[0];

  /*for(int col = 0; col = OP_plans[ip].ncolors;col++) //should initialize to
    zero because op_calloc returns garbage!!
    {
    OP_plans[ip].ncolblk[col] = 0;
    }*/

  for (int b = 0; b < nblocks; b++)
    OP_plans[ip].ncolblk[blk_col[b]]++; // number of blocks of each color

  for (int c = 1; c < ncolors; c++)
    OP_plans[ip].ncolblk[c] += OP_plans[ip].ncolblk[c - 1]; // cumsum

  for (int c = 0; c < ncolors; c++)
    work2[c] = 0;

  for (int b = 0; b < nblocks; b++) {
    int c = blk_col[b];
    int b2 = work2[c]; // number of preceding blocks of this color
    if (c > 0)
      b2 += OP_plans[ip].ncolblk[c - 1]; // plus previous colors

    OP_plans[ip].blkmap[b2] = b;

    work2[c]++; // increment counter
  }

  for (int c = ncolors - 1; c > 0; c--)
    OP_plans[ip].ncolblk[c] -= OP_plans[ip].ncolblk[c - 1]; // undo cumsum
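  /* ncolblk[c] is now the number of blocks of color c and blkmap lists the blocks
     grouped by color, which is the order in which the generated host stubs launch
     them */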

  /* reorder blocks by color? */

  /* work out shared memory requirements */
  OP_plans[ip].nsharedCol = (int *)op_malloc(ncolors * sizeof(int));
  float total_shared = 0;
  for (int col = 0; col < ncolors; col++) {
    OP_plans[ip].nsharedCol[col] = 0;
    for (int b = 0; b < nblocks; b++) {
      if (blk_col[b] == col) {
        int nbytes = 0;
        for (int m = 0; m < ninds_staged; m++) {
          int m2 = 0;
          while (inds_staged[m2] != m)
            m2++;
          if (args[m2].opt == 0)
            continue;

          nbytes +=
              ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
        }
        OP_plans[ip].nsharedCol[col] =
            MAX(OP_plans[ip].nsharedCol[col], nbytes);
        total_shared += nbytes;
      }
    }
  }

  OP_plans[ip].nshared = 0;
  total_shared = 0;

  for (int b = 0; b < nblocks; b++) {
    int nbytes = 0;
    for (int m = 0; m < ninds_staged; m++) {
      int m2 = 0;
      while (inds_staged[m2] != m)
        m2++;
      if (args[m2].opt == 0)
        continue;

      nbytes += ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
    }
    OP_plans[ip].nshared = MAX(OP_plans[ip].nshared, nbytes);
    total_shared += nbytes;
  }
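  /* nshared is the overall per-block maximum, used when all colors share one
     shared-memory allocation, while nsharedCol above records the per-color maxima */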

  /* work out total bandwidth requirements */

  OP_plans[ip].transfer = 0;
  OP_plans[ip].transfer2 = 0;
  float transfer3 = 0;

  if (staging != OP_COLOR2 && staging != OP_STAGE_INC) {
    for (int b = 0; b < nblocks; b++) {
      for (int m = 0; m < nargs; m++) // for each argument
      {
        if (args[m].opt) {
          if (inds[m] < 0) // if it is directly addressed
          {
            float fac = 2.0f;
            if (accs[m] == OP_READ ||
                accs[m] == OP_WRITE) // if you only read or write it
              fac = 1.0f;
            if (dats[m] != NULL) {
              OP_plans[ip].transfer +=
                  fac * nelems[b] * dats[m]->size; // cost of reading it all
              OP_plans[ip].transfer2 += fac * nelems[b] * dats[m]->size;
              transfer3 += fac * nelems[b] * dats[m]->size;
            }
          } else // if it is indirectly addressed: cost of reading the pointer
                 // to it
          {
            OP_plans[ip].transfer += nelems[b] * sizeof(short);
            OP_plans[ip].transfer2 += nelems[b] * sizeof(short);
            transfer3 += nelems[b] * sizeof(short);
          }
        }
      }
      for (int m = 0; m < ninds; m++) // for each indirect mapping
      {
        int m2 = 0;
        while (inds[m2] != m) // find the first argument that uses this mapping
          m2++;
        if (args[m2].opt == 0)
          continue;
        float fac = 2.0f;
        if (accs[m2] == OP_READ || accs[m2] == OP_WRITE) // only read it
          fac = 1.0f;
        if (staging == OP_STAGE_INC && accs[m2] != OP_INC) {
          OP_plans[ip].transfer += 1;
          OP_plans[ip].transfer2 += 1;
          continue;
        }
        OP_plans[ip].transfer +=
            fac * ind_sizes[m + b * ninds] *
            dats[m2]->size; // simply read all data one by one

        /* work out how many cache lines are used by indirect addressing */

        int i_map, l_new, l_old;
        int e0 = ind_offs[m + b * ninds];       // where it starts
        int e1 = e0 + ind_sizes[m + b * ninds]; // where it ends

        l_old = -1;

        for (int e = e0; e < e1;
             e++) // iterate through every indirectly accessed data element
        {
          i_map = ind_maps[m][e]; // the pointer to the data element
          l_new = (i_map * dats[m2]->size) /
                  OP_cache_line_size; // which cache line it is on (full size,
                                      // dim*sizeof(type))
          if (l_new > l_old) // if it is on a further cache line (that is not
                             // yet loaded, - i_map is ordered)
            OP_plans[ip].transfer2 +=
                fac * OP_cache_line_size; // load the cache line
          l_old = l_new;
          l_new = ((i_map + 1) * dats[m2]->size - 1) /
                  OP_cache_line_size; // the last byte of the data
          OP_plans[ip].transfer2 += fac * (l_new - l_old) *
                                    OP_cache_line_size; // again, if not loaded,
                                                        // load it (can be
                                                        // multiple cache lines)
          l_old = l_new;
        }

        l_old = -1;

        for (int e = e0; e < e1; e++) {
          i_map = ind_maps[m][e]; // pointer to the data element
          l_new = (i_map * dats[m2]->size) /
                  (dats[m2]->dim * OP_cache_line_size); // which cache line the
                                                        // first dimension of
                                                        // the data is on
          if (l_new > l_old)
            transfer3 +=
                fac * dats[m2]->dim *
                OP_cache_line_size; // if not loaded yet, load all cache lines
          l_old = l_new;
          l_new =
              ((i_map + 1) * dats[m2]->size - 1) /
              (dats[m2]->dim * OP_cache_line_size); // primitive type's last byte
          transfer3 += fac * (l_new - l_old) * dats[m2]->dim *
                       OP_cache_line_size; // load it
          l_old = l_new;
        }

        /* also include mappings to load/store data */

        fac = 1.0f;
        if (accs[m2] == OP_RW)
          fac = 2.0f;
        OP_plans[ip].transfer += fac * ind_sizes[m + b * ninds] * sizeof(int);
        OP_plans[ip].transfer2 += fac * ind_sizes[m + b * ninds] * sizeof(int);
        transfer3 += fac * ind_sizes[m + b * ninds] * sizeof(int);
      }
    }
  }

  /* print out useful information */

  if (OP_diags > 1) {
    printf(" number of blocks       = %d \n", nblocks);
    printf(" number of block colors = %d \n", OP_plans[ip].ncolors);
    printf(" maximum block size     = %d \n", bsize);
    printf(" average thread colors  = %.2f \n", total_colors / nblocks);
    printf(" shared memory required = ");
    for (int i = 0; i < ncolors - 1; i++)
      printf(" %.2f KB,", OP_plans[ip].nsharedCol[i] / 1024.0f);
    printf(" %.2f KB\n", OP_plans[ip].nsharedCol[ncolors - 1] / 1024.0f);
    printf(" average data reuse     = %.2f \n",
           maxbytes * (exec_length / total_shared));
    printf(" data transfer (used)   = %.2f MB \n",
           OP_plans[ip].transfer / (1024.0f * 1024.0f));
    printf(" data transfer (total)  = %.2f MB \n",
           OP_plans[ip].transfer2 / (1024.0f * 1024.0f));
    printf(" SoA/AoS transfer ratio = %.2f \n\n",
           transfer3 / OP_plans[ip].transfer2);
  }

  /* validate plan info */

  op_plan_check(OP_plans[ip], ninds_staged, inds_staged);

  /* free work arrays */

  for (int m = 0; m < ninds; m++)
    free(work[m]);
  free(work);
  free(work2);
  free(blk_col);
  free(inds_to_inds_staged);
  free(invinds_staged);
  op_timers_core(&cpu_t2, &wall_t2);
  for (int i = 0; i < OP_kern_max; i++) {
    if (strcmp(name, OP_kernels[i].name) == 0) {
      OP_kernels[i].plan_time += wall_t2 - wall_t1;
      break;
    }
  }
  /* return pointer to plan */
  OP_plan_time += wall_t2 - wall_t1;
  return &(OP_plans[ip]);
}
Code example #26
// host stub function
void op_par_loop_update(char const *name, op_set set, op_arg arg0,
                        op_arg arg1) {

  int *arg1h = (int *)arg1.data;
  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection:  update");
  }

  op_mpi_halo_exchanges(set, nargs, args);
// set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction
  int arg1_l[nthreads * 64];
  for (int thr = 0; thr < nthreads; thr++) {
    for (int d = 0; d < 1; d++) {
      arg1_l[d + thr * 64] = ZERO_int;
    }
  }

  if (set->size > 0) {

// execute plan
#pragma omp parallel for
    for (int thr = 0; thr < nthreads; thr++) {
      int start = (set->size * thr) / nthreads;
      int finish = (set->size * (thr + 1)) / nthreads;
      for (int n = start; n < finish; n++) {
        update(&((double *)arg0.data)[4 * n],
               &arg1_l[64 * omp_get_thread_num()]);
      }
    }
  }

  // combine reduction data
  for (int thr = 0; thr < nthreads; thr++) {
    for (int d = 0; d < 1; d++) {
      arg1h[d] += arg1_l[d + thr * 64];
    }
  }
  op_mpi_reduce(&arg1, arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;
  OP_kernels[1].time += wall_t2 - wall_t1;
  OP_kernels[1].transfer += (float)set->size * arg0.size * 2.0f;
}
Code example #27
File: res_calc_acckernel.c Project: OP2/OP2-Common
// host stub function
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9,
  op_arg arg13){

  int nargs = 17;
  op_arg args[17];

  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }

  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }


  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  int  ninds   = 4;
  int  inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);


  int ncolors = 0;

  if (set->size >0) {

    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    double* data8 = (double*)arg8.data_d;
    double *data0 = (double *)arg0.data_d;
    double *data4 = (double *)arg4.data_d;
    double *data9 = (double *)arg9.data_d;
    double *data13 = (double *)arg13.data_d;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;
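    // an OP_COLOR2 plan colors individual elements rather than blocks: col_reord
    // permutes the owned + exec-halo elements by color and col_offsets[0] marks
    // where each color starts, so each color below is offloaded as one
    // independent parallel loop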

    // execute plan
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      #pragma acc parallel loop independent deviceptr(col_reord,map0,data8,data0,data4,data9,data13)
      for ( int e=start; e<end; e++ ){
        int n = col_reord[e];
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];
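        // the device copy of the mapping is stored transposed
        // (index = element + set_size1 * map position), hence the strided reads above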

        const double* arg0_vec[] = {
           &data0[2 * map0idx],
           &data0[2 * map1idx],
           &data0[2 * map2idx],
           &data0[2 * map3idx]};
        const double* arg4_vec[] = {
           &data4[1 * map0idx],
           &data4[1 * map1idx],
           &data4[1 * map2idx],
           &data4[1 * map3idx]};
        double* arg9_vec[] = {
           &data9[1 * map0idx],
           &data9[1 * map1idx],
           &data9[1 * map2idx],
           &data9[1 * map3idx]};
        double* arg13_vec[] = {
           &data13[2 * map0idx],
           &data13[2 * map1idx],
           &data13[2 * map2idx],
           &data13[2 * map3idx]};

        res_calc(
          arg0_vec,
          arg4_vec,
          &data8[n],
          arg9_vec,
          arg13_vec);
      }

    }
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Code example #28
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);
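  // set_size may exceed set->size because owned elements are followed by the
  // imported exec halo; those halo elements are only executed after the wait_all
  // triggered when the loop below reaches the end of the core section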

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      adt_calc(
        &((double*)arg0.data)[2 * map0idx],
        &((double*)arg0.data)[2 * map1idx],
        &((double*)arg0.data)[2 * map2idx],
        &((double*)arg0.data)[2 * map3idx],
        &((double*)arg4.data)[4 * n],
        &((double*)arg5.data)[1 * n]);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
}
Code example #29
File: update_kernel.cpp Project: ioz9/OP2-Common
void op_par_loop_update(char const *name, op_set set,           
  op_arg arg0,                                                  
  op_arg arg1,                                                  
  op_arg arg2,                                                  
  op_arg arg3,                                                  
  op_arg arg4 ){                                                
                                                                
  float *arg4h = (float *)arg4.data;                            
                                                                
  if (OP_diags>2) {                                             
    printf(" kernel routine w/o indirection:  update \n");      
  }                                                             
                                                                
  // initialise timers                                          
                                                                
  double cpu_t1, cpu_t2, wall_t1, wall_t2;                      
  op_timers_core(&cpu_t1, &wall_t1);                                 
                                                                
  // set number of threads                                      
                                                                
#ifdef _OPENMP                                                  
  int nthreads = omp_get_max_threads( );                        
#else                                                           
  int nthreads = 1;                                             
#endif                                                          
                                                                
  // allocate and initialise arrays for global reduction        
                                                                
  float arg4_l[1+64*64];                                        
  for (int thr=0; thr<nthreads; thr++)                          
    for (int d=0; d<1; d++) arg4_l[d+thr*64]=ZERO_float;        
                                                                
  // execute plan                                               
                                                                
#pragma omp parallel for                                        
  for (int thr=0; thr<nthreads; thr++) {                        
    int start  = (set->size* thr   )/nthreads;                  
    int finish = (set->size*(thr+1))/nthreads;                  
    op_x86_update( (float *) arg0.data,                         
                   (float *) arg1.data,                         
                   (float *) arg2.data,                         
                   (float *) arg3.data,                         
                   arg4_l + thr*64,                             
                   start, finish );                             
  }                                                             
                                                                
  // combine reduction data                                     
                                                                
  for (int thr=0; thr<nthreads; thr++)                          
    for(int d=0; d<1; d++) arg4h[d] += arg4_l[d+thr*64];        
                                                                
  // update kernel record                                       
                                                                
  op_timers_core(&cpu_t2, &wall_t2);                                 
  op_timing_realloc(4);                                         
  OP_kernels[4].name      = name;                               
  OP_kernels[4].count    += 1;                                  
  OP_kernels[4].time     += wall_t2 - wall_t1;                  
  OP_kernels[4].transfer += (float)set->size * arg0.size;       
  OP_kernels[4].transfer += (float)set->size * arg1.size;       
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;       
}