コード例 #1
0
/*
 * Host-side driver for the "bres_calc" parallel loop (OpenMP build).
 *
 * name        kernel name; stored in the OP_kernels[3] record and passed to
 *             op_plan_get.
 * set         set the loop runs over; a plan is only fetched and executed
 *             when set->size > 0.
 * arg0        expanded below into args[0] and args[1]; its data pointer is
 *             handed to the kernel as a float array.
 * arg2..arg4  arguments whose data pointers are handed to the kernel as
 *             float arrays.
 * arg5        argument whose data pointer is handed to the kernel as an
 *             int array.
 *
 * Side effects visible here: updates name, count, transfer, transfer2 and
 * time of OP_kernels[3], and calls op_mpi_halo_exchanges,
 * op_mpi_wait_all (conditionally) and op_mpi_set_dirtybit on the arguments.
 */
void op_par_loop_bres_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5 ){


  int    nargs   = 6;
  op_arg args[6];

  // arg0 occupies two slots: args[0] is arg0 itself with idx forced to 0,
  // args[1] is rebuilt from the same dat/map with idx 1.
  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 2; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "float", OP_READ);
  }
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // One entry per argument; args[0] and args[1] share entry 0.
  // NOTE(review): -1 presumably marks arg5 as directly accessed - confirm
  // against op_plan_get.
  int    ninds   = 4;
  int    inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  // The returned size is forwarded unchanged to the kernel as its last argument.
  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(3);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // Started after op_plan_get, so fetching the plan is not part of the
    // recorded kernel time.
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan

    // Running count of blocks consumed by the colours already processed.
    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      // NOTE(review): op_mpi_wait_all is only reached when the loop hits
      // col == ncolors_core; if ncolors_core >= ncolors (or the set is
      // empty) it is never called here - confirm that is intended.
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
        op_x86_bres_calc( blockIdx,
           (float *)arg0.data,
           (float *)arg2.data,
           (float *)arg3.data,
           (float *)arg4.data,
           Plan->ind_map,
           Plan->loc_map,
           (int *)arg5.data,
           Plan->ind_sizes,
           Plan->ind_offs,
           block_offset,
           Plan->blkmap,
           Plan->offset,
           Plan->nelems,
           Plan->nthrcol,
           Plan->thrcol,
           set_size);
      }

      block_offset += nblocks;
    }

    op_timing_realloc(3);
    OP_kernels[3].transfer  += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;

  } else {
    // Empty set: nothing is executed, but the start time must still be
    // taken. Otherwise wall_t1 stays 0 and the update below would add the
    // raw timer reading instead of an elapsed interval.
    op_timers_core(&cpu_t1, &wall_t1);
  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].time     += wall_t2 - wall_t1;
}
コード例 #2
0
/*
 * Host-side driver for the "bres_calc" parallel loop (OpenMP build).
 *
 * name        kernel name; passed to op_plan_get and stored in the
 *             OP_kernels[3] record.
 * set         set the loop runs over; passed to op_plan_get.
 * arg0..arg5  loop arguments, all six handed to op_plan_get. Only the data
 *             pointers of arg0, arg2, arg3, arg4 (as float arrays) and arg5
 *             (as an int array) are passed to the kernel; arg1 is used only
 *             through the args[] array.
 *
 * Side effects visible here: updates name, count, time, transfer and
 * transfer2 of OP_kernels[3].
 */
void op_par_loop_bres_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5 ){


  int    nargs   = 6;
  op_arg args[6] = {arg0,arg1,arg2,arg3,arg4,arg5};

  // One entry per argument; arg0 and arg1 share entry 0.
  // NOTE(review): -1 presumably marks arg5 as directly accessed - confirm
  // against op_plan_get.
  int    ninds   = 4;
  int    inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc \n");
  }

  // get plan

  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers (taken after op_plan_get, so fetching the plan is not
  // part of the recorded kernel time)

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // execute plan

  // Running count of blocks consumed by the colours already processed.
  int block_offset = 0;

  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
      op_x86_bres_calc( blockIdx,
        (float *)arg0.data, Plan->ind_maps[0],
        (float *)arg2.data, Plan->ind_maps[1],
        (float *)arg3.data, Plan->ind_maps[2],
        (float *)arg4.data, Plan->ind_maps[3],
        Plan->loc_maps[0],
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        Plan->loc_maps[3],
        Plan->loc_maps[4],
        (int *)arg5.data,
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);
    }

    block_offset += nblocks;
  }

  // update kernel record

  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(3);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;
  OP_kernels[3].time     += wall_t2 - wall_t1;
  OP_kernels[3].transfer  += Plan->transfer;
  OP_kernels[3].transfer2 += Plan->transfer2;
}