Пример #1
0
/*
 * Host-side driver for the "res" parallel loop (OpenMP variant).
 *
 * Obtains an execution plan for `set` via op_plan_get(), then walks the
 * plan colour by colour, calling op_x86_res() once per block of the
 * current colour inside an OpenMP parallel-for. Finally it accumulates
 * call count, wall-clock time and the plan's transfer counters into
 * OP_kernels[0].
 *
 * name  label handed to op_plan_get() and stored in OP_kernels[0].name
 * set   set the loop runs over
 * arg0  data passed to the kernel as a plain float array
 * arg1  data passed together with Plan->ind_maps[0]
 * arg2  data passed together with Plan->ind_maps[1]
 * arg3  data passed to the kernel as a plain float array
 */
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  int    nargs   = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  // inds[] tags arg1 with 0 and arg2 with 1; arg0 and arg3 carry -1
  // (presumably "no indirection" -- confirm against op_plan_get)
  int    ninds   = 2;
  int    inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan; a compile-time OP_PART_SIZE_0 overrides the global OP_part_size

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // execute plan: colours run one after another, the blocks of one colour
  // are distributed over the OpenMP threads

  int block_offset = 0;

  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
      op_x86_res( blockIdx,
        (float *)arg1.data, Plan->ind_maps[0],
        (float *)arg2.data, Plan->ind_maps[1],
        (float *)arg0.data,
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        (float *)arg3.data,
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);
    }

    // blocks of the next colour start after the ones just processed
    block_offset += nblocks;
  }

  // update kernel record (slot 0); only the wall-clock delta is stored

  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;
}
Пример #2
0
/*
 * Host-side driver for the "res" parallel loop (MPI-aware OpenMP variant).
 *
 * Starts the halo exchanges for the arguments, then -- if the set is not
 * empty -- obtains an execution plan and walks it colour by colour, calling
 * op_x86_res() once per block of the current colour inside an OpenMP
 * parallel-for. Afterwards it flags the arguments through
 * op_mpi_set_dirtybit() and accumulates call count, wall-clock time and
 * the plan's transfer counters into OP_kernels[0].
 *
 * name  label handed to op_plan_get() and stored in OP_kernels[0].name
 * set   set the loop runs over
 * arg0  data passed to the kernel as a plain float array
 * arg1  data passed to the kernel, used with Plan->ind_map
 * arg2  data passed to the kernel, used with Plan->ind_map
 * arg3  data passed to the kernel as a plain float array
 */
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  int    nargs   = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  // inds[] tags arg1 with 0 and arg2 with 1; arg0 and arg3 carry -1
  // (presumably "no indirection" -- confirm against op_plan_get)
  int    ninds   = 2;
  int    inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan; a compile-time OP_PART_SIZE_0 overrides the global OP_part_size

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  // the value returned here is forwarded to every op_x86_res() call below
  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      // colours before ncolors_core run without waiting for the exchanges;
      // from ncolors_core onwards the exchanged data must have arrived
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
        op_x86_res( blockIdx,
           (float *)arg1.data,
           (float *)arg2.data,
           Plan->ind_map,
           Plan->loc_map,
           (float *)arg0.data,
           (float *)arg3.data,
           Plan->ind_sizes,
           Plan->ind_offs,
           block_offset,
           Plan->blkmap,
           Plan->offset,
           Plan->nelems,
           Plan->nthrcol,
           Plan->thrcol,
           set_size);
      }

      // blocks of the next colour start after the ones just processed
      block_offset += nblocks;
    }

    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;

  }

  // NOTE(review): op_mpi_wait_all() is only reached when the loop above hits
  // col == Plan->ncolors_core. If set->size is 0, or that colour index is
  // never visited, the exchanges started by op_mpi_halo_exchanges() are not
  // waited on in this function -- confirm that this is intended.

  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record (slot 0); only the wall-clock delta is stored.
  // op_timing_realloc(0) is repeated because the guarded branch above may
  // have been skipped.

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
}