// host stub function
void op_par_loop_bres_calc_cpu(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);

  int  ninds   = 4;
  int  inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map0idx = arg0.map_data[n * arg0.map->dim + 0];
          int map1idx = arg0.map_data[n * arg0.map->dim + 1];
          int map2idx = arg2.map_data[n * arg2.map->dim + 0];

          bres_calc(
            &((double*)arg0.data)[2 * map0idx],
            &((double*)arg0.data)[2 * map1idx],
            &((double*)arg2.data)[4 * map2idx],
            &((double*)arg3.data)[1 * map2idx],
            &((double*)arg4.data)[4 * map2idx],
            &((int*)arg5.data)[1 * n]);
        }
      }

      block_offset += nblocks;
    }
    OP_kernels[3].transfer  += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;
  OP_kernels[3].time     += wall_t2 - wall_t1;
}
Exemplo n.º 2
0
void op_x86_bres_calc(
  int    blockIdx,
  float *ind_arg0,
  float *ind_arg1,
  float *ind_arg2,
  float *ind_arg3,
  int   *ind_map,
  short *arg_map,
  int *arg5,
  int   *ind_arg_sizes,
  int   *ind_arg_offs,
  int    block_offset,
  int   *blkmap,
  int   *offset,
  int   *nelems,
  int   *ncolors,
  int   *colors,
  int   set_size) {

  float arg4_l[4];
  float *arg0_vec[2];

  int   *ind_arg0_map, ind_arg0_size;
  int   *ind_arg1_map, ind_arg1_size;
  int   *ind_arg2_map, ind_arg2_size;
  int   *ind_arg3_map, ind_arg3_size;
  float *ind_arg0_s;
  float *ind_arg1_s;
  float *ind_arg2_s;
  float *ind_arg3_s;
  int    nelem, offset_b;

  char shared[128000];

  if (0==0) {

    // get sizes and shift pointers and direct-mapped data

    int blockId = blkmap[blockIdx + block_offset];
    nelem    = nelems[blockId];
    offset_b = offset[blockId];

    ind_arg0_size = ind_arg_sizes[0+blockId*4];
    ind_arg1_size = ind_arg_sizes[1+blockId*4];
    ind_arg2_size = ind_arg_sizes[2+blockId*4];
    ind_arg3_size = ind_arg_sizes[3+blockId*4];

    ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*4];
    ind_arg1_map = &ind_map[2*set_size] + ind_arg_offs[1+blockId*4];
    ind_arg2_map = &ind_map[3*set_size] + ind_arg_offs[2+blockId*4];
    ind_arg3_map = &ind_map[4*set_size] + ind_arg_offs[3+blockId*4];

    // set shared memory pointers

    int nbytes = 0;
    ind_arg0_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg0_size*sizeof(float)*2);
    ind_arg1_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg1_size*sizeof(float)*4);
    ind_arg2_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg2_size*sizeof(float)*1);
    ind_arg3_s = (float *) &shared[nbytes];
  }

  // copy indirect datasets into shared memory or zero increment

  for (int n=0; n<ind_arg0_size; n++)
    for (int d=0; d<2; d++)
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];

  for (int n=0; n<ind_arg1_size; n++)
    for (int d=0; d<4; d++)
      ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4];

  for (int n=0; n<ind_arg2_size; n++)
    for (int d=0; d<1; d++)
      ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1];

  for (int n=0; n<ind_arg3_size; n++)
    for (int d=0; d<4; d++)
      ind_arg3_s[d+n*4] = ZERO_float;


  // process set elements

  for (int n=0; n<nelem; n++) {

    // initialise local variables

    for (int d=0; d<4; d++)
      arg4_l[d] = ZERO_float;

    arg0_vec[0] = ind_arg0_s+arg_map[0*set_size+n+offset_b]*2;
    arg0_vec[1] = ind_arg0_s+arg_map[1*set_size+n+offset_b]*2;

    // user-supplied kernel call


    bres_calc(  arg0_vec,
                ind_arg1_s+arg_map[2*set_size+n+offset_b]*4,
                ind_arg2_s+arg_map[3*set_size+n+offset_b]*1,
                arg4_l,
                arg5+(n+offset_b)*1 );

    // store local variables

    int arg4_map = arg_map[4*set_size+n+offset_b];

    for (int d=0; d<4; d++)
      ind_arg3_s[d+arg4_map*4] += arg4_l[d];
  }

  // apply pointered write/increment

  for (int n=0; n<ind_arg3_size; n++)
    for (int d=0; d<4; d++)
      ind_arg3[d+ind_arg3_map[n]*4] += ind_arg3_s[d+n*4];

}
Exemplo n.º 3
0
void op_x86_bres_calc(                                                  
  int    blockIdx,                                                      
  float *ind_arg0, int *ind_arg0_maps,                                  
  float *ind_arg1, int *ind_arg1_maps,                                  
  float *ind_arg2, int *ind_arg2_maps,                                  
  float *ind_arg3, int *ind_arg3_maps,                                  
  short *arg0_maps,                                                     
  short *arg1_maps,                                                     
  short *arg2_maps,                                                     
  short *arg3_maps,                                                     
  short *arg4_maps,                                                     
  int *arg5,                                                            
  int   *ind_arg_sizes,                                                 
  int   *ind_arg_offs,                                                  
  int    block_offset,                                                  
  int   *blkmap,                                                        
  int   *offset,                                                        
  int   *nelems,                                                        
  int   *ncolors,                                                       
  int   *colors) {                                                      
                                                                        
  float arg4_l[4];                                                      
                                                                        
  int   *ind_arg0_map, ind_arg0_size;                        
  int   *ind_arg1_map, ind_arg1_size;                        
  int   *ind_arg2_map, ind_arg2_size;                        
  int   *ind_arg3_map, ind_arg3_size;                        
  float *ind_arg0_s;                                         
  float *ind_arg1_s;                                         
  float *ind_arg2_s;                                         
  float *ind_arg3_s;                                         
  int    nelems2, ncolor;                                    
  int    nelem, offset_b;                                    
                                                                        
  char shared[64000];                                        
                                                                        
  if (0==0) {                                                           
                                                                        
    // get sizes and shift pointers and direct-mapped data              
                                                                        
    int blockId = blkmap[blockIdx + block_offset];                      
    nelem    = nelems[blockId];                                         
    offset_b = offset[blockId];                                         
                                                                        
    nelems2  = nelem;                                                   
    ncolor   = ncolors[blockId];                                        
                                                                        
    ind_arg0_size = ind_arg_sizes[0+blockId*4];                         
    ind_arg1_size = ind_arg_sizes[1+blockId*4];                         
    ind_arg2_size = ind_arg_sizes[2+blockId*4];                         
    ind_arg3_size = ind_arg_sizes[3+blockId*4];                         
                                                                        
    ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*4];           
    ind_arg1_map = ind_arg1_maps + ind_arg_offs[1+blockId*4];           
    ind_arg2_map = ind_arg2_maps + ind_arg_offs[2+blockId*4];           
    ind_arg3_map = ind_arg3_maps + ind_arg_offs[3+blockId*4];           
                                                                        
    // set shared memory pointers                                       
                                                                        
    int nbytes = 0;                                                     
    ind_arg0_s = (float *) &shared[nbytes];                             
    nbytes    += ROUND_UP(ind_arg0_size*sizeof(float)*2);               
    ind_arg1_s = (float *) &shared[nbytes];                             
    nbytes    += ROUND_UP(ind_arg1_size*sizeof(float)*4);               
    ind_arg2_s = (float *) &shared[nbytes];                             
    nbytes    += ROUND_UP(ind_arg2_size*sizeof(float)*1);               
    ind_arg3_s = (float *) &shared[nbytes];                             
  }                                                                     
                                                                        
  __syncthreads(); // make sure all of above completed                  
                                                                        
  // copy indirect datasets into shared memory or zero increment        
                                                                        
  for (int n=0; n<ind_arg0_size; n++)                                   
    for (int d=0; d<2; d++)                                             
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];                
                                                                        
  for (int n=0; n<ind_arg1_size; n++)                                   
    for (int d=0; d<4; d++)                                             
      ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4];                
                                                                        
  for (int n=0; n<ind_arg2_size; n++)                                   
    for (int d=0; d<1; d++)                                             
      ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1];                
                                                                        
  for (int n=0; n<ind_arg3_size; n++)                                   
    for (int d=0; d<4; d++)                                             
      ind_arg3_s[d+n*4] = ZERO_float;                                   
                                                                        
  __syncthreads();                                                      
                                                                        
  // process set elements                                               
                                                                        
  for (int n=0; n<nelems2; n++) {                                       
    int col2 = -1;                                                      
                                                                        
    if (n<nelem) {                                                      
                                                                        
      // initialise local variables                                     
                                                                        
      for (int d=0; d<4; d++)                                           
        arg4_l[d] = ZERO_float;                                         
                                                                        
      // user-supplied kernel call                                      
                                                                        
      bres_calc( ind_arg0_s+arg0_maps[n+offset_b]*2,                    
                 ind_arg0_s+arg1_maps[n+offset_b]*2,                    
                 ind_arg1_s+arg2_maps[n+offset_b]*4,                    
                 ind_arg2_s+arg3_maps[n+offset_b]*1,                    
                 arg4_l,                                                
                 arg5+(n+offset_b)*1 );                                 
                                                                        
      col2 = colors[n+offset_b];                                        
    }                                                                   
                                                                        
    // store local variables                                            
                                                                        
    int arg4_map = arg4_maps[n+offset_b];                               
                                                                        
    for (int col=0; col<ncolor; col++) {                                
      if (col2==col) {                                                  
        for (int d=0; d<4; d++)                                         
          ind_arg3_s[d+arg4_map*4] += arg4_l[d];                        
      }                                                                 
      __syncthreads();                                                  
    }                                                                   
                                                                        
  }                                                                     
                                                                        
  // apply pointered write/increment                                    
                                                                        
  for (int n=0; n<ind_arg3_size; n++)                                   
    for (int d=0; d<4; d++)                                             
      ind_arg3[d+ind_arg3_map[n]*4] += ind_arg3_s[d+n*4];               
                                                                        
}                                                                       
Exemplo n.º 4
0
// host stub function
void op_par_loop_bres_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg2.map_data[n * arg2.map->dim + 0];


      bres_calc(
        &((float*)arg0.data)[2 * map0idx],
        &((float*)arg0.data)[2 * map1idx],
        &((float*)arg2.data)[4 * map2idx],
        &((float*)arg3.data)[1 * map2idx],
        &((float*)arg4.data)[4 * map2idx],
        &((int*)arg5.data)[1 * n]);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;
  OP_kernels[3].time     += wall_t2 - wall_t1;
  OP_kernels[3].transfer += (float)set->size * arg0.size;
  OP_kernels[3].transfer += (float)set->size * arg2.size;
  OP_kernels[3].transfer += (float)set->size * arg3.size;
  OP_kernels[3].transfer += (float)set->size * arg4.size * 2.0f;
  OP_kernels[3].transfer += (float)set->size * arg5.size;
  OP_kernels[3].transfer += (float)set->size * arg0.map->dim * 4.0f;
  OP_kernels[3].transfer += (float)set->size * arg2.map->dim * 4.0f;
}
Exemplo n.º 5
0
// host stub function
void op_par_loop_bres_calc(char const *name, op_set set, op_arg arg0,
                           op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4,
                           op_arg arg5) {

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[3].name = name;
  OP_kernels[3].count += 1;

  int ninds = 4;
  int inds[6] = {0, 0, 1, 2, 3, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

// get plan
#ifdef OP_PART_SIZE_3
  int part_size = OP_PART_SIZE_3;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;

  if (set->size > 0) {

    // Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;
    int *map2 = arg2.map_data_d;

    int *data5 = (int *)arg5.data_d;
    double *data0 = (double *)arg0.data_d;
    double *data2 = (double *)arg2.data_d;
    double *data3 = (double *)arg3.data_d;
    double *data4 = (double *)arg4.data_d;

    op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args, ninds,
                                      inds, OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == 1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col + 1];

#pragma acc parallel loop independent deviceptr(col_reord, map0, map2, data5,  \
                                                data0, data2, data3, data4)
      for (int e = start; e < end; e++) {
        int n = col_reord[e];
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map2[n + set_size1 * 0];

        bres_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data2[4 * map2idx],
                  &data3[1 * map2idx], &data4[4 * map2idx], &data5[1 * n]);
      }
    }
    OP_kernels[3].transfer += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].time += wall_t2 - wall_t1;
}