// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      adt_calc(
        &((double*)arg0.data)[2 * map0idx],
        &((double*)arg0.data)[2 * map1idx],
        &((double*)arg0.data)[2 * map2idx],
        &((double*)arg0.data)[2 * map3idx],
        &((double*)arg4.data)[4 * n],
        &((double*)arg5.data)[1 * n]);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
}
void op_x86_adt_calc(                                                   
  int    blockIdx,                                                      
  double *ind_arg0, int *ind_arg0_maps,                                  
  short *arg0_maps,                                                     
  short *arg1_maps,                                                     
  short *arg2_maps,                                                     
  short *arg3_maps,                                                     
  double *arg4,                                                          
  double *arg5,                                                          
  int   *ind_arg_sizes,                                                 
  int   *ind_arg_offs,                                                  
  int    block_offset,                                                  
  int   *blkmap,                                                        
  int   *offset,                                                        
  int   *nelems,                                                        
  int   *ncolors,                                                       
  int   *colors) {                                                      
                                                                        
                                                                        
  int   *ind_arg0_map, ind_arg0_size;                        
  double *ind_arg0_s;                                         
  int    nelem, offset_b;                                    
                                                                        
  char shared[64000];                                        
                                                                        
  if (0==0) {                                                           
                                                                        
    // get sizes and shift pointers and direct-mapped data              
                                                                        
    int blockId = blkmap[blockIdx + block_offset];                      
    nelem    = nelems[blockId];                                         
    offset_b = offset[blockId];                                         
                                                                        
    ind_arg0_size = ind_arg_sizes[0+blockId*1];                         
                                                                        
    ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*1];           
                                                                        
    // set shared memory pointers                                       
                                                                        
    int nbytes = 0;                                                     
    ind_arg0_s = (double *) &shared[nbytes];                             
  }                                                                     
                                                                        
  __syncthreads(); // make sure all of above completed                  
                                                                        
  // copy indirect datasets into shared memory or zero increment        
                                                                        
  for (int n=0; n<ind_arg0_size; n++)                                   
    for (int d=0; d<2; d++)                                             
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];                
                                                                        
  __syncthreads();                                                      
                                                                        
  // process set elements                                               
                                                                        
  for (int n=0; n<nelem; n++) {                                         
                                                                        
      // user-supplied kernel call                                      
                                                                        
      adt_calc( ind_arg0_s+arg0_maps[n+offset_b]*2,                     
                ind_arg0_s+arg1_maps[n+offset_b]*2,                     
                ind_arg0_s+arg2_maps[n+offset_b]*2,                     
                ind_arg0_s+arg3_maps[n+offset_b]*2,                     
                arg4+(n+offset_b)*4,                                    
                arg5+(n+offset_b)*1 );                                  
  }                                                                     
                                                                        
}                                                                       
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  int  ninds   = 1;
  int  inds[6] = {0,0,0,0,-1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map0idx = arg0.map_data[n * arg0.map->dim + 0];
          int map1idx = arg0.map_data[n * arg0.map->dim + 1];
          int map2idx = arg0.map_data[n * arg0.map->dim + 2];
          int map3idx = arg0.map_data[n * arg0.map->dim + 3];

          adt_calc(
            &((double*)arg0.data)[2 * map0idx],
            &((double*)arg0.data)[2 * map1idx],
            &((double*)arg0.data)[2 * map2idx],
            &((double*)arg0.data)[2 * map3idx],
            &((double*)arg4.data)[4 * n],
            &((double*)arg5.data)[1 * n]);
        }
      }

      block_offset += nblocks;
    }
    OP_kernels[1].transfer  += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
}
Exemple #4
0
void op_x86_adt_calc(                                                   
  int    blockIdx,                                                      
  float *ind_arg0, int *ind_arg0_maps,                                  
  short *arg0_maps,                                                     
  short *arg1_maps,                                                     
  short *arg2_maps,                                                     
  short *arg3_maps,                                                     
  float *arg4,                                                          
  float *arg5,                                                          
  int   *ind_arg_sizes,                                                 
  int   *ind_arg_offs,                                                  
  int    block_offset,                                                  
  int   *blkmap,                                                        
  int   *offset,                                                        
  int   *nelems,                                                        
  int   *ncolors,                                                       
  int   *colors) {                                                      
                                                                        
  float *arg0_vec[4];                                                   
                                                                        
  int   *ind_arg0_map, ind_arg0_size;                                   
  float *ind_arg0_s;                                                    
  int    nelem, offset_b;                                               
                                                                        
  char shared[64000];                                                   
                                                                        
  if (0==0) {                                                           
                                                                        
    // get sizes and shift pointers and direct-mapped data              
                                                                        
    int blockId = blkmap[blockIdx + block_offset];                      
    nelem    = nelems[blockId];                                         
    offset_b = offset[blockId];                                         
                                                                        
    ind_arg0_size = ind_arg_sizes[0+blockId*1];                         
                                                                        
    ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*1];           
                                                                        
    // set shared memory pointers                                       
                                                                        
    int nbytes = 0;                                                     
    ind_arg0_s = (float *) &shared[nbytes];                             
  }                                                                     
                                                                        
  // copy indirect datasets into shared memory or zero increment        
                                                                        
  for (int n=0; n<ind_arg0_size; n++)                                   
    for (int d=0; d<2; d++)                                             
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];                
                                                                        
                                                                        
  // process set elements                                               
                                                                        
  for (int n=0; n<nelem; n++) {                                         
                                                                        
      arg0_vec[0] = ind_arg0_s+arg0_maps[n+offset_b]*2;                 
      arg0_vec[1] = ind_arg0_s+arg1_maps[n+offset_b]*2;                 
      arg0_vec[2] = ind_arg0_s+arg2_maps[n+offset_b]*2;                 
      arg0_vec[3] = ind_arg0_s+arg3_maps[n+offset_b]*2;                 
                                                                        
    // user-supplied kernel call                                        
                                                                        
    adt_calc(  arg0_vec, arg4+(n+offset_b)*4, arg5+(n+offset_b)*1 );    
  }                                                                     
                                                                        
}                                                                       
Exemple #5
0
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4,
                          op_arg arg5) {

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;

  int ninds = 1;
  int inds[6] = {0, 0, 0, 0, -1, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

// get plan
#ifdef OP_PART_SIZE_1
  int part_size = OP_PART_SIZE_1;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;

  if (set->size > 0) {

    // Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    float *data4 = (float *)arg4.data_d;
    float *data5 = (float *)arg5.data_d;
    float *data0 = (float *)arg0.data_d;

    op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args, ninds,
                                      inds, OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == 1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col + 1];

#pragma acc parallel loop independent deviceptr(col_reord, map0, data4, data5, \
                                                data0)
      for (int e = start; e < end; e++) {
        int n = col_reord[e];
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        adt_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data0[2 * map2idx],
                 &data0[2 * map3idx], &data4[4 * n], &data5[1 * n]);
      }
    }
    OP_kernels[1].transfer += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time += wall_t2 - wall_t1;
}
void op_x86_adt_calc(
  int    blockIdx,
  double *ind_arg0,
  int   *ind_map,
  short *arg_map,
  double *arg4,
  double *arg5,
  int   *ind_arg_sizes,
  int   *ind_arg_offs,
  int    block_offset,
  int   *blkmap,
  int   *offset,
  int   *nelems,
  int   *ncolors,
  int   *colors,
  int   set_size) {


  int   *ind_arg0_map, ind_arg0_size;
  double *ind_arg0_s;
  int    nelem, offset_b;

  char shared[128000];

  if (0==0) {

    // get sizes and shift pointers and direct-mapped data

    int blockId = blkmap[blockIdx + block_offset];
    nelem    = nelems[blockId];
    offset_b = offset[blockId];

    ind_arg0_size = ind_arg_sizes[0+blockId*1];

    ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*1];

    // set shared memory pointers

    int nbytes = 0;
    ind_arg0_s = (double *) &shared[nbytes];
  }

  // copy indirect datasets into shared memory or zero increment

  for (int n=0; n<ind_arg0_size; n++)
    for (int d=0; d<2; d++)
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];


  // process set elements

  for (int n=0; n<nelem; n++) {


    // user-supplied kernel call


    adt_calc(  ind_arg0_s+arg_map[0*set_size+n+offset_b]*2,
               ind_arg0_s+arg_map[1*set_size+n+offset_b]*2,
               ind_arg0_s+arg_map[2*set_size+n+offset_b]*2,
               ind_arg0_s+arg_map[3*set_size+n+offset_b]*2,
               arg4+(n+offset_b)*4,
               arg5+(n+offset_b)*1 );
  }

}