// Host-side OpenMP driver for the indirect "res_calc" parallel loop
// (eight arguments, four indirectly accessed datasets).
//
//   name        kernel name; passed to op_plan_get and stored in OP_kernels[2]
//   set         iteration set handed to op_plan_get
//   arg0..arg7  loop arguments; only the data pointers of arg0, arg2, arg4 and
//               arg6 are passed to the kernel wrapper, the rest reach it via
//               the plan's ind_maps / loc_maps tables
//
// The colours of the plan are processed one after another; the blocks that
// belong to one colour are distributed over threads with
// "#pragma omp parallel for". Wall-clock time and the plan's transfer
// counters are accumulated into OP_kernels[2].
void op_par_loop_res_calc(char const *name, op_set set,
                          op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3,
                          op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7)
{
  int nargs = 8;
  op_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7};

  // args 2k and 2k+1 refer to the same indirect dataset k
  int ninds = 4;
  int inds[8] = {0, 0, 1, 1, 2, 2, 3, 3};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: res_calc \n");
  }

  // get plan
#ifdef OP_PART_SIZE_2
  int part_size = OP_PART_SIZE_2;
#else
  int part_size = OP_part_size;
#endif

  op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

  // initialise timers (started after the plan has been obtained)
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // execute plan: block_offset is the number of blocks that belong to the
  // colours already processed
  int block_offset = 0;
  for (int col = 0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
    for (int blockIdx = 0; blockIdx < nblocks; blockIdx++) {
      op_x86_res_calc(blockIdx,
                      (double *)arg0.data, Plan->ind_maps[0],
                      (double *)arg2.data, Plan->ind_maps[1],
                      (double *)arg4.data, Plan->ind_maps[2],
                      (double *)arg6.data, Plan->ind_maps[3],
                      Plan->loc_maps[0], Plan->loc_maps[1],
                      Plan->loc_maps[2], Plan->loc_maps[3],
                      Plan->loc_maps[4], Plan->loc_maps[5],
                      Plan->loc_maps[6], Plan->loc_maps[7],
                      Plan->ind_sizes, Plan->ind_offs,
                      block_offset,
                      Plan->blkmap, Plan->offset, Plan->nelems,
                      Plan->nthrcol, Plan->thrcol);
    }

    block_offset += nblocks;
  }

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name = name;
  OP_kernels[2].count += 1;
  OP_kernels[2].time += wall_t2 - wall_t1;
  OP_kernels[2].transfer += Plan->transfer;
  OP_kernels[2].transfer2 += Plan->transfer2;
}
// MPI-aware host driver for the indirect "res_calc" parallel loop
// (eight arguments, four indirectly accessed datasets).
//
// Sequence of operations:
//   1. op_mpi_halo_exchanges is called for the argument list; its return
//      value is forwarded to every kernel invocation.
//   2. If set->size is positive, a plan is fetched and its colours are run in
//      order. On reaching colour index Plan->ncolors_core, op_mpi_wait_all is
//      called before that colour's blocks are executed.
//   3. op_mpi_set_dirtybit is called and the timing record OP_kernels[2] is
//      updated (name, call count, elapsed wall time, transfer counters).
void op_par_loop_res_calc(char const *name, op_set set,
                          op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3,
                          op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7)
{
  int nargs = 8;
  op_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7};

  // args 2k and 2k+1 refer to the same indirect dataset k
  int ninds = 4;
  int inds[8] = {0, 0, 1, 1, 2, 2, 3, 3};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // partition size: compile-time override if present, run-time default otherwise
#ifndef OP_PART_SIZE_2
  int part_size = OP_part_size;
#else
  int part_size = OP_PART_SIZE_2;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // start the clock
  double cpu_start, cpu_end, wall_start, wall_end;
  op_timers_core(&cpu_start, &wall_start);

  if (set->size > 0) {
    op_plan *plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

    // data pointers of the four indirect datasets
    double *data0 = (double *)arg0.data;
    double *data2 = (double *)arg2.data;
    double *data4 = (double *)arg4.data;
    double *data6 = (double *)arg6.data;

    // walk the colours; first_block counts the blocks of earlier colours
    int first_block = 0;
    for (int color = 0; color < plan->ncolors; color++) {
      if (color == plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }

      int blocks_in_color = plan->ncolblk[color];

#pragma omp parallel for
      for (int blk = 0; blk < blocks_in_color; blk++) {
        op_x86_res_calc(blk,
                        data0, data2, data4, data6,
                        plan->ind_map, plan->loc_map,
                        plan->ind_sizes, plan->ind_offs,
                        first_block,
                        plan->blkmap, plan->offset, plan->nelems,
                        plan->nthrcol, plan->thrcol,
                        set_size);
      }

      first_block += blocks_in_color;
    }

    op_timing_realloc(2);
    OP_kernels[2].transfer += plan->transfer;
    OP_kernels[2].transfer2 += plan->transfer2;
  }

  op_mpi_set_dirtybit(nargs, args);

  // stop the clock and update the kernel record
  op_timers_core(&cpu_end, &wall_end);
  double elapsed = wall_end - wall_start;

  op_timing_realloc(2);
  OP_kernels[2].name = name;
  OP_kernels[2].count += 1;
  OP_kernels[2].time += elapsed;
}
// MPI-aware host driver for the "res_calc" parallel loop variant with one
// indirectly accessed dataset (arg0) and one global int value (arg1) that is
// accumulated with "+=".
//
//   name  kernel name; passed to op_plan_get and stored in OP_kernels[0]
//   set   iteration set
//   arg0  indirectly accessed dataset (data pointer is used as double *)
//   arg1  global argument; arg1.data is used as int * and receives the sum of
//         the per-thread partial results
//
// Every thread accumulates into its own slot of a scratch buffer (slots are
// 64 ints apart). The slots are folded into arg1.data right after the colour
// with index Plan->ncolors_owned-1 has been executed, and op_mpi_reduce is
// called on the result afterwards.
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg1 ){
  int *arg1h = (int *)arg1.data;

  int nargs = 2;
  op_arg args[2];
  args[0] = arg0;
  args[1] = arg1;

  // arg0 is indirect dataset 0; -1 marks arg1 as not indirectly accessed
  int ninds = 1;
  int inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
#ifdef OP_PART_SIZE_0
  int part_size = OP_PART_SIZE_0;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1 = 0, cpu_t2 = 0, wall_t1 = 0, wall_t2 = 0;
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction
  // NOTE(review): the buffer holds slots for thread ids 0..64 only; a run with
  // more threads would index past its end - confirm the supported maximum.
  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++) {
    for (int d=0; d<1; d++) {
      arg1_l[d+thr*64] = ZERO_int;
    }
  }

  if (set->size >0) {
    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // the timed interval starts after the plan has been obtained
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
        // Slot selection uses the same _OPENMP guard as nthreads above, so
        // the block does not reference the OpenMP runtime in a serial build.
#ifdef _OPENMP
        int thr_id = omp_get_thread_num();
#else
        int thr_id = 0;
#endif
        op_x86_res_calc( blockIdx,
                         (double *)arg0.data,
                         Plan->ind_map,
                         Plan->loc_map,
                         &arg1_l[64*thr_id],
                         Plan->ind_sizes,
                         Plan->ind_offs,
                         block_offset,
                         Plan->blkmap,
                         Plan->offset,
                         Plan->nelems,
                         Plan->nthrcol,
                         Plan->thrcol,
                         set_size);
      }

      // combine reduction data; later colours keep writing to arg1_l but
      // their contributions are not added to arg1h
      if (col == Plan->ncolors_owned-1) {
        for (int thr=0; thr<nthreads; thr++) {
          for (int d=0; d<1; d++) {
            arg1h[d] += arg1_l[d+thr*64];
          }
        }
      }

      block_offset += nblocks;
    }

    op_timing_realloc(0);
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  } else {
    // Nothing to execute: still take the start time here, otherwise wall_t1
    // would keep its initial 0 and the update below would add a raw clock
    // reading instead of an elapsed interval to the kernel time.
    op_timers_core(&cpu_t1, &wall_t1);
  }

  // combine reduction data
  op_mpi_reduce(&arg1,arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}
// MPI-aware host driver for the "res_calc" parallel loop variant that takes
// three vector arguments (arg0, arg4, arg9) and one further argument (arg8).
//
// Each vector argument is expanded into four consecutive entries of the local
// argument table: entry 0 of the group is the caller's argument with idx set
// to 0, entries 1..3 are created with op_arg_dat for map indices 1..3. The
// resulting 13 entries are what the halo exchange, the plan and the dirty-bit
// update operate on. The kernel wrapper itself receives only the data
// pointers of arg0, arg4, arg9 and arg8.
//
// Timing information is accumulated in OP_kernels[0].
void op_par_loop_res_calc(char const *name, op_set set,
                          op_arg arg0, op_arg arg4, op_arg arg8, op_arg arg9)
{
  int nargs = 13;
  op_arg args[13];

  // slots 0..3: arg0 expanded over map indices 0..3
  arg0.idx = 0;
  args[0] = arg0;
  for (int m = 1; m < 4; m++) {
    args[0 + m] = op_arg_dat(arg0.dat, m, arg0.map, 2, "double", OP_READ);
  }

  // slots 4..7: arg4 expanded over map indices 0..3
  arg4.idx = 0;
  args[4] = arg4;
  for (int m = 1; m < 4; m++) {
    args[4 + m] = op_arg_dat(arg4.dat, m, arg4.map, 1, "double", OP_READ);
  }

  // slot 8: passed through unchanged
  args[8] = arg8;

  // slots 9..12: arg9 expanded over map indices 0..3
  arg9.idx = 0;
  args[9] = arg9;
  for (int m = 1; m < 4; m++) {
    args[9 + m] = op_arg_dat(arg9.dat, m, arg9.map, 1, "double", OP_INC);
  }

  // indirect dataset number per slot; -1 for slot 8
  int ninds = 3;
  int inds[13] = {0, 0, 0, 0, 1, 1, 1, 1, -1, 2, 2, 2, 2};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // partition size: compile-time override if present, run-time default otherwise
#ifndef OP_PART_SIZE_0
  int part_size = OP_part_size;
#else
  int part_size = OP_PART_SIZE_0;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // start the clock
  double cpu_start, cpu_end, wall_start, wall_end;
  op_timers_core(&cpu_start, &wall_start);

  if (set->size > 0) {
    op_plan *plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

    double *data0 = (double *)arg0.data;
    double *data4 = (double *)arg4.data;
    double *data8 = (double *)arg8.data;
    double *data9 = (double *)arg9.data;

    // walk the colours; first_block counts the blocks of earlier colours
    int first_block = 0;
    for (int color = 0; color < plan->ncolors; color++) {
      if (color == plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }

      int blocks_in_color = plan->ncolblk[color];

#pragma omp parallel for
      for (int blk = 0; blk < blocks_in_color; blk++) {
        op_x86_res_calc(blk,
                        data0, data4, data9,
                        plan->ind_map, plan->loc_map,
                        data8,
                        plan->ind_sizes, plan->ind_offs,
                        first_block,
                        plan->blkmap, plan->offset, plan->nelems,
                        plan->nthrcol, plan->thrcol,
                        set_size);
      }

      first_block += blocks_in_color;
    }

    op_timing_realloc(0);
    OP_kernels[0].transfer += plan->transfer;
    OP_kernels[0].transfer2 += plan->transfer2;
  }

  op_mpi_set_dirtybit(nargs, args);

  // stop the clock and update the kernel record
  op_timers_core(&cpu_end, &wall_end);
  double elapsed = wall_end - wall_start;

  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += elapsed;
}
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7 ){ int nargs = 8; op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7}; int ninds = 4; int inds[8] = {0,0,1,1,2,2,3,3}; int sent[8] = {0,0,0,0,0,0,0,0}; //array to set if halo is exchanged if(ninds > 0) //indirect loop { for(int i = 0; i<nargs; i++) { if(args[i].argtype == OP_ARG_DAT) { if (OP_diags==1) reset_halo(args[i]); sent[i] = exchange_halo(args[i]); //if(sent[i] == 1)wait_all(args[i]); } } } if (OP_diags>2) { printf(" kernel routine with indirection: res_calc \n"); } // get plan int block_offset; op_plan *Plan; #ifdef OP_PART_SIZE_2 int part_size = OP_PART_SIZE_2; #else int part_size = OP_part_size; #endif //get offsets int core_len = core_num[set->index]; int noncore_len = set->size + OP_import_exec_list[set->index]->size - core_len; double cpu_t1, cpu_t2, wall_t1, wall_t2; //process core set if (core_len>0) { if (OP_latency_sets[set->index].core_set == NULL) { op_set core_set = (op_set)malloc(sizeof(op_set_core)); core_set->index = set->index; core_set->name = set->name; core_set->size = core_len; core_set->exec_size = 0; core_set->nonexec_size = 0; OP_latency_sets[set->index].core_set = core_set; } Plan = op_plan_get_offset(name,OP_latency_sets[set->index].core_set, 0,part_size,nargs,args,ninds,inds); op_timers_core(&cpu_t1, &wall_t1); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads( ); #else int nthreads = 1; #endif // execute plan int block_offset = 0; for(int col=0; col < Plan->ncolors; col++) { int nblocks = Plan->ncolblk[col]; #pragma omp parallel for for (int blockIdx=0; blockIdx<nblocks; blockIdx++) op_x86_res_calc( blockIdx, (double *)arg0.data, Plan->ind_maps[0], (double *)arg2.data, Plan->ind_maps[1], (double *)arg4.data, Plan->ind_maps[2], (double *)arg6.data, Plan->ind_maps[3], Plan->loc_maps[0], Plan->loc_maps[1], Plan->loc_maps[2], Plan->loc_maps[3], 
Plan->loc_maps[4], Plan->loc_maps[5], Plan->loc_maps[6], Plan->loc_maps[7], Plan->ind_sizes, Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, Plan->nelems, Plan->nthrcol, Plan->thrcol); block_offset += nblocks; } op_timers_core(&cpu_t2, &wall_t2); OP_kernels[2].time += wall_t2 - wall_t1; OP_kernels[2].transfer += Plan->transfer; OP_kernels[2].transfer2 += Plan->transfer2; } if(ninds > 0) //indirect loop { for(int i = 0; i<nargs; i++) { if(args[i].argtype == OP_ARG_DAT) { if(sent[i] == 1)wait_all(args[i]); } } } if (noncore_len>0) { if (OP_latency_sets[set->index].noncore_set == NULL) { op_set noncore_set = (op_set)malloc(sizeof (op_set_core)); noncore_set->size = noncore_len; noncore_set->name = set->name; noncore_set->index = set->index; noncore_set->exec_size = 0; noncore_set->nonexec_size = 0; OP_latency_sets[set->index].noncore_set = noncore_set; } Plan = op_plan_get_offset(name,OP_latency_sets[set->index].noncore_set,core_len, part_size,nargs,args,ninds,inds); op_timers_core(&cpu_t1, &wall_t1); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads( ); #else int nthreads = 1; #endif // execute plan int block_offset = 0; for (int col=0; col < Plan->ncolors; col++) { int nblocks = Plan->ncolblk[col]; #pragma omp parallel for for (int blockIdx=0; blockIdx<nblocks; blockIdx++) op_x86_res_calc( blockIdx, (double *)arg0.data, Plan->ind_maps[0], (double *)arg2.data, Plan->ind_maps[1], (double *)arg4.data, Plan->ind_maps[2], (double *)arg6.data, Plan->ind_maps[3], Plan->loc_maps[0], Plan->loc_maps[1], Plan->loc_maps[2], Plan->loc_maps[3], Plan->loc_maps[4], Plan->loc_maps[5], Plan->loc_maps[6], Plan->loc_maps[7], Plan->ind_sizes, Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, Plan->nelems, Plan->nthrcol, Plan->thrcol); block_offset += nblocks; } op_timers_core(&cpu_t2, &wall_t2); OP_kernels[2].time += wall_t2 - wall_t1; OP_kernels[2].transfer += Plan->transfer; OP_kernels[2].transfer2 += Plan->transfer2; } //set dirty bit on 
direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW for(int i = 0; i<nargs; i++) if(args[i].argtype == OP_ARG_DAT) set_dirtybit(args[i]); //performe any global operations // - NONE // update kernel record op_timing_realloc(3); OP_kernels[2].name = name; OP_kernels[2].count += 1; }