void op_par_loop_save_soln(char const *name, op_set set,
                           op_arg arg0, op_arg arg1) {

  int ninds = 0;
  int nargs = 2;
  op_arg args[2] = {arg0, arg1};

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: save_soln \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // execute plan: each thread processes one contiguous chunk of the set
#pragma omp parallel for
  for (int thr = 0; thr < nthreads; thr++) {
    int start  = (set->size *  thr     ) / nthreads;
    int finish = (set->size * (thr + 1)) / nthreads;
    op_x86_save_soln((double *)arg0.data, (double *)arg1.data,
                     start, finish);
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (double)set->size * arg0.size;
  OP_kernels[0].transfer += (double)set->size * arg1.size;
}
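/* The wrapper above farms contiguous [start,finish) ranges of the set out to
 * threads and calls op_x86_save_soln on each. That routine is defined in the
 * generated kernel file, not shown here; the sketch below is a minimal
 * stand-in for it, assuming the airfoil convention of a 4-component state
 * vector per cell (the dimension and the user kernel body are assumptions,
 * not taken from this listing). */

static inline void save_soln_sketch(double *q, double *qold) {
  for (int d = 0; d < 4; d++)   // assumed dat dimension of 4
    qold[d] = q[d];
}

// illustrative stand-in for the generated op_x86_save_soln
void op_x86_save_soln_sketch(double *arg0, double *arg1,
                             int start, int finish) {
  for (int n = start; n < finish; n++)
    save_soln_sketch(arg0 + n * 4, arg1 + n * 4);
}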
void op_par_loop_adt_calc(char const *name, op_set set,
                          op_arg arg0, op_arg arg1, op_arg arg2,
                          op_arg arg3, op_arg arg4, op_arg arg5) {

  int nargs = 6;
  op_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
  int ninds = 1;
  int inds[6] = {0, 0, 0, 0, -1, -1};
  int sent[6] = {0, 0, 0, 0, 0, 0};

  if (ninds > 0) { // indirect loop: exchange halos and wait immediately
    for (int i = 0; i < nargs; i++) {
      if (args[i].argtype == OP_ARG_DAT) {
        if (OP_diags == 1) reset_halo(args[i]);
        sent[i] = exchange_halo(args[i]);
        if (sent[i] == 1) wait_all(args[i]);
      }
    }
  }

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc \n");
  }

  // get plan
#ifdef OP_PART_SIZE_1
  int part_size = OP_PART_SIZE_1;
#else
  int part_size = OP_part_size;
#endif

  op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // execute plan: one colour at a time, blocks of a colour in parallel
  int block_offset = 0;
  for (int col = 0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
    for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
      op_x86_adt_calc(blockIdx,
                      (double *)arg0.data, Plan->ind_maps[0],
                      Plan->loc_maps[0], Plan->loc_maps[1],
                      Plan->loc_maps[2], Plan->loc_maps[3],
                      (double *)arg4.data, (double *)arg5.data,
                      Plan->ind_sizes, Plan->ind_offs, block_offset,
                      Plan->blkmap, Plan->offset, Plan->nelems,
                      Plan->nthrcol, Plan->thrcol);
    block_offset += nblocks;
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name       = name;
  OP_kernels[1].count     += 1;
  OP_kernels[1].time      += wall_t2 - wall_t1;
  OP_kernels[1].transfer  += Plan->transfer;
  OP_kernels[1].transfer2 += Plan->transfer2;
}
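/* Each op_x86_adt_calc call processes one mini-partition (block) of the
 * plan. Blocks of the same colour touch disjoint indirectly-accessed data,
 * which is why the "#pragma omp parallel for" over a single colour is
 * race-free. A much-simplified sketch of the block kernel's structure
 * follows; it is an assumption for illustration, not the generated code
 * (the real routine also takes ind_sizes/ind_offs and the thread-colouring
 * arrays, omitted here because adt_calc has no indirect increments). */

extern void adt_calc(double *x1, double *x2, double *x3, double *x4,
                     double *q, double *adt); // user kernel, defined elsewhere

void op_x86_adt_calc_sketch(int blockIdx, double *ind_arg0, int *ind_arg0_maps,
                            short *arg0_maps, short *arg1_maps,
                            short *arg2_maps, short *arg3_maps,
                            double *arg4, double *arg5,
                            int block_offset, int *blkmap,
                            int *offset, int *nelems) {
  int blockId  = blkmap[blockIdx + block_offset]; // plan block for this colour slot
  int nelem    = nelems[blockId];                 // elements in this mini-partition
  int offset_b = offset[blockId];                 // first element of the block

  for (int n = 0; n < nelem; n++) {
    // gather the cell's four vertex coordinates (assumed dim 2) by composing
    // the block-local short maps with the block's global indirection map
    double *x1 = ind_arg0 + ind_arg0_maps[arg0_maps[n + offset_b]] * 2;
    double *x2 = ind_arg0 + ind_arg0_maps[arg1_maps[n + offset_b]] * 2;
    double *x3 = ind_arg0 + ind_arg0_maps[arg2_maps[n + offset_b]] * 2;
    double *x4 = ind_arg0 + ind_arg0_maps[arg3_maps[n + offset_b]] * 2;
    adt_calc(x1, x2, x3, x4,
             arg4 + (n + offset_b) * 4, arg5 + (n + offset_b));
  }
}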
void op_par_loop_update(char const *name, op_set set,
                        op_arg arg0, op_arg arg1, op_arg arg2,
                        op_arg arg3, op_arg arg4) {

  int ninds = 0;
  int nargs = 5;
  op_arg args[5] = {arg0, arg1, arg2, arg3, arg4};

  double *arg4h = (double *)arg4.data;

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction; each thread gets
  // a 64-double slot so that partial sums sit on separate cache lines
  double arg4_l[1 + 64 * 64];
  for (int thr = 0; thr < nthreads; thr++)
    for (int d = 0; d < 1; d++)
      arg4_l[d + thr * 64] = ZERO_double;

  // execute plan
#pragma omp parallel for
  for (int thr = 0; thr < nthreads; thr++) {
    int start  = (set->size *  thr     ) / nthreads;
    int finish = (set->size * (thr + 1)) / nthreads;
    op_x86_update((double *)arg0.data, (double *)arg1.data,
                  (double *)arg2.data, (double *)arg3.data,
                  arg4_l + thr * 64, start, finish);
  }

  // combine reduction data
  for (int thr = 0; thr < nthreads; thr++)
    for (int d = 0; d < 1; d++)
      arg4h[d] += arg4_l[d + thr * 64];

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_GBL)
      global_reduce(&args[i]);

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (double)set->size * arg0.size;
  OP_kernels[4].transfer += (double)set->size * arg1.size;
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;
}
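/* Sketch (assumed, not taken from this listing) of the per-thread routine
 * called above. Each thread accumulates its contribution to the rms
 * residual into its private slot (arg4_l + thr*64); the slots are summed
 * serially afterwards, so no atomics are needed in the parallel region.
 * The dat dimension of 4 and the update formula follow the airfoil
 * benchmark convention and are assumptions here. */
void op_x86_update_sketch(double *qold, double *q, double *res,
                          double *adt, double *rms, int start, int finish) {
  for (int n = start; n < finish; n++) {
    double del, adti = 1.0 / adt[n];
    for (int d = 0; d < 4; d++) {            // assumed dim 4
      del = adti * res[n * 4 + d];           // time-step-scaled residual
      q[n * 4 + d]   = qold[n * 4 + d] - del;
      res[n * 4 + d] = 0.0;
      *rms          += del * del;            // thread-private accumulator
    }
  }
}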
void op_par_loop_res_calc(char const *name, op_set set,
                          op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3,
                          op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7) {

  int nargs = 8;
  op_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7};
  int ninds = 4;
  int inds[8] = {0, 0, 1, 1, 2, 2, 3, 3};
  int sent[8] = {0, 0, 0, 0, 0, 0, 0, 0}; // records which arguments had their halo exchanged

  if (ninds > 0) { // indirect loop: start halo exchanges
    for (int i = 0; i < nargs; i++) {
      if (args[i].argtype == OP_ARG_DAT) {
        if (OP_diags == 1) reset_halo(args[i]);
        sent[i] = exchange_halo(args[i]);
        // no wait_all() here: the wait is deferred until the core
        // elements have been computed, overlapping communication
        // with computation
      }
    }
  }

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: res_calc \n");
  }

  // get plan
  int block_offset;
  op_plan *Plan;

#ifdef OP_PART_SIZE_2
  int part_size = OP_PART_SIZE_2;
#else
  int part_size = OP_part_size;
#endif

  // ensure the timing record exists before accumulating into it below
  op_timing_realloc(2);

  // get offsets
  int core_len = core_num[set->index];
  int noncore_len = set->size + OP_import_exec_list[set->index]->size - core_len;

  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // process core set (elements that touch no halo data)
  if (core_len > 0) {
    if (OP_latency_sets[set->index].core_set == NULL) {
      op_set core_set = (op_set)malloc(sizeof(op_set_core));
      core_set->index = set->index;
      core_set->name = set->name;
      core_set->size = core_len;
      core_set->exec_size = 0;
      core_set->nonexec_size = 0;
      OP_latency_sets[set->index].core_set = core_set;
    }

    Plan = op_plan_get_offset(name, OP_latency_sets[set->index].core_set,
                              0, part_size, nargs, args, ninds, inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // set number of threads
#ifdef _OPENMP
    int nthreads = omp_get_max_threads();
#else
    int nthreads = 1;
#endif

    // execute plan
    block_offset = 0;
    for (int col = 0; col < Plan->ncolors; col++) {
      int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
      for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
        op_x86_res_calc(blockIdx,
                        (double *)arg0.data, Plan->ind_maps[0],
                        (double *)arg2.data, Plan->ind_maps[1],
                        (double *)arg4.data, Plan->ind_maps[2],
                        (double *)arg6.data, Plan->ind_maps[3],
                        Plan->loc_maps[0], Plan->loc_maps[1],
                        Plan->loc_maps[2], Plan->loc_maps[3],
                        Plan->loc_maps[4], Plan->loc_maps[5],
                        Plan->loc_maps[6], Plan->loc_maps[7],
                        Plan->ind_sizes, Plan->ind_offs, block_offset,
                        Plan->blkmap, Plan->offset, Plan->nelems,
                        Plan->nthrcol, Plan->thrcol);
      block_offset += nblocks;
    }

    op_timers_core(&cpu_t2, &wall_t2);
    OP_kernels[2].time      += wall_t2 - wall_t1;
    OP_kernels[2].transfer  += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  // now wait for the halo exchanges started above to complete
  if (ninds > 0) { // indirect loop
    for (int i = 0; i < nargs; i++) {
      if (args[i].argtype == OP_ARG_DAT) {
        if (sent[i] == 1) wait_all(args[i]);
      }
    }
  }

  // process non-core set (elements that read imported halo data)
  if (noncore_len > 0) {
    if (OP_latency_sets[set->index].noncore_set == NULL) {
      op_set noncore_set = (op_set)malloc(sizeof(op_set_core));
      noncore_set->size = noncore_len;
      noncore_set->name = set->name;
      noncore_set->index = set->index;
      noncore_set->exec_size = 0;
      noncore_set->nonexec_size = 0;
      OP_latency_sets[set->index].noncore_set = noncore_set;
    }

    Plan = op_plan_get_offset(name, OP_latency_sets[set->index].noncore_set,
                              core_len, part_size, nargs, args, ninds, inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // set number of threads
#ifdef _OPENMP
    int nthreads = omp_get_max_threads();
#else
    int nthreads = 1;
#endif

    // execute plan
    block_offset = 0;
    for (int col = 0; col < Plan->ncolors; col++) {
      int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
      for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
        op_x86_res_calc(blockIdx,
                        (double *)arg0.data, Plan->ind_maps[0],
                        (double *)arg2.data, Plan->ind_maps[1],
                        (double *)arg4.data, Plan->ind_maps[2],
                        (double *)arg6.data, Plan->ind_maps[3],
                        Plan->loc_maps[0], Plan->loc_maps[1],
                        Plan->loc_maps[2], Plan->loc_maps[3],
                        Plan->loc_maps[4], Plan->loc_maps[5],
                        Plan->loc_maps[6], Plan->loc_maps[7],
                        Plan->ind_sizes, Plan->ind_offs, block_offset,
                        Plan->blkmap, Plan->offset, Plan->nelems,
                        Plan->nthrcol, Plan->thrcol);
      block_offset += nblocks;
    }

    op_timers_core(&cpu_t2, &wall_t2);
    OP_kernels[2].time      += wall_t2 - wall_t1;
    OP_kernels[2].transfer  += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  OP_kernels[2].name   = name;
  OP_kernels[2].count += 1;
}
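/* The res_calc wrapper above is the one loop in this file that hides MPI
 * latency: it splits the set into a core part and a halo-dependent part and
 * computes the former while the halo is in flight. The sketch below restates
 * that control flow in condensed, generic form; overlapped_loop_sketch and
 * run_range are hypothetical names introduced here for illustration (the
 * fixed sent[8] assumes nargs <= 8, as in res_calc), while exchange_halo,
 * wait_all, core_num and OP_import_exec_list are the identifiers used above. */
void overlapped_loop_sketch(op_set set, op_arg *args, int nargs,
                            void (*run_range)(int first, int last)) {
  int sent[8] = {0};

  for (int i = 0; i < nargs; i++)         // 1. start non-blocking halo exchanges
    if (args[i].argtype == OP_ARG_DAT)
      sent[i] = exchange_halo(args[i]);

  run_range(0, core_num[set->index]);     // 2. compute core elements meanwhile

  for (int i = 0; i < nargs; i++)         // 3. block until halo data has arrived
    if (args[i].argtype == OP_ARG_DAT && sent[i] == 1)
      wait_all(args[i]);

  run_range(core_num[set->index],         // 4. compute halo-touching elements
            set->size + OP_import_exec_list[set->index]->size);
}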