// host stub function void op_par_loop_res(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3){ int nargs = 4; op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine with indirection: res\n"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ if (n==set->core_size) { op_mpi_wait_all(nargs, args); } int map1idx = arg1.map_data[n * arg1.map->dim + 1]; int map2idx = arg1.map_data[n * arg1.map->dim + 0]; res( &((double*)arg0.data)[1 * n], &((double*)arg1.data)[1 * map1idx], &((double*)arg2.data)[1 * map2idx], (double*)arg3.data); } } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; OP_kernels[0].transfer += (float)set->size * arg1.size; OP_kernels[0].transfer += (float)set->size * arg2.size * 2.0f; OP_kernels[0].transfer += (float)set->size * arg0.size; OP_kernels[0].transfer += (float)set->size * arg3.size; OP_kernels[0].transfer += (float)set->size * arg1.map->dim * 4.0f; }
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4) { double *arg4h = (double *)arg4.data; int nargs = 5; op_arg args[5]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(4); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[4].name = name; OP_kernels[4].count += 1; if (OP_diags > 2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges_cuda(set, nargs, args); double arg4_l = arg4h[0]; if (set->size > 0) { // Set up typed device pointers for OpenACC double *data0 = (double *)arg0.data_d; double *data1 = (double *)arg1.data_d; double *data2 = (double *)arg2.data_d; double *data3 = (double *)arg3.data_d; #pragma acc parallel loop independent deviceptr(data0, data1, data2, \ data3) reduction(+ : arg4_l) for (int n = 0; n < set->size; n++) { update(&data0[4 * n], &data1[4 * n], &data2[4 * n], &data3[1 * n], &arg4_l); } } // combine reduction data arg4h[0] = arg4_l; op_mpi_reduce_double(&arg4, arg4h); op_mpi_set_dirtybit_cuda(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[4].time += wall_t2 - wall_t1; OP_kernels[4].transfer += (float)set->size * arg0.size; OP_kernels[4].transfer += (float)set->size * arg1.size; OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f; OP_kernels[4].transfer += (float)set->size * arg3.size; }
// Host stub for the direct kernel save_soln (OpenMP backend): splits the set
// into one contiguous slice per thread, dispatches op_x86_save_soln on each
// slice, marks written datasets dirty, and updates the timing record.
void op_par_loop_save_soln(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int ninds = 0;  // direct loop: no indirect datasets
  int nargs = 2;
  op_arg args[2] = {arg0,arg1};

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: save_soln \n");
  }

  // start timing
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // determine how many worker threads are available
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // every thread processes its own contiguous chunk of the set
  #pragma omp parallel for
  for (int tid = 0; tid < nthreads; tid++) {
    int lo = (set->size *  tid     ) / nthreads;
    int hi = (set->size * (tid + 1)) / nthreads;
    op_x86_save_soln((double *)arg0.data, (double *)arg1.data, lo, hi);
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++) {
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);
  }

  // no global operations for this loop

  // stop timing and accumulate the kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (double)set->size * arg0.size;
  OP_kernels[0].transfer += (double)set->size * arg1.size;
}
// host stub function void op_par_loop_save_soln_cpu(char const *name, op_set set, op_arg arg0, op_arg arg1){ int nargs = 2; op_arg args[2]; args[0] = arg0; args[1] = arg1; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine w/o indirection: save_soln"); } op_mpi_halo_exchanges(set, nargs, args); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads(); #else int nthreads = 1; #endif if (set->size >0) { // execute plan #pragma omp parallel for for ( int thr=0; thr<nthreads; thr++ ){ int start = (set->size* thr)/nthreads; int finish = (set->size*(thr+1))/nthreads; for ( int n=start; n<finish; n++ ){ save_soln( &((double*)arg0.data)[4*n], &((double*)arg1.data)[4*n]); } } } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; OP_kernels[0].transfer += (float)set->size * arg0.size; OP_kernels[0].transfer += (float)set->size * arg1.size; }
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4){ int nargs = 5; op_arg args[5]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine w/o indirection: update"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ update( &((float*)arg0.data)[1*n], &((float*)arg1.data)[1*n], &((float*)arg2.data)[1*n], (float*)arg3.data, (float*)arg4.data); } } // combine reduction data op_mpi_reduce_float(&arg3,(float*)arg3.data); op_mpi_reduce_float(&arg4,(float*)arg4.data); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; OP_kernels[1].transfer += (float)set->size * arg0.size; OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f; OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f; }
// Host stub for the indirect kernel res_calc (OpenMP, plan-based): arg0 and
// arg4 are 4-way vector arguments expanded into args[0..3] and args[4..7],
// arg8 is direct, and arg9 is a 4-way OP_INC vector argument.  Execution is
// driven by an op_plan colouring so blocks of the same colour can run in
// parallel without write conflicts on the OP_INC data.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9 ){

  int nargs = 13;
  op_arg args[13];

  // expand each vector argument into one op_arg per map index
  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 4; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for (int v = 1; v < 4; v++) {
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for (int v = 1; v < 4; v++) {
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  // indirection descriptor: -1 marks the direct argument (arg8)
  int ninds = 3;
  int inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
#ifdef OP_PART_SIZE_0
  int part_size = OP_PART_SIZE_0;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan colour by colour; wait for the halo exchange once the
    // core colours are done
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          (double *)arg4.data,
          (double *)arg9.data,
          Plan->ind_map,
          Plan->loc_map,
          (double *)arg8.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);
      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
}
// host stub function
// Vectorised host stub for the direct kernel `update`: when VECTORIZE is
// defined the set is processed in SIMD_VEC-wide groups with a per-lane
// reduction buffer (dat4) folded into the global arg4 after each group;
// otherwise (and for the remainder elements) a plain scalar loop is used.
void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1,
                        op_arg arg2, op_arg arg3, op_arg arg4) {

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  // create aligned pointers for dats
  // (ALIGNED_double / __assume_aligned / double_ALIGN are compiler-alignment
  //  macros supplied by the OP2 build)
  ALIGNED_double const double *__restrict__ ptr0 = (double *)arg0.data;
  __assume_aligned(ptr0, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr1 = (double *)arg1.data;
  __assume_aligned(ptr1, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr2 = (double *)arg2.data;
  __assume_aligned(ptr2, double_ALIGN);
  ALIGNED_double const double *__restrict__ ptr3 = (double *)arg3.data;
  __assume_aligned(ptr3, double_ALIGN);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(4);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update");
  }

  int exec_size = op_mpi_halo_exchanges(set, nargs, args);

  if (exec_size > 0) {

#ifdef VECTORIZE
    // outer loop steps over SIMD groups; only the inner loop is vectorised
#pragma novector
    for (int n = 0; n < (exec_size / SIMD_VEC) * SIMD_VEC; n += SIMD_VEC) {
      // per-lane partial sums for the global reduction
      double dat4[SIMD_VEC] = {0.0};
#pragma simd
      for (int i = 0; i < SIMD_VEC; i++) {
        update(&(ptr0)[4 * (n + i)], &(ptr1)[4 * (n + i)],
               &(ptr2)[4 * (n + i)], &(ptr3)[1 * (n + i)], &dat4[i]);
      }
      // fold the lane partials into the global accumulator
      for (int i = 0; i < SIMD_VEC; i++) {
        *(double *)arg4.data += dat4[i];
      }
    }
    // remainder
    for (int n = (exec_size / SIMD_VEC) * SIMD_VEC; n < exec_size; n++) {
#else
    for (int n = 0; n < exec_size; n++) {
#endif
      update(&(ptr0)[4 * n], &(ptr1)[4 * n], &(ptr2)[4 * n], &(ptr3)[1 * n],
             (double *)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce(&arg4, (double *)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[4].name = name;
  OP_kernels[4].count += 1;
  OP_kernels[4].time += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}
// Host stub for the direct kernel `update` (OpenMP backend) with a global
// reduction: each thread accumulates into its own slot of arg4_l, spaced 64
// doubles apart (padding — presumably to keep slots on separate cache lines;
// confirm against the OP2 code generator), and the slots are summed into the
// host global afterwards.
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  int ninds = 0;  // direct loop: no indirection
  int nargs = 5;
  op_arg args[5] = {arg0,arg1,arg2,arg3,arg4};

  // host-side storage of the global reduction argument
  double *arg4h = (double *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction
  double arg4_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg4_l[d+thr*64]=ZERO_double;

  // execute plan: one contiguous slice of the set per thread
  #pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_update( (double *) arg0.data,
                   (double *) arg1.data,
                   (double *) arg2.data,
                   (double *) arg3.data,
                   arg4_l + thr*64,
                   start, finish );
  }

  // combine reduction data: sum the per-thread slots
  for (int thr=0; thr<nthreads; thr++)
    for(int d=0; d<1; d++)
      arg4h[d] += arg4_l[d+thr*64];

  //set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
  for(int i = 0; i<nargs; i++)
    if(args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  //performe any global operations
  for(int i = 0; i<nargs; i++)
    if(args[i].argtype == OP_ARG_GBL)
      global_reduce(&args[i]);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (double)set->size * arg0.size;
  OP_kernels[4].transfer += (double)set->size * arg1.size;
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;
}
// host stub function
// Host stub for res_calc (OMP4/offload variant with SoA strides and optional
// arguments): arg0/arg4/arg9/arg13 are 4-way vector arguments (arg9 and
// arg13 optional), arg8 is direct.  Host-side stride globals are refreshed
// when the set size changes, then execution is driven by an OP_COLOR2 plan
// with one offloaded kernel launch per colour.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9,
  op_arg arg13){

  int nargs = 17;
  op_arg args[17];

  // expand each vector argument into one op_arg per map index
  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }
  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  // indirection descriptor: -1 marks the direct argument (arg8)
  int ninds = 4;
  int inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

#ifdef OP_PART_SIZE_0
  int part_size = OP_PART_SIZE_0;
#else
  int part_size = OP_part_size;
#endif
#ifdef OP_BLOCK_SIZE_0
  int nthread = OP_BLOCK_SIZE_0;
#else
  int nthread = OP_block_size;
#endif

  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    // refresh SoA stride constants on first run or when the set size changed
    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
    int map0size = arg0.map->dim * set_size1;
    // optional dats contribute size 0 when disabled (opt == 0)
    double* data8 = (double*)arg8.data_d;
    int dat8size = (arg8.opt?1:0) * getSetSizeFromOpArg(&arg8) * arg8.dat->dim;
    double *data0 = (double *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
    double *data4 = (double *)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    double *data9 = (double *)arg9.data_d;
    int dat9size = (arg9.opt ? 1 : 0) * getSetSizeFromOpArg(&arg9) * arg9.dat->dim;
    double *data13 = (double *)arg13.data_d;
    int dat13size = (arg13.opt ? 1 : 0) * getSetSizeFromOpArg(&arg13) * arg13.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan: one launch per colour; after the first colour, wait for
    // the halo exchange before touching halo elements
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      res_calc_omp4_kernel(
        map0, map0size,
        data8, dat8size,
        data0, dat0size,
        data4, dat4size,
        data9, dat9size,
        data13, dat13size,
        col_reord,
        set_size1,
        start,
        end,
        part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread,
        nthread,
        opDat0_res_calc_stride_OP2CONSTANT,
        direct_res_calc_stride_OP2CONSTANT);

    }
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // if no colour ever triggered the wait, do it here
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}
// Host stub for the direct kernel EvolveValuesRK2_1 (OpenMP backend):
// partitions the set into one contiguous slice per thread and dispatches
// op_x86_EvolveValuesRK2_1 on each slice, then records timing and transfer
// statistics in OP_kernels[0].
//
// Fix: the start timestamp (wall_t1) is only taken inside `if (set->size>0)`,
// but the elapsed time was accumulated unconditionally — on a rank with an
// empty partition this added the absolute end timestamp (wall_t2 - 0) to
// OP_kernels[0].time.  The accumulation is now guarded by the same condition.
void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: EvolveValuesRK2_1\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers (wall_t1 stays 0 unless the timer is started below)
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  if (set->size >0) {

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan: one contiguous slice of the set per thread
    #pragma omp parallel for
    for (int thr=0; thr<nthreads; thr++) {
      int start  = (set->size* thr   )/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      op_x86_EvolveValuesRK2_1( (float *) arg0.data,
                                (float *) arg1.data,
                                (float *) arg2.data,
                                (float *) arg3.data,
                                (float *) arg4.data,
                                start, finish );
    }
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record; only accumulate elapsed time when the timer was
  // actually started, otherwise wall_t2 - wall_t1 is an absolute clock value
  op_timers_core(&cpu_t2, &wall_t2);
  if (set->size > 0) {
    OP_kernels[0].time += wall_t2 - wall_t1;
  }
  OP_kernels[0].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg2.size;
  OP_kernels[0].transfer += (float)set->size * arg3.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
}
// Host stub for the indirect kernel res_calc (OpenMP, plan-based,
// single-process variant — no halo exchange calls): four indirect dats, each
// appearing twice (inds pairs), executed colour by colour from an op_plan.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6,
  op_arg arg7 ){

  int nargs = 8;
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};

  // four indirect datasets, each referenced by two arguments
  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc \n");
  }

  // get plan
#ifdef OP_PART_SIZE_2
  int part_size = OP_PART_SIZE_2;
#else
  int part_size = OP_part_size;
#endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers (after plan construction, so plan time is excluded)
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads
  // (nthreads is computed but not used by this variant; the parallel for
  //  below iterates over plan blocks directly)
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // execute plan: blocks of the same colour run in parallel
  int block_offset = 0;
  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];
    #pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
        (double *)arg0.data, Plan->ind_maps[0],
        (double *)arg2.data, Plan->ind_maps[1],
        (double *)arg4.data, Plan->ind_maps[2],
        (double *)arg6.data, Plan->ind_maps[3],
        Plan->loc_maps[0],
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        Plan->loc_maps[3],
        Plan->loc_maps[4],
        Plan->loc_maps[5],
        Plan->loc_maps[6],
        Plan->loc_maps[7],
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);
    block_offset += nblocks;
  }

  // combine reduction data

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name      = name;
  OP_kernels[2].count    += 1;
  OP_kernels[2].time     += wall_t2 - wall_t1;
  OP_kernels[2].transfer += Plan->transfer;
  OP_kernels[2].transfer2 += Plan->transfer2;
}
// host stub function
// Host stub for adt_calc (OMP4/offload variant): args 0..3 read one indirect
// dat through four indices of arg0's map, arg4 and arg5 are direct.
// Execution is driven by an OP_COLOR2 plan, one offloaded launch per colour.
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;

  // indirection descriptor: -1 marks the direct arguments (arg4, arg5)
  int ninds = 1;
  int inds[6] = {0,0,0,0,-1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

#ifdef OP_PART_SIZE_1
  int part_size = OP_PART_SIZE_1;
#else
  int part_size = OP_part_size;
#endif
#ifdef OP_BLOCK_SIZE_1
  int nthread = OP_BLOCK_SIZE_1;
#else
  int nthread = OP_block_size;
#endif

  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
    int map0size = arg0.map->dim * set_size1;
    float* data4 = (float*)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    float* data5 = (float*)arg5.data_d;
    int dat5size = getSetSizeFromOpArg(&arg5) * arg5.dat->dim;
    float *data0 = (float *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan: after the first colour, wait for the halo exchange
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      adt_calc_omp4_kernel(map0, map0size, data4, dat4size, data5, dat5size,
                           data0, dat0size, col_reord, set_size1, start, end,
                           part_size != 0 ? (end - start - 1) / part_size + 1
                                          : (end - start - 1) / nthread,
                           nthread);

    }
    OP_kernels[1].transfer  += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  // if no colour ever triggered the wait, do it here
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time += wall_t2 - wall_t1;
}
// Public timer query: forwards directly to the core implementation, which
// fills *cpu and *et with the current CPU and wall-clock readings.
void op_timers(double *cpu, double *et) {
  op_timers_core(cpu, et);
}
// MPI variant of op_timers: first synchronises all ranks on MPI_COMM_WORLD
// so every process reaches this point before timing, then reads the timers.
void op_timers(double *cpu, double *et) {
  MPI_Barrier(MPI_COMM_WORLD);
  op_timers_core(cpu, et);
}
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3){ double*arg3h = (double *)arg3.data; int nargs = 4; op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(8); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[8].name = name; OP_kernels[8].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges_cuda(set, nargs, args); #ifdef OP_PART_SIZE_8 int part_size = OP_PART_SIZE_8; #else int part_size = OP_part_size; #endif #ifdef OP_BLOCK_SIZE_8 int nthread = OP_BLOCK_SIZE_8; #else int nthread = OP_block_size; #endif double arg3_l = arg3h[0]; if (set->size >0) { //Set up typed device pointers for OpenMP double* data0 = (double*)arg0.data_d; int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim; double* data1 = (double*)arg1.data_d; int dat1size = getSetSizeFromOpArg(&arg1) * arg1.dat->dim; double* data2 = (double*)arg2.data_d; int dat2size = getSetSizeFromOpArg(&arg2) * arg2.dat->dim; update_omp4_kernel( data0, dat0size, data1, dat1size, data2, dat2size, &arg3_l, set->size, part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread, nthread); } // combine reduction data arg3h[0] = arg3_l; op_mpi_reduce_double(&arg3,arg3h); op_mpi_set_dirtybit_cuda(nargs, args); if (OP_diags>1) deviceSync(); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[8].time += wall_t2 - wall_t1; OP_kernels[8].transfer += (float)set->size * arg0.size * 2.0f; OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f; OP_kernels[8].transfer += (float)set->size * arg2.size; }
// Host stub for the indirect kernel res_calc (OpenMP, plan-based) with an
// integer global reduction (arg1): per-thread partial sums live in arg1_l
// (slots spaced 64 ints apart) and are folded into the host global once the
// owned colours have completed.
//
// Fix: the start timestamp (wall_t1) is only taken inside `if (set->size>0)`
// (after plan construction, to exclude plan time), but the elapsed time was
// accumulated unconditionally — on a rank with an empty partition this added
// the absolute end timestamp (wall_t2 - 0) to OP_kernels[0].time.  The
// accumulation is now guarded by the same condition.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  // host-side storage of the global reduction argument
  int *arg1h = (int *)arg1.data;

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  // indirection descriptor: -1 marks the global argument
  int ninds = 1;
  int inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
#ifdef OP_PART_SIZE_0
  int part_size = OP_PART_SIZE_0;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers (wall_t1 stays 0 unless the timer is started below)
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction
  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg1_l[d+thr*64]=ZERO_int;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // start timing after plan construction so plan time is excluded
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan colour by colour; wait for the halo exchange once the
    // core colours are done
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          Plan->ind_map,
          Plan->loc_map,
          &arg1_l[64*omp_get_thread_num()],
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      // combine reduction data once all owned colours are executed
      if (col == Plan->ncolors_owned-1) {
        for (int thr=0; thr<nthreads; thr++)
          for(int d=0; d<1; d++)
            arg1h[d] += arg1_l[d+thr*64];
      }
      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data across MPI ranks
  op_mpi_reduce(&arg1,arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record; only accumulate elapsed time when the timer was
  // actually started, otherwise wall_t2 - wall_t1 is an absolute clock value
  op_timers_core(&cpu_t2, &wall_t2);
  if (set->size > 0) {
    OP_kernels[0].time += wall_t2 - wall_t1;
  }
}
// Host stub for the indirect kernel res (OpenMP, plan-based): arg0 and arg3
// are direct (arg3 global), arg1 and arg2 indirect.  Executes colour by
// colour from an op_plan, waiting for the halo exchange at the first
// non-core colour.
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  // host-side view of the global argument
  // (NOTE(review): arg3h is declared but not used in this variant — the raw
  //  arg3.data pointer is passed to op_x86_res instead)
  float *arg3h = (float *)arg3.data;

  int nargs = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  // indirection descriptor: -1 marks the direct/global arguments
  int ninds = 2;
  int inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan
#ifdef OP_PART_SIZE_0
  int part_size = OP_PART_SIZE_0;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan colour by colour
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res( blockIdx,
          (float *)arg1.data,
          (float *)arg2.data,
          Plan->ind_map,
          Plan->loc_map,
          (float *)arg0.data,
          (float *)arg3.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);
      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
}
// host stub function
// Sequential host stub for res_calc with vector arguments: arg0, arg4 and
// arg9 are 4-way vector args gathered through arg0's map into pointer
// arrays, arg8 is direct (16 doubles per element).  Core elements run while
// the halo exchange is in flight; the loop blocks at core_size.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9){

  int nargs = 13;
  op_arg args[13];

  // expand each vector argument into one op_arg per map index
  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      // block for the halo exchange at the first non-core element
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      // the four map indices for this element
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      // gather pointers for each vector argument
      const double* arg0_vec[] = {
        &((double*)arg0.data)[2 * map0idx],
        &((double*)arg0.data)[2 * map1idx],
        &((double*)arg0.data)[2 * map2idx],
        &((double*)arg0.data)[2 * map3idx]};
      const double* arg4_vec[] = {
        &((double*)arg4.data)[1 * map0idx],
        &((double*)arg4.data)[1 * map1idx],
        &((double*)arg4.data)[1 * map2idx],
        &((double*)arg4.data)[1 * map3idx]};
      double* arg9_vec[] = {
        &((double*)arg9.data)[1 * map0idx],
        &((double*)arg9.data)[1 * map1idx],
        &((double*)arg9.data)[1 * map2idx],
        &((double*)arg9.data)[1 * map3idx]};

      res_calc(
        arg0_vec,
        arg4_vec,
        &((double*)arg8.data)[16 * n],
        arg9_vec);
    }
  }

  // if the loop never crossed core_size the exchange is still outstanding
  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
  OP_kernels[0].transfer += (float)set->size * arg9.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg8.size;
  OP_kernels[0].transfer += (float)set->size * arg0.map->dim * 4.0f;
}
// host stub function
// Host stub for adt_calc (OpenACC variant): args 0..3 read one indirect dat
// through four indices of arg0's map, arg4 and arg5 are direct.  Execution
// follows an OP_COLOR2 plan; elements are visited via the colour reordering
// array, one accelerated loop per colour.
void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4,
                          op_arg arg5) {

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;

  // indirection descriptor: -1 marks the direct arguments (arg4, arg5)
  int ninds = 1;
  int inds[6] = {0, 0, 0, 0, -1, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
#ifdef OP_PART_SIZE_1
  int part_size = OP_PART_SIZE_1;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;

  if (set->size > 0) {

    // Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    float *data4 = (float *)arg4.data_d;
    float *data5 = (float *)arg5.data_d;
    float *data0 = (float *)arg0.data_d;

    op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args,
                                      ninds, inds, OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan: after the first colour, wait for the halo exchange
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == 1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col + 1];

#pragma acc parallel loop independent deviceptr(col_reord, map0, data4, data5, \
                                                data0)
      for (int e = start; e < end; e++) {
        int n = col_reord[e];
        // the map is stored with stride set_size1 (one row per map index)
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        adt_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data0[2 * map2idx],
                 &data0[2 * map3idx], &data4[4 * n], &data5[1 * n]);
      }
    }
    OP_kernels[1].transfer += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  // if no colour ever triggered the wait, do it here
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time += wall_t2 - wall_t1;
}
// Host stub for the indirect kernel res_calc (OpenMP + MPI, plan-based):
// four indirect dats, each referenced by two arguments.  Executes colour by
// colour from an op_plan, waiting for the halo exchange at the first
// non-core colour.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6,
  op_arg arg7 ) {

  int nargs = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  args[6] = arg6;
  args[7] = arg7;

  // four indirect datasets, each referenced by two arguments
  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
#ifdef OP_PART_SIZE_2
  int part_size = OP_PART_SIZE_2;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan colour by colour
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          (double *)arg2.data,
          (double *)arg4.data,
          (double *)arg6.data,
          Plan->ind_map,
          Plan->loc_map,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);
      block_offset += nblocks;
    }
    op_timing_realloc(2);
    OP_kernels[2].transfer  += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name   = name;
  OP_kernels[2].count += 1;
  OP_kernels[2].time  += wall_t2 - wall_t1;
}
// host stub function void op_par_loop_res(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3){ int nargs = 4; op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); int ninds = 2; int inds[4] = {-1,0,1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: res\n"); } // get plan #ifdef OP_PART_SIZE_0 int part_size = OP_PART_SIZE_0; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0); // execute plan int block_offset = 0; for ( int col=0; col<Plan->ncolors; col++ ){ if (col==Plan->ncolors_core) { op_mpi_wait_all(nargs, args); } int nblocks = Plan->ncolblk[col]; #pragma omp parallel for for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){ int blockId = Plan->blkmap[blockIdx + block_offset]; int nelem = Plan->nelems[blockId]; int offset_b = Plan->offset[blockId]; for ( int n=offset_b; n<offset_b+nelem; n++ ){ int map1idx = arg1.map_data[n * arg1.map->dim + 1]; int map2idx = arg1.map_data[n * arg1.map->dim + 0]; res( &((float*)arg0.data)[1 * n], &((float*)arg1.data)[1 * map1idx], &((float*)arg2.data)[1 * map2idx], (float*)arg3.data); } } block_offset += nblocks; } OP_kernels[0].transfer += Plan->transfer; OP_kernels[0].transfer2 += Plan->transfer2; } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; }
void op_par_loop_SpaceDiscretization(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5, op_arg arg6 ){ int nargs = 8; op_arg args[8]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; arg6.idx = 0; args[6] = arg6; for (int v = 1; v < 2; v++) { args[6 + v] = op_arg_dat(arg6.dat, v, arg6.map, 1, "float", OP_READ); } int ninds = 2; int inds[8] = {0,0,-1,-1,-1,-1,1,1}; if (OP_diags>2) { printf(" kernel routine with indirection: SpaceDiscretization\n"); } // get plan #ifdef OP_PART_SIZE_18 int part_size = OP_PART_SIZE_18; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges(set, nargs, args); // initialise timers double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0; op_timing_realloc(18); OP_kernels[18].name = name; OP_kernels[18].count += 1; if (set->size >0) { op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); op_timers_core(&cpu_t1, &wall_t1); // execute plan int block_offset = 0; for (int col=0; col < Plan->ncolors; col++) { if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args); int nblocks = Plan->ncolblk[col]; #pragma omp parallel for for (int blockIdx=0; blockIdx<nblocks; blockIdx++) op_x86_SpaceDiscretization( blockIdx, (float *)arg0.data, (float *)arg6.data, Plan->ind_map, Plan->loc_map, (float *)arg2.data, (float *)arg3.data, (float *)arg4.data, (int *)arg5.data, Plan->ind_sizes, Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, Plan->nelems, Plan->nthrcol, Plan->thrcol, set_size); block_offset += nblocks; } op_timing_realloc(18); OP_kernels[18].transfer += Plan->transfer; OP_kernels[18].transfer2 += Plan->transfer2; } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[18].time += wall_t2 - wall_t1; }
// host stub function for the res_calc kernel — latency-hiding MPI variant.
// Executes core elements first (overlapping with halo exchange), then
// waits for halos and executes the non-core remainder, using cached
// per-set core/noncore sub-sets and offset plans.
//
// Fixes relative to the original:
//  * OP_kernels[2] was updated (time/transfer) before any
//    op_timing_realloc call in this function, and the realloc at the end
//    used index 3 while every record access uses index 2. The realloc is
//    now op_timing_realloc(2) and happens before the first use.
//  * Removed an unused file-local `int block_offset;` that was shadowed
//    by the per-branch declarations, and two unused `nthreads`
//    computations.
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7 ){
  int nargs = 8;
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};

  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};
  int sent[8] = {0,0,0,0,0,0,0,0}; //array to set if halo is exchanged

  if(ninds > 0) //indirect loop
  {
    for(int i = 0; i<nargs; i++)
    {
      if(args[i].argtype == OP_ARG_DAT)
      {
        if (OP_diags==1) reset_halo(args[i]);
        sent[i] = exchange_halo(args[i]); // non-blocking; waited on below
      }
    }
  }

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc \n");
  }

  // get plan
  op_plan *Plan;
  #ifdef OP_PART_SIZE_2
  int part_size = OP_PART_SIZE_2;
  #else
  int part_size = OP_part_size;
  #endif

  //get offsets: core elements can run before halos arrive, the rest cannot
  int core_len = core_num[set->index];
  int noncore_len = set->size + OP_import_exec_list[set->index]->size - core_len;

  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // make sure kernel record slot 2 exists before any OP_kernels[2] update
  op_timing_realloc(2);

  //process core set
  if (core_len>0)
  {
    if (OP_latency_sets[set->index].core_set == NULL)
    {
      // build and cache a pseudo-set covering only the core elements
      op_set core_set = (op_set)malloc(sizeof(op_set_core));
      core_set->index = set->index;
      core_set->name = set->name;
      core_set->size = core_len;
      core_set->exec_size = 0;
      core_set->nonexec_size = 0;
      OP_latency_sets[set->index].core_set = core_set;
    }
    Plan = op_plan_get_offset(name,OP_latency_sets[set->index].core_set, 0,part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;
    for(int col=0; col < Plan->ncolors; col++)
    {
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data, Plan->ind_maps[0],
          (double *)arg2.data, Plan->ind_maps[1],
          (double *)arg4.data, Plan->ind_maps[2],
          (double *)arg6.data, Plan->ind_maps[3],
          Plan->loc_maps[0], Plan->loc_maps[1],
          Plan->loc_maps[2], Plan->loc_maps[3],
          Plan->loc_maps[4], Plan->loc_maps[5],
          Plan->loc_maps[6], Plan->loc_maps[7],
          Plan->ind_sizes, Plan->ind_offs,
          block_offset, Plan->blkmap, Plan->offset, Plan->nelems,
          Plan->nthrcol, Plan->thrcol);
      block_offset += nblocks;
    }
    op_timers_core(&cpu_t2, &wall_t2);
    OP_kernels[2].time += wall_t2 - wall_t1;
    OP_kernels[2].transfer += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  // core work done: now the halos are actually needed — wait for them
  if(ninds > 0) //indirect loop
  {
    for(int i = 0; i<nargs; i++)
    {
      if(args[i].argtype == OP_ARG_DAT)
      {
        if(sent[i] == 1)wait_all(args[i]);
      }
    }
  }

  //process non-core (owned-but-boundary + imported exec) elements
  if (noncore_len>0)
  {
    if (OP_latency_sets[set->index].noncore_set == NULL)
    {
      op_set noncore_set = (op_set)malloc(sizeof (op_set_core));
      noncore_set->size = noncore_len;
      noncore_set->name = set->name;
      noncore_set->index = set->index;
      noncore_set->exec_size = 0;
      noncore_set->nonexec_size = 0;
      OP_latency_sets[set->index].noncore_set = noncore_set;
    }
    Plan = op_plan_get_offset(name,OP_latency_sets[set->index].noncore_set,core_len, part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++)
    {
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data, Plan->ind_maps[0],
          (double *)arg2.data, Plan->ind_maps[1],
          (double *)arg4.data, Plan->ind_maps[2],
          (double *)arg6.data, Plan->ind_maps[3],
          Plan->loc_maps[0], Plan->loc_maps[1],
          Plan->loc_maps[2], Plan->loc_maps[3],
          Plan->loc_maps[4], Plan->loc_maps[5],
          Plan->loc_maps[6], Plan->loc_maps[7],
          Plan->ind_sizes, Plan->ind_offs,
          block_offset, Plan->blkmap, Plan->offset, Plan->nelems,
          Plan->nthrcol, Plan->thrcol);
      block_offset += nblocks;
    }
    op_timers_core(&cpu_t2, &wall_t2);
    OP_kernels[2].time += wall_t2 - wall_t1;
    OP_kernels[2].transfer += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  //set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
  for(int i = 0; i<nargs; i++)
    if(args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  //perform any global operations
  // - NONE

  // update kernel record
  OP_kernels[2].name = name;
  OP_kernels[2].count += 1;
}
void op_par_loop_dotR(char const *name, op_set set, op_arg arg0, op_arg arg1 ){ double *arg1h = (double *)arg1.data; int nargs = 2; op_arg args[2]; args[0] = arg0; args[1] = arg1; if (OP_diags>2) { printf(" kernel routine w/o indirection: dotR\n"); } op_mpi_halo_exchanges(set, nargs, args); // initialise timers double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0; op_timing_realloc(6); OP_kernels[6].name = name; OP_kernels[6].count += 1; // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads( ); #else int nthreads = 1; #endif // allocate and initialise arrays for global reduction double arg1_l[1+64*64]; for (int thr=0; thr<nthreads; thr++) for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_double; if (set->size >0) { op_timers_core(&cpu_t1, &wall_t1); // execute plan #pragma omp parallel for for (int thr=0; thr<nthreads; thr++) { int start = (set->size* thr )/nthreads; int finish = (set->size*(thr+1))/nthreads; op_x86_dotR( (double *) arg0.data, arg1_l + thr*64, start, finish ); } } // combine reduction data for (int thr=0; thr<nthreads; thr++) for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64]; op_mpi_reduce(&arg1,arg1h); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[6].time += wall_t2 - wall_t1; OP_kernels[6].transfer += (float)set->size * arg0.size; }
/*
 * op_plan_core: build (or reuse) an execution plan for an indirect
 * op_par_loop. A plan partitions the set into blocks, gathers the
 * indirectly-referenced elements per block, colors elements within each
 * block (thread colors) and colors the blocks themselves so that blocks
 * of the same color touch disjoint indirect data. It also estimates
 * shared-memory and data-transfer requirements.
 *
 * Plans are cached in the global OP_plans array keyed on kernel name,
 * set, args, and partition size; a matching cached plan is returned
 * immediately. The returned pointer is owned by the cache — callers must
 * not free it.
 *
 * `staging` selects the strategy: OP_STAGE_INC / OP_STAGE_ALL /
 * OP_STAGE_PERMUTE stage data into shared memory (with optional element
 * permutation); OP_COLOR2 uses a single block with one level of coloring.
 */
op_plan *op_plan_core(char const *name, op_set set, int part_size, int nargs, op_arg *args, int ninds, int *inds, int staging) {
  // set exec length: include imported exec-halo elements only if some
  // indirect, non-read arg forces execution over the halo
  int exec_length = set->size;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_READ) {
      exec_length += set->exec_size;
      break;
    }
  }

  /* first look for an existing execution plan */
  int ip = 0, match = 0;
  while (match == 0 && ip < OP_plan_index) {
    if ((strcmp(name, OP_plans[ip].name) == 0) && (set == OP_plans[ip].set) && (nargs == OP_plans[ip].nargs) && (ninds == OP_plans[ip].ninds) && (part_size == OP_plans[ip].part_size)) {
      match = 1;
      for (int m = 0; m < nargs; m++) {
        if (args[m].dat != NULL && OP_plans[ip].dats[m] != NULL)
          match = match && (args[m].dat->size == OP_plans[ip].dats[m]->size) && (args[m].dat->dim == OP_plans[ip].dats[m]->dim) && (args[m].map == OP_plans[ip].maps[m]) && (args[m].idx == OP_plans[ip].idxs[m]) && (args[m].acc == OP_plans[ip].accs[m]);
        else
          match = match && (args[m].dat == OP_plans[ip].dats[m]) && (args[m].map == OP_plans[ip].maps[m]) && (args[m].idx == OP_plans[ip].idxs[m]) && (args[m].acc == OP_plans[ip].accs[m]);
      }
    }
    ip++;
  }

  if (match) {
    ip--;
    if (OP_diags > 3)
      printf(" old execution plan #%d\n", ip);
    OP_plans[ip].count++;
    return &(OP_plans[ip]);
  } else {
    if (OP_diags > 1)
      printf(" new execution plan #%d for kernel %s\n", ip, name);
  }

  double wall_t1, wall_t2, cpu_t1, cpu_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  /* work out worst case shared memory requirement per element */
  // halo_exchange: true if any indirect arg reads remote data
  int halo_exchange = 0;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_WRITE && args[i].acc != OP_INC) {
      halo_exchange = 1;
      break;
    }
  }

  int maxbytes = 0;
  for (int m = 0; m < nargs; m++) {
    if (args[m].opt && inds[m] >= 0) {
      if ((staging == OP_STAGE_INC && args[m].acc == OP_INC) || (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))
        maxbytes += args[m].dat->size;
    }
  }

  /* set blocksize and number of blocks; adaptive size based on 48kB of
     shared memory */
  int bsize = part_size; // blocksize
  if (bsize == 0 && maxbytes > 0)
    bsize = MAX((24 * 1024 / (64 * maxbytes)) * 64, 256); // 48kB exactly is too much, make it 24
  else if (bsize == 0 && maxbytes == 0)
    bsize = 256;
  // If we do 1 level of coloring, do it in one go
  if (staging == OP_COLOR2)
    bsize = exec_length;

  int nblocks = 0;

  // indirect_reduce: a global reduction in an indirect loop forces a
  // block boundary at set->size (owned/halo split)
  int indirect_reduce = 0;
  for (int m = 0; m < nargs; m++) {
    indirect_reduce |= (args[m].acc != OP_READ && args[m].argtype == OP_ARG_GBL);
  }
  indirect_reduce &= (ninds > 0);

  /* Work out indirection arrays for OP_INCs */
  int ninds_staged = 0; // number of distinct (unique dat) indirect incs
  int *inds_staged = (int *)op_malloc(nargs * sizeof(int));
  int *inds_to_inds_staged = (int *)op_malloc(ninds * sizeof(int));

  for (int i = 0; i < nargs; i++)
    inds_staged[i] = -1;
  for (int i = 0; i < ninds; i++)
    inds_to_inds_staged[i] = -1;

  for (int i = 0; i < nargs; i++) {
    if (inds[i] >= 0 && ((staging == OP_STAGE_INC && args[i].acc == OP_INC) || (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))) {
      if (inds_to_inds_staged[inds[i]] == -1) {
        inds_to_inds_staged[inds[i]] = ninds_staged;
        inds_staged[i] = ninds_staged;
        ninds_staged++;
      } else {
        inds_staged[i] = inds_to_inds_staged[inds[i]];
      }
    }
  }

  // invinds_staged[s]: first arg index using staged indirection s
  int *invinds_staged = (int *)op_malloc(ninds_staged * sizeof(int));
  for (int i = 0; i < ninds_staged; i++)
    invinds_staged[i] = -1;
  for (int i = 0; i < nargs; i++)
    if (inds[i] >= 0 && ((staging == OP_STAGE_INC && args[i].acc == OP_INC) || (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE)) && invinds_staged[inds_staged[i]] == -1)
      invinds_staged[inds_staged[i]] = i;

  // count blocks: boundaries are forced at core_size (MPI overlap) and,
  // with an indirect reduction, at set->size (owned/halo split)
  int prev_offset = 0;
  int next_offset = 0;
  while (next_offset < exec_length) {
    prev_offset = next_offset;
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size && indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length && prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;
    }
    nblocks++;
  }
  // If we do 1 level of coloring, we have a single "block"
  if (staging == OP_COLOR2) {
    nblocks = 1;
    prev_offset = 0;
    next_offset = exec_length;
  };

  /* enlarge OP_plans array if needed */
  if (ip == OP_plan_max) {
    // printf("allocating more memory for OP_plans %d\n", OP_plan_max);
    OP_plan_max += 10;
    OP_plans = (op_plan *)op_realloc(OP_plans, OP_plan_max * sizeof(op_plan));
    if (OP_plans == NULL) {
      printf(" op_plan error -- error reallocating memory for OP_plans\n");
      exit(-1);
    }
  }

  /* allocate memory for new execution plan and store input arguments */
  OP_plans[ip].dats = (op_dat *)op_malloc(nargs * sizeof(op_dat));
  OP_plans[ip].idxs = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].optflags = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].maps = (op_map *)op_malloc(nargs * sizeof(op_map));
  OP_plans[ip].accs = (op_access *)op_malloc(nargs * sizeof(op_access));
  OP_plans[ip].inds_staged = (op_access *)op_malloc(ninds_staged * sizeof(op_access));
  OP_plans[ip].nthrcol = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].thrcol = (int *)op_malloc(exec_length * sizeof(int));
  OP_plans[ip].col_reord = (int *)op_malloc((exec_length + 16) * sizeof(int));
  OP_plans[ip].col_offsets = NULL;
  OP_plans[ip].offset = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ind_maps = (int **)op_malloc(ninds_staged * sizeof(int *));
  OP_plans[ip].ind_offs = (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].ind_sizes = (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].nindirect = (int *)op_calloc(ninds, sizeof(int));
  OP_plans[ip].loc_maps = (short **)op_malloc(nargs * sizeof(short *));
  OP_plans[ip].nelems = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ncolblk = (int *)op_calloc(exec_length, sizeof(int)); /* max possibly needed */
  OP_plans[ip].blkmap = (int *)op_calloc(nblocks, sizeof(int));

  // offsets[m]: running count of args per staged indirection, used to
  // partition the single ind_map allocation into per-indirection slices
  int *offsets = (int *)op_malloc((ninds_staged + 1) * sizeof(int));
  offsets[0] = 0;
  for (int m = 0; m < ninds_staged; m++) {
    int count = 0;
    for (int m2 = 0; m2 < nargs; m2++)
      if (inds_staged[m2] == m)
        count++;
    offsets[m + 1] = offsets[m] + count;
  }
  OP_plans[ip].ind_map = (int *)op_malloc(offsets[ninds_staged] * exec_length * sizeof(int));
  for (int m = 0; m < ninds_staged; m++) {
    OP_plans[ip].ind_maps[m] = &OP_plans[ip].ind_map[exec_length * offsets[m]];
  }
  free(offsets);

  int counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0)
      counter++;
    else
      OP_plans[ip].loc_maps[m] = NULL;
    OP_plans[ip].dats[m] = args[m].dat;
    OP_plans[ip].idxs[m] = args[m].idx;
    OP_plans[ip].optflags[m] = args[m].opt;
    OP_plans[ip].maps[m] = args[m].map;
    OP_plans[ip].accs[m] = args[m].acc;
  }

  OP_plans[ip].loc_map = (short *)op_malloc(counter * exec_length * sizeof(short));
  counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0) {
      OP_plans[ip].loc_maps[m] = &OP_plans[ip].loc_map[exec_length * (counter)];
      counter++;
    }
  }

  OP_plans[ip].name = name;
  OP_plans[ip].set = set;
  OP_plans[ip].nargs = nargs;
  OP_plans[ip].ninds = ninds;
  OP_plans[ip].ninds_staged = ninds_staged;
  OP_plans[ip].part_size = part_size;
  OP_plans[ip].nblocks = nblocks;
  OP_plans[ip].ncolors_core = 0;
  OP_plans[ip].ncolors_owned = 0;
  OP_plans[ip].count = 1;
  OP_plans[ip].inds_staged = inds_staged;

  OP_plan_index++;

  /* define aliases */
  op_dat *dats = OP_plans[ip].dats;
  int *idxs = OP_plans[ip].idxs;
  op_map *maps = OP_plans[ip].maps;
  op_access *accs = OP_plans[ip].accs;
  int *offset = OP_plans[ip].offset;
  int *nelems = OP_plans[ip].nelems;
  int **ind_maps = OP_plans[ip].ind_maps;
  int *ind_offs = OP_plans[ip].ind_offs;
  int *ind_sizes = OP_plans[ip].ind_sizes;
  int *nindirect = OP_plans[ip].nindirect;

  /* allocate working arrays */
  // work[m]: per-indirection scratch indexed by target-set element; used
  // first as an inverse-renumbering table, later as per-element color masks
  uint **work;
  work = (uint **)op_malloc(ninds * sizeof(uint *));

  for (int m = 0; m < ninds; m++) {
    int m2 = 0;
    while (inds[m2] != m)
      m2++;
    if (args[m2].opt == 0) {
      work[m] = NULL;
      continue;
    }

    int to_size = (maps[m2]->to)->exec_size + (maps[m2]->to)->nonexec_size + (maps[m2]->to)->size;
    work[m] = (uint *)op_malloc(to_size * sizeof(uint));
  }

  int *work2;
  work2 = (int *)op_malloc(nargs * bsize * sizeof(int)); /* max possibly needed */

  /* process set one block at a time */
  float total_colors = 0;

  prev_offset = 0;
  next_offset = 0;

  for (int b = 0; b < nblocks; b++) {
    prev_offset = next_offset;
    // same block-boundary rules as in the counting loop above
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size && indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length && prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;
    }

    if (staging == OP_COLOR2) {
      prev_offset = 0;
      next_offset = exec_length;
    };

    int bs = next_offset - prev_offset;

    offset[b] = prev_offset; /* offset for block */
    nelems[b] = bs;          /* size of block */

    /* loop over indirection sets */
    for (int m = 0; m < ninds; m++) {
      int m2 = 0;
      while (inds[m2] != m)
        m2++;
      int m3 = inds_staged[m2];
      if (m3 < 0)
        continue;
      if (args[m2].opt == 0) {
        // optional arg disabled: carry offsets forward with zero size
        if (b == 0) {
          ind_offs[m3 + b * ninds_staged] = 0;
          ind_sizes[m3 + b * ninds_staged] = 0;
        } else {
          ind_offs[m3 + b * ninds_staged] = ind_offs[m3 + (b - 1) * ninds_staged];
          ind_sizes[m3 + b * ninds_staged] = 0;
        }
        continue;
      }

      /* build the list of elements indirectly referenced in this block */
      int ne = 0; /* number of elements */
      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            work2[ne++] = maps[m2]->map[idxs[m2] + e * maps[m2]->dim];
        }
      }

      /* sort them, then eliminate duplicates */
      qsort(work2, ne, sizeof(int), comp);

      int nde = 0;
      int p = 0;
      while (p < ne) {
        work2[nde] = work2[p];
        while (p < ne && work2[p] == work2[nde])
          p++;
        nde++;
      }
      ne = nde; /* number of distinct elements */

      /* if (OP_diags > 5) { printf(" indirection set %d: ",m); for (int
         e=0; e<ne; e++) printf(" %d",work2[e]); printf(" \n"); } */

      /* store mapping and renumbered mappings in execution plan */
      for (int e = 0; e < ne; e++) {
        ind_maps[m3][nindirect[m]++] = work2[e];
        work[m][work2[e]] = e; // inverse mapping
      }

      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            OP_plans[ip].loc_maps[m2][e] = (short)(work[m][maps[m2]->map[idxs[m2] + e * maps[m2]->dim]]);
        }
      }

      if (b == 0) {
        ind_offs[m3 + b * ninds_staged] = 0;
        ind_sizes[m3 + b * ninds_staged] = nindirect[m];
      } else {
        ind_offs[m3 + b * ninds_staged] = ind_offs[m3 + (b - 1) * ninds_staged] + ind_sizes[m3 + (b - 1) * ninds_staged];
        ind_sizes[m3 + b * ninds_staged] = nindirect[m] - ind_offs[m3 + b * ninds_staged];
      }
    }

    /* now colour main set elements */
    for (int e = prev_offset; e < next_offset; e++)
      OP_plans[ip].thrcol[e] = -1;

    // greedy 32-colors-per-pass coloring: each pass assigns colors
    // [ncolor, ncolor+31]; elements that cannot be colored this pass are
    // retried in the next pass (repeat flag)
    int repeat = 1;
    int ncolor = 0;
    int ncolors = 0;

    while (repeat) {
      repeat = 0;

      for (int m = 0; m < nargs; m++) {
        if (inds[m] >= 0 && args[m].opt)
          for (int e = prev_offset; e < next_offset; e++)
            work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] = 0; /* zero out color array */
      }

      for (int e = prev_offset; e < next_offset; e++) {
        if (OP_plans[ip].thrcol[e] == -1) {
          int mask = 0;
          // with COLOR2 + halo exchange, keep non-core elements out of
          // color 0 so the first color can run before halos arrive
          if (staging == OP_COLOR2 && halo_exchange && e >= set->core_size && ncolor == 0)
            mask = 1;
          for (int m = 0; m < nargs; m++)
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) && args[m].opt)
              mask |= work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]]; /* set bits of mask */

          int color = ffs(~mask) - 1; /* find first bit not set */
          if (color == -1) {          /* run out of colors on this pass */
            repeat = 1;
          } else {
            OP_plans[ip].thrcol[e] = ncolor + color;
            mask = 1 << color;
            ncolors = MAX(ncolors, ncolor + color + 1);

            for (int m = 0; m < nargs; m++)
              if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) && args[m].opt)
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |= mask; /* set color bit */
          }
        }
      }

      ncolor += 32; /* increment base level */
    }

    OP_plans[ip].nthrcol[b] = ncolors; /* number of thread colors in this block */
    total_colors += ncolors;

    // if(ncolors>1) printf(" number of colors in this block = %d \n",ncolors);
  }

  /* create element permutation by color */
  if (staging == OP_STAGE_PERMUTE || staging == OP_COLOR2) {
    int size_of_col_offsets = 0;
    for (int b = 0; b < nblocks; b++) {
      size_of_col_offsets += OP_plans[ip].nthrcol[b] + 1;
    }
    // allocate
    OP_plans[ip].col_offsets = (int **)op_malloc(nblocks * sizeof(int *));
    int *col_offsets = (int *)op_malloc(size_of_col_offsets * sizeof(int *));
    size_of_col_offsets = 0;
    op_keyvalue *kv = (op_keyvalue *)op_malloc(bsize * sizeof(op_keyvalue));
    for (int b = 0; b < nblocks; b++) {
      int ncolor = OP_plans[ip].nthrcol[b];
      // stable-sort block elements by their thread color
      for (int e = 0; e < nelems[b]; e++) {
        kv[e].key = OP_plans[ip].thrcol[offset[b] + e];
        kv[e].value = e;
      }
      qsort(kv, nelems[b], sizeof(op_keyvalue), comp2);
      OP_plans[ip].col_offsets[b] = col_offsets + size_of_col_offsets;
      OP_plans[ip].col_offsets[b][0] = 0;
      size_of_col_offsets += (ncolor + 1);
      // Set up permutation and pointers to beginning of each color
      ncolor = 0;
      for (int e = 0; e < nelems[b]; e++) {
        OP_plans[ip].thrcol[offset[b] + e] = kv[e].key;
        OP_plans[ip].col_reord[offset[b] + e] = kv[e].value;
        if (e > 0)
          if (kv[e].key > kv[e - 1].key) {
            ncolor++;
            OP_plans[ip].col_offsets[b][ncolor] = e;
          }
      }
      OP_plans[ip].col_offsets[b][ncolor + 1] = nelems[b];
    }
    // pad the reorder array (16 extra slots were allocated above)
    for (int i = exec_length; i < exec_length + 16; i++)
      OP_plans[ip].col_reord[i] = 0;
  }

  /* color the blocks, after initialising colors to 0 */
  int *blk_col;
  blk_col = (int *)op_malloc(nblocks * sizeof(int));
  for (int b = 0; b < nblocks; b++)
    blk_col[b] = -1;

  int repeat = 1;
  int ncolor = 0;
  int ncolors = 0;

  while (repeat) {
    repeat = 0;

    for (int m = 0; m < nargs; m++) {
      if (inds[m] >= 0 && args[m].opt) {
        int to_size = (maps[m]->to)->exec_size + (maps[m]->to)->nonexec_size + (maps[m]->to)->size;
        for (int e = 0; e < to_size; e++)
          work[inds[m]][e] = 0; // zero out color arrays
      }
    }
    prev_offset = 0;
    next_offset = 0;
    for (int b = 0; b < nblocks; b++) {
      prev_offset = next_offset;
      if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
        next_offset = set->core_size;
      } else if (prev_offset + bsize >= set->size && prev_offset < set->size && indirect_reduce) {
        next_offset = set->size;
      } else if (prev_offset + bsize >= exec_length && prev_offset < exec_length) {
        next_offset = exec_length;
      } else {
        next_offset = prev_offset + bsize;
      }
      if (blk_col[b] == -1) { // color not yet assigned to block
        uint mask = 0;
        if (next_offset > set->core_size) { // should not use block colors from
                                            // the core set when doing the
                                            // non_core ones
          if (prev_offset <= set->core_size)
            OP_plans[ip].ncolors_core = ncolors;
          for (int shifter = 0; shifter < OP_plans[ip].ncolors_core; shifter++)
            mask |= 1 << shifter;
          if (prev_offset == set->size && indirect_reduce)
            OP_plans[ip].ncolors_owned = ncolors;
          for (int shifter = OP_plans[ip].ncolors_core; indirect_reduce && shifter < OP_plans[ip].ncolors_owned; shifter++)
            mask |= 1 << shifter;
        }

        for (int m = 0; m < nargs; m++) {
          if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) && args[m].opt)
            for (int e = prev_offset; e < next_offset; e++)
              mask |= work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]]; // set
                                                                               // bits of
                                                                               // mask
        }

        int color = ffs(~mask) - 1; // find first bit not set
        if (color == -1) {          // run out of colors on this pass
          repeat = 1;
        } else {
          blk_col[b] = ncolor + color;
          mask = 1 << color;
          ncolors = MAX(ncolors, ncolor + color + 1);

          for (int m = 0; m < nargs; m++) {
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) && args[m].opt)
              for (int e = prev_offset; e < next_offset; e++)
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |= mask;
          }
        }
      }
    }

    ncolor += 32; // increment base level
  }

  /* store block mapping and number of blocks per color */
  if (indirect_reduce && OP_plans[ip].ncolors_owned == 0)
    OP_plans[ip].ncolors_owned = ncolors; // no MPI, so get the reduction arrays after everyting is done

  OP_plans[ip].ncolors = ncolors;
  if (staging == OP_COLOR2)
    OP_plans[ip].ncolors = OP_plans[ip].nthrcol[0];

  /*for(int col = 0; col = OP_plans[ip].ncolors;col++) //should initialize to zero because op_calloc returns garbage!! { OP_plans[ip].ncolblk[col] = 0; }*/

  for (int b = 0; b < nblocks; b++)
    OP_plans[ip].ncolblk[blk_col[b]]++; // number of blocks of each color

  // counting-sort blocks into blkmap ordered by color, via cumsum
  for (int c = 1; c < ncolors; c++)
    OP_plans[ip].ncolblk[c] += OP_plans[ip].ncolblk[c - 1]; // cumsum

  for (int c = 0; c < ncolors; c++)
    work2[c] = 0;

  for (int b = 0; b < nblocks; b++) {
    int c = blk_col[b];
    int b2 = work2[c]; // number of preceding blocks of this color
    if (c > 0)
      b2 += OP_plans[ip].ncolblk[c - 1]; // plus previous colors

    OP_plans[ip].blkmap[b2] = b;

    work2[c]++; // increment counter
  }

  for (int c = ncolors - 1; c > 0; c--)
    OP_plans[ip].ncolblk[c] -= OP_plans[ip].ncolblk[c - 1]; // undo cumsum

  /* reorder blocks by color? */

  /* work out shared memory requirements */
  OP_plans[ip].nsharedCol = (int *)op_malloc(ncolors * sizeof(int));
  float total_shared = 0;
  for (int col = 0; col < ncolors; col++) {
    OP_plans[ip].nsharedCol[col] = 0;
    for (int b = 0; b < nblocks; b++) {
      if (blk_col[b] == col) {
        int nbytes = 0;
        for (int m = 0; m < ninds_staged; m++) {
          int m2 = 0;
          while (inds_staged[m2] != m)
            m2++;
          if (args[m2].opt == 0)
            continue;
          nbytes += ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
        }
        OP_plans[ip].nsharedCol[col] = MAX(OP_plans[ip].nsharedCol[col], nbytes);
        total_shared += nbytes;
      }
    }
  }

  OP_plans[ip].nshared = 0;
  total_shared = 0;
  for (int b = 0; b < nblocks; b++) {
    int nbytes = 0;
    for (int m = 0; m < ninds_staged; m++) {
      int m2 = 0;
      while (inds_staged[m2] != m)
        m2++;
      if (args[m2].opt == 0)
        continue;
      nbytes += ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
    }
    OP_plans[ip].nshared = MAX(OP_plans[ip].nshared, nbytes);
    total_shared += nbytes;
  }

  /* work out total bandwidth requirements */
  OP_plans[ip].transfer = 0;
  OP_plans[ip].transfer2 = 0;
  float transfer3 = 0;

  if (staging != OP_COLOR2 && staging != OP_STAGE_INC) {
    for (int b = 0; b < nblocks; b++) {
      for (int m = 0; m < nargs; m++) // for each argument
      {
        if (args[m].opt) {
          if (inds[m] < 0) // if it is directly addressed
          {
            float fac = 2.0f;
            if (accs[m] == OP_READ || accs[m] == OP_WRITE) // if you only read or write it
              fac = 1.0f;
            if (dats[m] != NULL) {
              OP_plans[ip].transfer += fac * nelems[b] * dats[m]->size; // cost of reading it all
              OP_plans[ip].transfer2 += fac * nelems[b] * dats[m]->size;
              transfer3 += fac * nelems[b] * dats[m]->size;
            }
          } else // if it is indirectly addressed: cost of reading the pointer
                 // to it
          {
            OP_plans[ip].transfer += nelems[b] * sizeof(short);
            OP_plans[ip].transfer2 += nelems[b] * sizeof(short);
            transfer3 += nelems[b] * sizeof(short);
          }
        }
      }
      for (int m = 0; m < ninds; m++) // for each indirect mapping
      {
        int m2 = 0;
        while (inds[m2] != m) // find the first argument that uses this mapping
          m2++;
        if (args[m2].opt == 0)
          continue;
        float fac = 2.0f;
        if (accs[m2] == OP_READ || accs[m2] == OP_WRITE) // only read it
          fac = 1.0f;
        if (staging == OP_STAGE_INC && accs[m2] != OP_INC) {
          OP_plans[ip].transfer += 1;
          OP_plans[ip].transfer2 += 1;
          continue;
        }
        OP_plans[ip].transfer += fac * ind_sizes[m + b * ninds] * dats[m2]->size; // simply read all data one by one

        /* work out how many cache lines are used by indirect addressing */
        int i_map, l_new, l_old;
        int e0 = ind_offs[m + b * ninds];       // where it starts
        int e1 = e0 + ind_sizes[m + b * ninds]; // where it ends

        l_old = -1;
        for (int e = e0; e < e1; e++) // iterate through every indirectly accessed data element
        {
          i_map = ind_maps[m][e]; // the pointer to the data element
          l_new = (i_map * dats[m2]->size) / OP_cache_line_size; // which cache line it is on (full size,
                                                                 // dim*sizeof(type))
          if (l_new > l_old) // if it is on a further cache line (that is not
                             // yet loaded, - i_map is ordered)
            OP_plans[ip].transfer2 += fac * OP_cache_line_size; // load the cache line
          l_old = l_new;
          l_new = ((i_map + 1) * dats[m2]->size - 1) / OP_cache_line_size; // the last byte of the data
          OP_plans[ip].transfer2 += fac * (l_new - l_old) * OP_cache_line_size; // again, if not loaded,
                                                                                // load it (can be
                                                                                // multiple cache lines)
          l_old = l_new;
        }
        l_old = -1;
        for (int e = e0; e < e1; e++) {
          i_map = ind_maps[m][e]; // pointer to the data element
          l_new = (i_map * dats[m2]->size) / (dats[m2]->dim * OP_cache_line_size); // which cache line the
                                                                                   // first dimension of
                                                                                   // the data is on
          if (l_new > l_old)
            transfer3 += fac * dats[m2]->dim * OP_cache_line_size; // if not loaded yet, load all cache lines
          l_old = l_new;
          l_new = ((i_map + 1) * dats[m2]->size - 1) / (dats[m2]->dim * OP_cache_line_size); // primitve type's last byte
          transfer3 += fac * (l_new - l_old) * dats[m2]->dim * OP_cache_line_size; // load it
          l_old = l_new;
        }

        /* also include mappings to load/store data */
        fac = 1.0f;
        if (accs[m2] == OP_RW)
          fac = 2.0f;
        OP_plans[ip].transfer += fac * ind_sizes[m + b * ninds] * sizeof(int);
        OP_plans[ip].transfer2 += fac * ind_sizes[m + b * ninds] * sizeof(int);
        transfer3 += fac * ind_sizes[m + b * ninds] * sizeof(int);
      }
    }
  }

  /* print out useful information */
  if (OP_diags > 1) {
    printf(" number of blocks = %d \n", nblocks);
    printf(" number of block colors = %d \n", OP_plans[ip].ncolors);
    printf(" maximum block size = %d \n", bsize);
    printf(" average thread colors = %.2f \n", total_colors / nblocks);
    printf(" shared memory required = ");
    for (int i = 0; i < ncolors - 1; i++)
      printf(" %.2f KB,", OP_plans[ip].nsharedCol[i] / 1024.0f);
    printf(" %.2f KB\n", OP_plans[ip].nsharedCol[ncolors - 1] / 1024.0f);
    printf(" average data reuse = %.2f \n", maxbytes * (exec_length / total_shared));
    printf(" data transfer (used) = %.2f MB \n", OP_plans[ip].transfer / (1024.0f * 1024.0f));
    printf(" data transfer (total) = %.2f MB \n", OP_plans[ip].transfer2 / (1024.0f * 1024.0f));
    printf(" SoA/AoS transfer ratio = %.2f \n\n", transfer3 / OP_plans[ip].transfer2);
  }

  /* validate plan info */
  op_plan_check(OP_plans[ip], ninds_staged, inds_staged);

  /* free work arrays */
  for (int m = 0; m < ninds; m++)
    free(work[m]);
  free(work);
  free(work2);
  free(blk_col);
  free(inds_to_inds_staged);
  free(invinds_staged);

  op_timers_core(&cpu_t2, &wall_t2);
  for (int i = 0; i < OP_kern_max; i++) {
    if (strcmp(name, OP_kernels[i].name) == 0) {
      OP_kernels[i].plan_time += wall_t2 - wall_t1;
      break;
    }
  }
  /* return pointer to plan */
  OP_plan_time += wall_t2 - wall_t1;

  return &(OP_plans[ip]);
}
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1) { int *arg1h = (int *)arg1.data; int nargs = 2; op_arg args[2]; args[0] = arg0; args[1] = arg1; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags > 2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges(set, nargs, args); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads(); #else int nthreads = 1; #endif // allocate and initialise arrays for global reduction int arg1_l[nthreads * 64]; for (int thr = 0; thr < nthreads; thr++) { for (int d = 0; d < 1; d++) { arg1_l[d + thr * 64] = ZERO_int; } } if (set->size > 0) { // execute plan #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { int start = (set->size * thr) / nthreads; int finish = (set->size * (thr + 1)) / nthreads; for (int n = start; n < finish; n++) { update(&((double *)arg0.data)[4 * n], &arg1_l[64 * omp_get_thread_num()]); } } } // combine reduction data for (int thr = 0; thr < nthreads; thr++) { for (int d = 0; d < 1; d++) { arg1h[d] += arg1_l[d + thr * 64]; } } op_mpi_reduce(&arg1, arg1h); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; OP_kernels[1].transfer += (float)set->size * arg0.size * 2.0f; }
// host stub function
//
// OpenACC execution of the user kernel `res_calc` over `set` with four
// indirect vector arguments (arg0, arg4, arg9, arg13 — each a 4-way
// vec-arg through arg0's map) and one direct argument (arg8).  Uses an
// OP2 coloring plan (OP_COLOR2) so that elements of the same color can be
// executed in parallel without write conflicts on the OP_RW/OP_INC data.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9,
  op_arg arg13){

  int nargs = 17;
  op_arg args[17];

  // expand arg0 vec-arg: indices 0..3 through the map, dim 2, read-only
  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  // expand arg4 vec-arg: indices 0..3, dim 1, read-only
  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  // arg8 is a direct argument — stored as-is
  args[8] = arg8;

  // expand arg9 optional vec-arg: indices 0..3, dim 1, read-write
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }

  // expand arg13 optional vec-arg: indices 0..3, dim 2, increment
  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  // inds[i] maps each of the 17 expanded args to its indirect-dat index
  // (0..3); -1 marks the direct argument (arg8)
  int  ninds   = 4;
  int  inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  // start the (asynchronous) halo exchange for the device copies of the data
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;
  if (set->size >0) {

    // refresh the SoA stride constants if the relevant set size changed
    // since the last invocation (or on the very first call)
    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    double* data8 = (double*)arg8.data_d;
    double *data0 = (double *)arg0.data_d;
    double *data4 = (double *)arg4.data_d;
    double *data9 = (double *)arg9.data_d;
    double *data13 = (double *)arg13.data_d;

    // build (or fetch a cached) coloring/staging plan for this loop
    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan, one color at a time; elements within a color are
    // conflict-free and run in parallel on the device
    for ( int col=0; col<Plan->ncolors; col++ ){
      // color 0 covers core elements only; before touching halo-dependent
      // elements (color >= 1) the halo exchange must have completed
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      #pragma acc parallel loop independent deviceptr(col_reord,map0,data8,data0,data4,data9,data13)
      for ( int e=start; e<end; e++ ){
        // col_reord translates the color-ordered position back to the
        // original element index
        int n = col_reord[e];
        // gather the four mapped indices for this element (SoA map layout:
        // map dimension is the slow axis, stride set_size1)
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        const double* arg0_vec[] = {
           &data0[2 * map0idx],
           &data0[2 * map1idx],
           &data0[2 * map2idx],
           &data0[2 * map3idx]};
        const double* arg4_vec[] = {
           &data4[1 * map0idx],
           &data4[1 * map1idx],
           &data4[1 * map2idx],
           &data4[1 * map3idx]};
        double* arg9_vec[] = {
           &data9[1 * map0idx],
           &data9[1 * map1idx],
           &data9[1 * map2idx],
           &data9[1 * map3idx]};
        double* arg13_vec[] = {
           &data13[2 * map0idx],
           &data13[2 * map1idx],
           &data13[2 * map2idx],
           &data13[2 * map3idx]};

        res_calc(
          arg0_vec,
          arg4_vec,
          &data8[n],
          arg9_vec,
          arg13_vec);
      }

    }
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // if the loop never reached color 1 (empty set, core-only, or a single
  // color) the halo exchange still has to be drained here
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
// host stub function void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5){ int nargs = 6; op_arg args[6]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine with indirection: adt_calc\n"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ if (n==set->core_size) { op_mpi_wait_all(nargs, args); } int map0idx = arg0.map_data[n * arg0.map->dim + 0]; int map1idx = arg0.map_data[n * arg0.map->dim + 1]; int map2idx = arg0.map_data[n * arg0.map->dim + 2]; int map3idx = arg0.map_data[n * arg0.map->dim + 3]; adt_calc( &((double*)arg0.data)[2 * map0idx], &((double*)arg0.data)[2 * map1idx], &((double*)arg0.data)[2 * map2idx], &((double*)arg0.data)[2 * map3idx], &((double*)arg4.data)[4 * n], &((double*)arg5.data)[1 * n]); } } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; }
// host stub function
//
// OpenMP execution of the user kernel wrapper `op_x86_update` over `set`
// with no indirection (direct loop): args 0..3 are per-element float data,
// arg4 is a single global float reduction returned via arg4's host pointer.
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  float *arg4h = (float *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction:
  // one 64-float stride per thread so each partial sum sits on its own
  // cache line (avoids false sharing); only element 0 of each slot is used.
  // Fix: sized by the actual thread count (VLA, as in the other OpenMP
  // stubs) instead of the former fixed 1+64*64 buffer, which silently
  // overflowed on the stack whenever nthreads > 64.
  float arg4_l[nthreads * 64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg4_l[d+thr*64]=ZERO_float;

  // execute plan: static block partition of the set across threads
#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_update( (float *) arg0.data,
                   (float *) arg1.data,
                   (float *) arg2.data,
                   (float *) arg3.data,
                   arg4_l + thr*64,
                   start, finish );
  }

  // combine reduction data: sum the per-thread partials into the host value
  for (int thr=0; thr<nthreads; thr++)
    for(int d=0; d<1; d++) arg4h[d] += arg4_l[d+thr*64];

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size;
  // arg2 is both read and written, hence the 2x factor
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}