// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4){ int nargs = 5; op_arg args[5]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine w/o indirection: update"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ update( &((float*)arg0.data)[1*n], &((float*)arg1.data)[1*n], &((float*)arg2.data)[1*n], (float*)arg3.data, (float*)arg4.data); } } // combine reduction data op_mpi_reduce(&arg3,(float*)arg3.data); op_mpi_reduce(&arg4,(float*)arg4.data); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; OP_kernels[1].transfer += (float)set->size * arg0.size; OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f; OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f; }
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1) { int *arg1h = (int *)arg1.data; int nargs = 2; op_arg args[2]; args[0] = arg0; args[1] = arg1; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags > 2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges(set, nargs, args); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads(); #else int nthreads = 1; #endif // allocate and initialise arrays for global reduction int arg1_l[nthreads * 64]; for (int thr = 0; thr < nthreads; thr++) { for (int d = 0; d < 1; d++) { arg1_l[d + thr * 64] = ZERO_int; } } if (set->size > 0) { // execute plan #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { int start = (set->size * thr) / nthreads; int finish = (set->size * (thr + 1)) / nthreads; for (int n = start; n < finish; n++) { update(&((double *)arg0.data)[4 * n], &arg1_l[64 * omp_get_thread_num()]); } } } // combine reduction data for (int thr = 0; thr < nthreads; thr++) { for (int d = 0; d < 1; d++) { arg1h[d] += arg1_l[d + thr * 64]; } } op_mpi_reduce(&arg1, arg1h); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; OP_kernels[1].transfer += (float)set->size * arg0.size * 2.0f; }
// host stub function
// Direct OpenMP loop for dotR: delegates each thread's contiguous element
// chunk to the low-level worker op_x86_dotR, which writes its partial
// double reduction into a 64-double-wide per-thread slot.
void op_par_loop_dotR(char const *name, op_set set, op_arg arg0, op_arg arg1) {
  double *reduct_h = (double *)arg1.data; // host-side reduction result
  const int num_args = 2;
  op_arg args[2] = {arg0, arg1};

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: dotR\n");
  }

  op_mpi_halo_exchanges(set, num_args, args);

  // initialise timers (kernel slot 6); wall times default to 0 so the
  // record stays valid even when the set is empty and no timing is taken
  double cpu_start, cpu_end, wall_start = 0, wall_end = 0;
  op_timing_realloc(6);
  OP_kernels[6].name = name;
  OP_kernels[6].count += 1;

  // set number of threads
#ifdef _OPENMP
  int num_threads = omp_get_max_threads();
#else
  int num_threads = 1;
#endif

  // allocate and zero per-thread reduction scratch
  double reduct_l[1 + 64 * 64];
  for (int t = 0; t < num_threads; t++) {
    for (int d = 0; d < 1; d++) {
      reduct_l[d + t * 64] = ZERO_double;
    }
  }

  if (set->size > 0) {
    op_timers_core(&cpu_start, &wall_start);

    // execute plan: one contiguous chunk per thread
#pragma omp parallel for
    for (int t = 0; t < num_threads; t++) {
      int begin = (set->size * t) / num_threads;
      int end = (set->size * (t + 1)) / num_threads;
      op_x86_dotR((double *)arg0.data, reduct_l + t * 64, begin, end);
    }
  }

  // combine per-thread partial reductions, then across MPI ranks
  for (int t = 0; t < num_threads; t++) {
    for (int d = 0; d < 1; d++) {
      reduct_h[d] += reduct_l[d + t * 64];
    }
  }
  op_mpi_reduce(&arg1, reduct_h);

  op_mpi_set_dirtybit(num_args, args);

  // update kernel record
  op_timers_core(&cpu_end, &wall_end);
  OP_kernels[6].time += wall_end - wall_start;
  OP_kernels[6].transfer += (float)set->size * arg0.size;
}
// host stub function
// Vectorized direct loop applying the user kernel `update` per element.
// arg0 is read-only, arg1/arg2 are read-write (dim 4), arg3 read-only
// (dim 1), and arg4 is a scalar double global reduction.
// NOTE(review): the VECTORIZE path processes SIMD_VEC elements at a time,
// staging the per-lane reduction in dat4[] before summing serially; the
// remainder loop shares its body with the scalar (#else) path via the
// split preprocessor branches around the loop header — do not reformat
// the #ifdef/#else/#endif structure.
void op_par_loop_update(char const *name, op_set set, op_arg arg0,
                        op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4) {
  int nargs = 5;
  op_arg args[5];
  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // create aligned pointers for dats
  ALIGNED_double const double *__restrict__ ptr0 = (double *)arg0.data;
  __assume_aligned(ptr0, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr1 = (double *)arg1.data;
  __assume_aligned(ptr1, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr2 = (double *)arg2.data;
  __assume_aligned(ptr2, double_ALIGN);
  ALIGNED_double const double *__restrict__ ptr3 = (double *)arg3.data;
  __assume_aligned(ptr3, double_ALIGN);

  // initialise timers (kernel slot 4)
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(4);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update");
  }

  // exchange MPI halos; returned count covers all elements to execute
  int exec_size = op_mpi_halo_exchanges(set, nargs, args);
  if (exec_size > 0) {

#ifdef VECTORIZE
    // main SIMD loop over full SIMD_VEC-sized groups; outer loop kept
    // scalar (#pragma novector), inner lane loop vectorized (#pragma simd)
#pragma novector
    for (int n = 0; n < (exec_size / SIMD_VEC) * SIMD_VEC; n += SIMD_VEC) {
      // per-lane staging buffer for the arg4 reduction
      double dat4[SIMD_VEC] = {0.0};
#pragma simd
      for (int i = 0; i < SIMD_VEC; i++) {
        update(&(ptr0)[4 * (n + i)], &(ptr1)[4 * (n + i)],
               &(ptr2)[4 * (n + i)], &(ptr3)[1 * (n + i)], &dat4[i]);
      }
      // fold lane partials into the scalar reduction serially
      for (int i = 0; i < SIMD_VEC; i++) {
        *(double *)arg4.data += dat4[i];
      }
    }
    // remainder
    for (int n = (exec_size / SIMD_VEC) * SIMD_VEC; n < exec_size; n++) {
#else
    for (int n = 0; n < exec_size; n++) {
#endif
      update(&(ptr0)[4 * n], &(ptr1)[4 * n], &(ptr2)[4 * n], &(ptr3)[1 * n],
             (double *)arg4.data);
    }
  }

  // combine reduction data across MPI ranks
  op_mpi_reduce(&arg4, (double *)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[4].name = name;
  OP_kernels[4].count += 1;
  OP_kernels[4].time += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}
// host stub function
// Indirect loop for res_calc using an OP2 execution plan: elements are
// partitioned into blocks and colored so that blocks of the same color can
// run in parallel without write conflicts. arg0 is indirectly accessed
// (inds[0] = 0), arg1 is a global integer reduction (inds[1] = -1).
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1) {
  int *arg1h = (int *)arg1.data; // host-side reduction result
  int nargs = 2;
  op_arg args[2];
  args[0] = arg0;
  args[1] = arg1;

  int ninds = 1;
  int inds[2] = {0, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan: block size from compile-time override or runtime default
#ifdef OP_PART_SIZE_0
  int part_size = OP_PART_SIZE_0;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers (kernel slot 0); wall times default to 0 so the
  // record stays valid when the set is empty
  double cpu_t1, cpu_t2, wall_t1 = 0, wall_t2 = 0;
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // allocate and initialise per-thread arrays for global reduction
  // (one 64-int slot per thread)
  int arg1_l[1 + 64 * 64];
  for (int thr = 0; thr < nthreads; thr++)
    for (int d = 0; d < 1; d++)
      arg1_l[d + thr * 64] = ZERO_int;

  if (set->size > 0) {
    op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan: one pass per color; blocks within a color run in
    // parallel, colors are serialized by the loop
    int block_offset = 0;
    for (int col = 0; col < Plan->ncolors; col++) {
      // core colors need no halo data; wait for MPI only when the first
      // non-core color is reached (overlaps comm with core computation)
      if (col == Plan->ncolors_core)
        op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
      for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
        op_x86_res_calc(blockIdx, (double *)arg0.data, Plan->ind_map,
                        Plan->loc_map, &arg1_l[64 * omp_get_thread_num()],
                        Plan->ind_sizes, Plan->ind_offs, block_offset,
                        Plan->blkmap, Plan->offset, Plan->nelems, Plan->nthrcol,
                        Plan->thrcol, set_size);

      // combine reduction data once all owned-element colors are done
      // (later colors cover only halo elements, which must not contribute)
      if (col == Plan->ncolors_owned - 1) {
        for (int thr = 0; thr < nthreads; thr++)
          for (int d = 0; d < 1; d++)
            arg1h[d] += arg1_l[d + thr * 64];
      }
      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data across MPI ranks
  op_mpi_reduce(&arg1, arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}