// host stub function
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3){

  int nargs = 4;
  op_arg args[4];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map1idx = arg1.map_data[n * arg1.map->dim + 1];
      int map2idx = arg1.map_data[n * arg1.map->dim + 0];

      res(
        &((double*)arg0.data)[1 * n],
        &((double*)arg1.data)[1 * map1idx],
        &((double*)arg2.data)[1 * map2idx],
        (double*)arg3.data);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
  OP_kernels[0].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg3.size;
  OP_kernels[0].transfer += (float)set->size * arg1.map->dim * 4.0f;
}
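// For context, a minimal sketch of how a generated stub such as op_par_loop_res
// above might be invoked from application code after code generation, assuming
// the standard OP2 helpers op_arg_dat / op_arg_gbl. The set, map, and dat names
// (edges, pedge, p_A, p_u, p_du, beta) are hypothetical, not taken from the
// listing above: arg0 is per-edge data accessed directly, arg1/arg2 are node
// data accessed through the edge->node map, and arg3 is a global read-only value.
static void call_res_example(op_set edges, op_map pedge,
                             op_dat p_A, op_dat p_u, op_dat p_du,
                             double beta) {
  op_par_loop_res("res", edges,
                  op_arg_dat(p_A,  -1, OP_ID, 1, "double", OP_READ),
                  op_arg_dat(p_u,   1, pedge, 1, "double", OP_READ),
                  op_arg_dat(p_du,  0, pedge, 1, "double", OP_INC),
                  op_arg_gbl(&beta, 1, "double", OP_READ));
}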
// host stub function
void op_par_loop_save_soln_cpu(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1){

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: save_soln");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  if (set->size > 0) {

    // execute plan
    #pragma omp parallel for
    for ( int thr=0; thr<nthreads; thr++ ){
      int start  = (set->size* thr)/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      for ( int n=start; n<finish; n++ ){
        save_soln(
          &((double*)arg0.data)[4*n],
          &((double*)arg1.data)[4*n]);
      }
    }
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
}
// host stub function
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4){

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    for ( int n=0; n<set_size; n++ ){
      update(
        &((float*)arg0.data)[1*n],
        &((float*)arg1.data)[1*n],
        &((float*)arg2.data)[1*n],
        (float*)arg3.data,
        (float*)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce_float(&arg3, (float*)arg3.data);
  op_mpi_reduce_float(&arg4, (float*)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
  OP_kernels[1].transfer += (float)set->size * arg0.size;
  OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f;
}
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      adt_calc(
        &((double*)arg0.data)[2 * map0idx],
        &((double*)arg0.data)[2 * map1idx],
        &((double*)arg0.data)[2 * map2idx],
        &((double*)arg0.data)[2 * map3idx],
        &((double*)arg4.data)[4 * n],
        &((double*)arg5.data)[1 * n]);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name   = name;
  OP_kernels[1].count += 1;
  OP_kernels[1].time  += wall_t2 - wall_t1;
}
// host stub function
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3){

  int nargs = 4;
  op_arg args[4];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  int ninds = 2;
  int inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map1idx = arg1.map_data[n * arg1.map->dim + 1];
          int map2idx = arg1.map_data[n * arg1.map->dim + 0];

          res(
            &((float*)arg0.data)[1 * n],
            &((float*)arg1.data)[1 * map1idx],
            &((float*)arg2.data)[1 * map2idx],
            (float*)arg3.data);
        }
      }
      block_offset += nblocks;
    }
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
}
// host stub function
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1) {

  int *arg1h = (int *)arg1.data;

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  // allocate and initialise arrays for global reduction
  int arg1_l[nthreads * 64];
  for (int thr = 0; thr < nthreads; thr++) {
    for (int d = 0; d < 1; d++) {
      arg1_l[d + thr * 64] = ZERO_int;
    }
  }

  if (set->size > 0) {

    // execute plan
    #pragma omp parallel for
    for (int thr = 0; thr < nthreads; thr++) {
      int start  = (set->size * thr) / nthreads;
      int finish = (set->size * (thr + 1)) / nthreads;
      for (int n = start; n < finish; n++) {
        update(&((double *)arg0.data)[4 * n],
               &arg1_l[64 * omp_get_thread_num()]);
      }
    }
  }

  // combine reduction data
  for (int thr = 0; thr < nthreads; thr++) {
    for (int d = 0; d < 1; d++) {
      arg1h[d] += arg1_l[d + thr * 64];
    }
  }
  op_mpi_reduce(&arg1, arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
  OP_kernels[1].transfer += (float)set->size * arg0.size * 2.0f;
}
void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: EvolveValuesRK2_1\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  if (set->size > 0) {

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    #pragma omp parallel for
    for (int thr=0; thr<nthreads; thr++) {
      int start  = (set->size* thr   )/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      op_x86_EvolveValuesRK2_1( (float *) arg0.data,
                                (float *) arg1.data,
                                (float *) arg2.data,
                                (float *) arg3.data,
                                (float *) arg4.data,
                                start, finish );
    }
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg2.size;
  OP_kernels[0].transfer += (float)set->size * arg3.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
}
void op_par_loop_dotR(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  double *arg1h = (double *)arg1.data;

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: dotR\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(6);
  OP_kernels[6].name   = name;
  OP_kernels[6].count += 1;

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  // allocate and initialise arrays for global reduction
  double arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg1_l[d+thr*64] = ZERO_double;

  if (set->size > 0) {

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    #pragma omp parallel for
    for (int thr=0; thr<nthreads; thr++) {
      int start  = (set->size* thr   )/nthreads;
      int finish = (set->size*(thr+1))/nthreads;
      op_x86_dotR( (double *) arg0.data,
                   arg1_l + thr*64,
                   start, finish );
    }
  }

  // combine reduction data
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg1h[d] += arg1_l[d+thr*64];

  op_mpi_reduce(&arg1, arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[6].time     += wall_t2 - wall_t1;
  OP_kernels[6].transfer += (float)set->size * arg0.size;
}
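// A minimal sketch (hypothetical set/dat names, not from the listing above) of
// how the op_par_loop_dotR stub with its global OP_INC argument might be called,
// assuming the standard OP2 helpers op_arg_dat / op_arg_gbl. Each OpenMP thread
// in the stub accumulates into its own arg1_l slot (slots spaced 64 doubles apart
// to limit false sharing); the slots are summed into arg1h and then combined
// across MPI ranks by op_mpi_reduce before the result is visible to the caller.
static double residual_norm_example(op_set nodes, op_dat p_r) {
  double norm = 0.0;
  op_par_loop_dotR("dotR", nodes,
                   op_arg_dat(p_r,  -1, OP_ID, 1, "double", OP_READ),
                   op_arg_gbl(&norm, 1, "double", OP_INC));
  return norm;
}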
void op_par_loop_SpaceDiscretization(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6 ){

  int nargs = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  arg6.idx = 0;
  args[6] = arg6;
  for (int v = 1; v < 2; v++) {
    args[6 + v] = op_arg_dat(arg6.dat, v, arg6.map, 1, "float", OP_READ);
  }

  int ninds = 2;
  int inds[8] = {0,0,-1,-1,-1,-1,1,1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: SpaceDiscretization\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_18
    int part_size = OP_PART_SIZE_18;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(18);
  OP_kernels[18].name   = name;
  OP_kernels[18].count += 1;

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core)
        op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_SpaceDiscretization( blockIdx,
                                    (float *)arg0.data,
                                    (float *)arg6.data,
                                    Plan->ind_map,
                                    Plan->loc_map,
                                    (float *)arg2.data,
                                    (float *)arg3.data,
                                    (float *)arg4.data,
                                    (int *)arg5.data,
                                    Plan->ind_sizes,
                                    Plan->ind_offs,
                                    block_offset,
                                    Plan->blkmap,
                                    Plan->offset,
                                    Plan->nelems,
                                    Plan->nthrcol,
                                    Plan->thrcol,
                                    set_size);

      block_offset += nblocks;
    }
    op_timing_realloc(18);
    OP_kernels[18].transfer  += Plan->transfer;
    OP_kernels[18].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[18].time += wall_t2 - wall_t1;
}
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6,
  op_arg arg7 ) {

  int nargs = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  args[6] = arg6;
  args[7] = arg7;

  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core)
        op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
                         (double *)arg0.data,
                         (double *)arg2.data,
                         (double *)arg4.data,
                         (double *)arg6.data,
                         Plan->ind_map,
                         Plan->loc_map,
                         Plan->ind_sizes,
                         Plan->ind_offs,
                         block_offset,
                         Plan->blkmap,
                         Plan->offset,
                         Plan->nelems,
                         Plan->nthrcol,
                         Plan->thrcol,
                         set_size);

      block_offset += nblocks;
    }
    op_timing_realloc(2);
    OP_kernels[2].transfer  += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name   = name;
  OP_kernels[2].count += 1;
  OP_kernels[2].time  += wall_t2 - wall_t1;
}
// host stub function
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9){

  int nargs = 13;
  op_arg args[13];

  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      const double* arg0_vec[] = {
        &((double*)arg0.data)[2 * map0idx],
        &((double*)arg0.data)[2 * map1idx],
        &((double*)arg0.data)[2 * map2idx],
        &((double*)arg0.data)[2 * map3idx]};
      const double* arg4_vec[] = {
        &((double*)arg4.data)[1 * map0idx],
        &((double*)arg4.data)[1 * map1idx],
        &((double*)arg4.data)[1 * map2idx],
        &((double*)arg4.data)[1 * map3idx]};
      double* arg9_vec[] = {
        &((double*)arg9.data)[1 * map0idx],
        &((double*)arg9.data)[1 * map1idx],
        &((double*)arg9.data)[1 * map2idx],
        &((double*)arg9.data)[1 * map3idx]};

      res_calc(
        arg0_vec,
        arg4_vec,
        &((double*)arg8.data)[16 * n],
        arg9_vec);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
  OP_kernels[0].transfer += (float)set->size * arg9.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg8.size;
  OP_kernels[0].transfer += (float)set->size * arg0.map->dim * 4.0f;
}
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9 ){

  int nargs = 13;
  op_arg args[13];

  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 4; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for (int v = 1; v < 4; v++) {
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for (int v = 1; v < 4; v++) {
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  int ninds = 3;
  int inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core)
        op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
                         (double *)arg0.data,
                         (double *)arg4.data,
                         (double *)arg9.data,
                         Plan->ind_map,
                         Plan->loc_map,
                         (double *)arg8.data,
                         Plan->ind_sizes,
                         Plan->ind_offs,
                         block_offset,
                         Plan->blkmap,
                         Plan->offset,
                         Plan->nelems,
                         Plan->nthrcol,
                         Plan->thrcol,
                         set_size);

      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
}
// host stub function
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4) {

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // create aligned pointers for dats
  ALIGNED_double const double *__restrict__ ptr0 = (double *)arg0.data;
  __assume_aligned(ptr0, double_ALIGN);
  ALIGNED_double       double *__restrict__ ptr1 = (double *)arg1.data;
  __assume_aligned(ptr1, double_ALIGN);
  ALIGNED_double       double *__restrict__ ptr2 = (double *)arg2.data;
  __assume_aligned(ptr2, double_ALIGN);
  ALIGNED_double const double *__restrict__ ptr3 = (double *)arg3.data;
  __assume_aligned(ptr3, double_ALIGN);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(4);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update");
  }

  int exec_size = op_mpi_halo_exchanges(set, nargs, args);

  if (exec_size > 0) {

  #ifdef VECTORIZE
    #pragma novector
    for (int n = 0; n < (exec_size / SIMD_VEC) * SIMD_VEC; n += SIMD_VEC) {
      double dat4[SIMD_VEC] = {0.0};
      #pragma simd
      for (int i = 0; i < SIMD_VEC; i++) {
        update(&(ptr0)[4 * (n + i)],
               &(ptr1)[4 * (n + i)],
               &(ptr2)[4 * (n + i)],
               &(ptr3)[1 * (n + i)],
               &dat4[i]);
      }
      for (int i = 0; i < SIMD_VEC; i++) {
        *(double *)arg4.data += dat4[i];
      }
    }
    // remainder
    for (int n = (exec_size / SIMD_VEC) * SIMD_VEC; n < exec_size; n++) {
  #else
    for (int n = 0; n < exec_size; n++) {
  #endif
      update(&(ptr0)[4 * n],
             &(ptr1)[4 * n],
             &(ptr2)[4 * n],
             &(ptr3)[1 * n],
             (double *)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce(&arg4, (double *)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int *arg1h = (int *)arg1.data;

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  int ninds = 1;
  int inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  // allocate and initialise arrays for global reduction
  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg1_l[d+thr*64] = ZERO_int;

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core)
        op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
                         (double *)arg0.data,
                         Plan->ind_map,
                         Plan->loc_map,
                         &arg1_l[64*omp_get_thread_num()],
                         Plan->ind_sizes,
                         Plan->ind_offs,
                         block_offset,
                         Plan->blkmap,
                         Plan->offset,
                         Plan->nelems,
                         Plan->nthrcol,
                         Plan->thrcol,
                         set_size);

      // combine reduction data
      if (col == Plan->ncolors_owned-1) {
        for (int thr=0; thr<nthreads; thr++)
          for (int d=0; d<1; d++)
            arg1h[d] += arg1_l[d+thr*64];
      }

      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_reduce(&arg1, arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  float *arg3h = (float *)arg3.data;

  int nargs = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  int ninds = 2;
  int inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core)
        op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res( blockIdx,
                    (float *)arg1.data,
                    (float *)arg2.data,
                    Plan->ind_map,
                    Plan->loc_map,
                    (float *)arg0.data,
                    (float *)arg3.data,
                    Plan->ind_sizes,
                    Plan->ind_offs,
                    block_offset,
                    Plan->blkmap,
                    Plan->offset,
                    Plan->nelems,
                    Plan->nthrcol,
                    Plan->thrcol,
                    set_size);

      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
}