void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  int ninds = 0;
  int nargs = 5;
  op_arg args[5] = {arg0, arg1, arg2, arg3, arg4};
  double *arg4h = (double *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction;
  // each thread's slot is padded to a 64-element stride so threads
  // accumulate on separate cache lines (avoids false sharing)
  double arg4_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg4_l[d+thr*64] = ZERO_double;

  // execute plan
#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_update((double *)arg0.data,
                  (double *)arg1.data,
                  (double *)arg2.data,
                  (double *)arg3.data,
                  arg4_l + thr*64,
                  start, finish);
  }

  // combine reduction data
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg4h[d] += arg4_l[d+thr*64];

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i=0; i<nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  for (int i=0; i<nargs; i++)
    if (args[i].argtype == OP_ARG_GBL)
      global_reduce(&args[i]);

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (double)set->size * arg0.size;
  OP_kernels[4].transfer += (double)set->size * arg1.size;
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;
}
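/*
 * The wrapper above references op_x86_update but does not define it. In the
 * OP2 OpenMP code-generation scheme this is the thread-block function that
 * iterates over the assigned [start,finish) range and applies the
 * user-supplied elemental kernel, accumulating into the thread-private
 * reduction slot passed in as arg4. A minimal sketch follows; the
 * per-element dimensions (4 state values, 1 time-step value) and the kernel
 * arithmetic are assumptions for illustration, not taken from this listing.
 */
void op_x86_update(double *arg0, double *arg1, double *arg2, double *arg3,
                   double *arg4, int start, int finish) {
  // process the set elements assigned to this thread
  for (int n = start; n < finish; n++) {
    // hypothetical user kernel: advance the state, clear the residual,
    // and add the squared update into the thread-private slot *arg4
    double adti = 1.0 / arg3[n];               // assumed dim 1 per element
    for (int d = 0; d < 4; d++) {              // assumed dim 4 per element
      double del = adti * arg2[n*4+d];
      arg1[n*4+d] = arg0[n*4+d] - del;
      arg2[n*4+d] = 0.0;
      *arg4 += del * del;
    }
  }
}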
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3) {

  double *arg3h = (double *)arg3.data;
  int nargs = 4;
  op_arg args[4];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1 = 0, wall_t2 = 0;
  op_timing_realloc(8);
  OP_kernels[8].name = name;
  OP_kernels[8].count += 1;

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction
  double arg3_l[1 + 64 * 64];
  for (int thr = 0; thr < nthreads; thr++)
    for (int d = 0; d < 1; d++)
      arg3_l[d + thr * 64] = ZERO_double;

  if (set->size > 0) {
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
#pragma omp parallel for
    for (int thr = 0; thr < nthreads; thr++) {
      int start = (set->size * thr) / nthreads;
      int finish = (set->size * (thr + 1)) / nthreads;
      op_x86_update((double *)arg0.data,
                    (double *)arg1.data,
                    (double *)arg2.data,
                    arg3_l + thr * 64,
                    start, finish);
    }
  }

  // combine reduction data
  for (int thr = 0; thr < nthreads; thr++)
    for (int d = 0; d < 1; d++)
      arg3h[d] += arg3_l[d + thr * 64];

  op_mpi_reduce(&arg3, arg3h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[8].time += wall_t2 - wall_t1;
  OP_kernels[8].transfer += (float)set->size * arg0.size * 2.0f;
  OP_kernels[8].transfer += (float)set->size * arg1.size;
  OP_kernels[8].transfer += (float)set->size * arg2.size;
}
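/*
 * For context, a hypothetical call site for the MPI+OpenMP wrapper above.
 * OP2's source-to-source translator rewrites the application's
 * op_par_loop(update, ...) call into this op_par_loop_update call with the
 * same op_arg list. The set and dataset handles (nodes, p_phim, p_res, p_u)
 * are illustrative assumptions, chosen so the access modes match the
 * wrapper's transfer accounting (arg0 counted twice, i.e. read-write;
 * arg1/arg2 once; arg3 a global OP_INC reduction).
 */
// assumed declared elsewhere in the application:
//   op_set nodes;  op_dat p_phim, p_res, p_u;
double rms = 0.0;
op_par_loop_update("update", nodes,
    op_arg_dat(p_phim, -1, OP_ID, 1, "double", OP_RW),
    op_arg_dat(p_res,  -1, OP_ID, 1, "double", OP_WRITE),
    op_arg_dat(p_u,    -1, OP_ID, 1, "double", OP_READ),
    op_arg_gbl(&rms, 1, "double", OP_INC));
// on return, rms holds the sum combined across threads and, via
// op_mpi_reduce, across MPI ranks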
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  float *arg3h = (float *)arg3.data;
  float *arg4h = (float *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction
  float arg3_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg3_l[d+thr*64] = ZERO_float;

  // the OP_MAX reduction slots are seeded with the current global value,
  // not zero, so the MAX combine below preserves earlier maxima
  float arg4_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg4_l[d+thr*64] = arg4h[d];

  // execute plan
#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_update((float *)arg0.data,
                  (float *)arg1.data,
                  (float *)arg2.data,
                  arg3_l + thr*64,
                  arg4_l + thr*64,
                  start, finish);
  }

  // combine reduction data
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg3h[d] += arg3_l[d+thr*64];

  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg4h[d] = MAX(arg4h[d], arg4_l[d+thr*64]);

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
  OP_kernels[1].transfer += (float)set->size * arg0.size;
  OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f;
}
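/*
 * Sketch of the two-reduction thread-block function this wrapper expects.
 * Unlike the sum-only variants above, arg4 is an OP_MAX reduction: each
 * thread folds into its pre-seeded private slot with MAX, and the wrapper
 * combines the slots with MAX afterwards. The elemental arithmetic below
 * (a Jacobi-style update, one value per element) is an assumption for
 * illustration only.
 */
void op_x86_update(float *arg0, float *arg1, float *arg2, float *arg3,
                   float *arg4, int start, int finish) {
  for (int n = start; n < finish; n++) {
    // hypothetical user kernel body
    arg2[n] += arg1[n] + arg0[n];       // apply the accumulated update
    arg1[n]  = 0.0f;                    // reset the increment
    *arg3   += arg2[n] * arg2[n];       // OP_INC: thread-private sum
    *arg4    = MAX(*arg4, arg2[n]);     // OP_MAX: thread-private maximum
  }
}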