// host stub function
// Runs the advec_mom_kernel_post_pre_advec_z loop body over the 3D index range
// stored in the descriptor.
//   desc - kernel descriptor supplying the block, dimension count, iteration
//          range and the five dataset arguments:
//            args[0] node_mass_post (written, then read back)
//            args[1] post_vol       (read)
//            args[2] density1       (read)
//            args[3] node_mass_pre  (written)
//            args[4] node_flux      (read)
// Diagnostics for this loop are accumulated in OPS_kernels[136] whenever
// OPS_diags > 1.
void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute(
    ops_kernel_descriptor *desc) {
  // NOTE(review): `block` is not referenced anywhere in the visible body.
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg arg3 = desc->args[3];
  ops_arg arg4 = desc->args[4];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4};

#ifdef CHECKPOINTING
  // Skip the loop entirely when the checkpointing layer returns zero.
  if (!ops_checkpointing_before(args, 5, range, 136))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[136].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  // range is laid out as {start0, end0, start1, end1, start2, end2}; the
  // loops below treat each end as exclusive.
  int start[3];
  int end[3];

  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "advec_mom_kernel_post_pre_advec_z");
#endif

  // set up initial pointers and exchange halos if necessary
  // Each pointer is the argument's data pointer advanced by the dataset's
  // base_offset. NOTE(review): the offset is applied before the cast to
  // double *, so it is presumably a byte offset - confirm against the type of
  // ops_arg::data.
  int base0 = args[0].dat->base_offset;
  double *__restrict__ node_mass_post = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  const double *__restrict__ post_vol = (double *)(args[1].data + base1);

  int base2 = args[2].dat->base_offset;
  const double *__restrict__ density1 = (double *)(args[2].data + base2);

  int base3 = args[3].dat->base_offset;
  double *__restrict__ node_mass_pre = (double *)(args[3].data + base3);

  int base4 = args[4].dat->base_offset;
  const double *__restrict__ node_flux = (double *)(args[4].data + base4);

  // initialize global variable with the dimension of dats
  // These locals are not used directly below; presumably the OPS_ACCn macros
  // (defined outside this view) expand to expressions that reference them.
  int xdim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[0];
  int ydim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[1];
  int xdim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[0];
  int ydim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[1];
  int xdim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[0];
  int ydim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[1];
  int xdim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[0];
  int ydim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[1];
  int xdim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[0];
  int ydim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[1];

  if (OPS_diags > 1) {
    // Everything between the first timer call and here is booked as mpi_time.
    ops_timers_core(&c1, &t1);
    OPS_kernels[136].mpi_time += t1 - t2;
  }

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre,    \
                         node_flux)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {

        // node_mass_post = one eighth of the sum of density1 * post_vol over
        // the eight points at offsets {-1,0} x {-1,0} x {-1,0}.
        node_mass_post[OPS_ACC0(0, 0, 0)] =
            0.125 *
            (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] +
             density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] +
             density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] +
             density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] +
             density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] +
             density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] +
             density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] +
             density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]);

        // node_mass_pre = node_mass_post minus the flux at offset (0,0,-1)
        // plus the flux at the current point.
        node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] -
                                           node_flux[OPS_ACC4(0, 0, -1)] +
                                           node_flux[OPS_ACC4(0, 0, 0)];
      }
    }
  }
  if (OPS_diags > 1) {
    // Loop duration is booked as compute time.
    ops_timers_core(&c2, &t2);
    OPS_kernels[136].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    // NOTE(review): this timer call immediately follows the previous one, so
    // the mpi_time increment here only covers the gap between the two calls.
    ops_timers_core(&c1, &t1);
    OPS_kernels[136].mpi_time += t1 - t2;
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3);
    OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4);
  }
}
// Halo update for the two x-velocity fields. For every field whose flag in
// `fields` equals 1, the current point is overwritten with the negated value
// read at relative offset (-4, 0, 0).
//   xvel0, xvel1 - velocity fields, modified in place
//   fields       - flag array indexed by FIELD_XVEL0 / FIELD_XVEL1
inline void update_halo_kernel2_xvel_minus_4_right(double *xvel0, double *xvel1,
                                                   const int *fields) {
  if (fields[FIELD_XVEL0] == 1) {
    const double source = xvel0[OPS_ACC0(-4, 0, 0)];
    xvel0[OPS_ACC0(0, 0, 0)] = -source;
  }

  if (fields[FIELD_XVEL1] == 1) {
    const double source = xvel1[OPS_ACC1(-4, 0, 0)];
    xvel1[OPS_ACC1(0, 0, 0)] = -source;
  }
}
// Halo update for the x-direction flux fields (2D indexing). For every field
// whose flag in `fields` equals 1, the current point receives the negated
// value read at relative offset (-2, 0).
//   vol_flux_x, mass_flux_x - flux fields, modified in place
//   fields                  - flag array indexed by FIELD_VOL_FLUX_X /
//                             FIELD_MASS_FLUX_X
inline void update_halo_kernel3_minus_2_b(double *vol_flux_x,
                                          double *mass_flux_x,
                                          const int *fields) {
  if (fields[FIELD_VOL_FLUX_X] == 1) {
    const double source = vol_flux_x[OPS_ACC0(-2, 0)];
    vol_flux_x[OPS_ACC0(0, 0)] = -source;
  }

  if (fields[FIELD_MASS_FLUX_X] == 1) {
    const double source = mass_flux_x[OPS_ACC1(-2, 0)];
    mass_flux_x[OPS_ACC1(0, 0)] = -source;
  }
}
// user function
// Copies the current point of xvel1 into xvel0 and of yvel1 into yvel0.
//   xvel0, yvel0 - destination fields (written at offset (0, 0))
//   xvel1, yvel1 - source fields (read at offset (0, 0))
inline void reset_field_kernel2(double *xvel0, const double *xvel1,
                                double *yvel0, const double *yvel1) {
  const double x_value = xvel1[OPS_ACC1(0, 0)];
  xvel0[OPS_ACC0(0, 0)] = x_value;

  const double y_value = yvel1[OPS_ACC3(0, 0)];
  yvel0[OPS_ACC2(0, 0)] = y_value;
}
//user function
// Initialises energy, density and the three velocity components at the
// current point from the global `states` table (3D variant).
//
// State 0 is written unconditionally as the background. Every further state
// (1 .. number_of_states-1) overrides it when the current point lies inside
// the state's geometry:
//   g_cube  - the vertex intervals [v(0), v(+1)] overlap the state's
//             [min, max) box in x, y and z
//   g_sphe  - distance from the cell coordinates to (xmin, ymin, zmin) is
//             <= the state's radius
//   g_point - the vertex coordinates equal (xmin, ymin, zmin) exactly
// On a hit, energy0/density0 are set at the current point and the velocities
// are set at all eight offsets in {0,1} x {0,1} x {0,1}.
inline
void generate_chunk_kernel( const double *vertexx,
                     const double *vertexy, const double *vertexz,
                     double *energy0, double *density0,
                     double *xvel0,  double *yvel0, double *zvel0,
                     const double *cellx, const double *celly, const double *cellz) {

  // Background state.
  energy0[OPS_ACC3(0,0,0)] = states[0].energy;
  density0[OPS_ACC4(0,0,0)] = states[0].density;
  xvel0[OPS_ACC5(0,0,0)] = states[0].xvel;
  yvel0[OPS_ACC6(0,0,0)] = states[0].yvel;
  zvel0[OPS_ACC7(0,0,0)] = states[0].zvel;

  for (int i = 1; i < number_of_states; i++) {

    const double x_cent = states[i].xmin;
    const double y_cent = states[i].ymin;
    const double z_cent = states[i].zmin;

    // Set to non-zero when the current point belongs to state i.
    int inside = 0;

    if (states[i].geometry == g_cube) {
      inside = vertexx[OPS_ACC0(1,0,0)] >= states[i].xmin &&
               vertexx[OPS_ACC0(0,0,0)] < states[i].xmax &&
               vertexy[OPS_ACC1(0,1,0)] >= states[i].ymin &&
               vertexy[OPS_ACC1(0,0,0)] < states[i].ymax &&
               vertexz[OPS_ACC2(0,0,1)] >= states[i].zmin &&
               vertexz[OPS_ACC2(0,0,0)] < states[i].zmax;
    }
    else if (states[i].geometry == g_sphe) {
      const double dx = cellx[OPS_ACC8(0,0,0)] - x_cent;
      const double dy = celly[OPS_ACC9(0,0,0)] - y_cent;
      const double dz = cellz[OPS_ACC10(0,0,0)] - z_cent;
      const double radius = sqrt (dx * dx + dy * dy + dz * dz);
      inside = radius <= states[i].radius;
    }
    else if (states[i].geometry == g_point) {
      inside = vertexx[OPS_ACC0(0,0,0)] == x_cent &&
               vertexy[OPS_ACC1(0,0,0)] == y_cent &&
               vertexz[OPS_ACC2(0,0,0)] == z_cent;
    }

    if (!inside) {
      continue;
    }

    energy0[OPS_ACC3(0,0,0)] = states[i].energy;
    density0[OPS_ACC4(0,0,0)] = states[i].density;

    // Velocities are stamped onto the 2x2x2 block starting at this point.
    for (int ix = 0; ix < 2; ix++) {
      for (int iy = 0; iy < 2; iy++) {
        for (int iz = 0; iz < 2; iz++) {
          xvel0[OPS_ACC5(ix,iy,iz)] = states[i].xvel;
          yvel0[OPS_ACC6(ix,iy,iz)] = states[i].yvel;
          zvel0[OPS_ACC7(ix,iy,iz)] = states[i].zvel;
        }
      }
    }
  }
}
// Halo update for the z-direction flux fields. For every field whose flag in
// `fields` equals 1, the current point receives the negated value read at
// relative offset (0, 0, -4).
//   vol_flux_z, mass_flux_z - flux fields, modified in place
//   fields                  - flag array indexed by FIELD_VOL_FLUX_Z /
//                             FIELD_MASS_FLUX_Z
inline void update_halo_kernel5_minus_4_front(double *vol_flux_z,
                                              double *mass_flux_z,
                                              const int *fields) {
  if (fields[FIELD_VOL_FLUX_Z] == 1) {
    const double source = vol_flux_z[OPS_ACC0(0, 0, -4)];
    vol_flux_z[OPS_ACC0(0, 0, 0)] = -source;
  }

  if (fields[FIELD_MASS_FLUX_Z] == 1) {
    const double source = mass_flux_z[OPS_ACC1(0, 0, -4)];
    mass_flux_z[OPS_ACC1(0, 0, 0)] = -source;
  }
}
    const double *restrict pressure, const double *restrict density0,
    double *restrict density1, const double *restrict viscosity,
    const double *restrict energy0, double *restrict energy1,
    const double *restrict zarea, const double *restrict zvel0, int x_size,
    int y_size, int z_size) {
#pragma omp parallel for
  for (int n_z = 0; n_z < z_size; n_z++) {
    for (int n_y = 0; n_y < y_size; n_y++) {
      for (int n_x = 0; n_x < x_size; n_x++) {

        double recip_volume, energy_change;
        double right_flux, left_flux, top_flux, bottom_flux, back_flux,
            front_flux, total_flux;

        left_flux = (xarea[OPS_ACC0(0, 0, 0)] *
                     (xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] +
                      xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)] +
                      xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] +
                      xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)])) *
                    0.125 * dt * 0.5;
        right_flux = (xarea[OPS_ACC0(1, 0, 0)] *
                      (xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] +
                       xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)] +
                       xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] +
                       xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)])) *
                     0.125 * dt * 0.5;

        bottom_flux = (yarea[OPS_ACC2(0, 0, 0)] *
                       (yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] +
                        yvel0[OPS_ACC3(0, 0, 1)] + yvel0[OPS_ACC3(1, 0, 1)] +
                        yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] +
   xdim5_reset_field_kernel2 * (y) +                                           \
   xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2 * (z))

// user function

// Sweeps an x_size * y_size * z_size index space and, at every point, copies
// xvel1 -> xvel0, yvel1 -> yvel0 and zvel1 -> zvel0. The outermost (z) loop is
// distributed with OpenMP.
//   xvel0, yvel0, zvel0    - destination fields
//   xvel1, yvel1, zvel1    - source fields
//   x_size, y_size, z_size - iteration extents in each direction
void reset_field_kernel2_c_wrapper(double *restrict xvel0,
                                   const double *restrict xvel1,
                                   double *restrict yvel0,
                                   const double *restrict yvel1,
                                   double *restrict zvel0,
                                   const double *restrict zvel1, int x_size,
                                   int y_size, int z_size) {
#pragma omp parallel for
  for (int n_z = 0; n_z < z_size; ++n_z) {
    for (int n_y = 0; n_y < y_size; ++n_y) {
      for (int n_x = 0; n_x < x_size; ++n_x) {
        const double x_value = xvel1[OPS_ACC1(0, 0, 0)];
        xvel0[OPS_ACC0(0, 0, 0)] = x_value;

        const double y_value = yvel1[OPS_ACC3(0, 0, 0)];
        yvel0[OPS_ACC2(0, 0, 0)] = y_value;

        const double z_value = zvel1[OPS_ACC5(0, 0, 0)];
        zvel0[OPS_ACC4(0, 0, 0)] = z_value;
      }
    }
  }
}
#undef OPS_ACC0
#undef OPS_ACC1
#undef OPS_ACC2
#undef OPS_ACC3
#undef OPS_ACC4
#undef OPS_ACC5
       ydim4_advec_mom_kernel_post_pre_advec_z * (z))

// user function

// Sweeps an x_size * y_size * z_size index space and computes the post- and
// pre-advection node masses at every point. The outermost (z) loop is
// distributed with OpenMP.
//   node_mass_post - output; also read back when forming node_mass_pre
//   post_vol       - input, read at offsets {-1,0} in each direction
//   density1       - input, read at the same offsets as post_vol
//   node_mass_pre  - output
//   node_flux      - input, read at offsets (0,0,-1) and (0,0,0)
//   x_size, y_size, z_size - iteration extents in each direction
void advec_mom_kernel_post_pre_advec_z_c_wrapper(
    double *restrict node_mass_post, const double *restrict post_vol,
    const double *restrict density1, double *restrict node_mass_pre,
    const double *restrict node_flux, int x_size, int y_size, int z_size) {
#pragma omp parallel for
  for (int n_z = 0; n_z < z_size; n_z++) {
    for (int n_y = 0; n_y < y_size; n_y++) {
      for (int n_x = 0; n_x < x_size; n_x++) {

        // One eighth of the sum of density1 * post_vol over the eight points
        // at offsets {-1,0} x {-1,0} x {-1,0}.
        node_mass_post[OPS_ACC0(0, 0, 0)] =
            0.125 *
            (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] +
             density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] +
             density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] +
             density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] +
             density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] +
             density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] +
             density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] +
             density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]);

        // Pre-advection mass: post mass minus the flux at offset (0,0,-1)
        // plus the flux at the current point.
        node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] -
                                           node_flux[OPS_ACC4(0, 0, -1)] +
                                           node_flux[OPS_ACC4(0, 0, 0)];
      }
    }
  }
}
// Halo update for the y-direction flux fields. For every field whose flag in
// `fields` equals 1, the current point receives the negated value read at
// relative offset (0, 4, 0).
//   vol_flux_y, mass_flux_y - flux fields, modified in place
//   fields                  - flag array indexed by FIELD_VOL_FLUX_Y /
//                             FIELD_MASS_FLUX_Y
inline void update_halo_kernel4_minus_4_a(double *vol_flux_y,
                                          double *mass_flux_y,
                                          const int *fields) {
  if (fields[FIELD_VOL_FLUX_Y] == 1) {
    const double source = vol_flux_y[OPS_ACC0(0, 4, 0)];
    vol_flux_y[OPS_ACC0(0, 0, 0)] = -source;
  }

  if (fields[FIELD_MASS_FLUX_Y] == 1) {
    const double source = mass_flux_y[OPS_ACC1(0, 4, 0)];
    mass_flux_y[OPS_ACC1(0, 0, 0)] = -source;
  }
}
// Row length used by OPS_ACC3 below.
int xdim3_revert_kernel;

// Flattened 2D index helpers for the revert_kernel arguments 0..3.
// OPS_ACCn(x, y) expands to the linear index of the point at relative offset
// (x, y) from the current loop position (n_x, n_y), for an array whose row
// length is xdimN_revert_kernel. The identifiers n_x, n_y and
// xdimN_revert_kernel must be in scope where the macro is used.
// Both offset arguments are parenthesized so that an argument containing a
// low-precedence operator (e.g. `c ? 1 : 0`) is not torn apart by the
// surrounding arithmetic.
#define OPS_ACC0(x, y)                                                         \
  (n_x * 1 + n_y * xdim0_revert_kernel * 1 + (x) + xdim0_revert_kernel * (y))
#define OPS_ACC1(x, y)                                                         \
  (n_x * 1 + n_y * xdim1_revert_kernel * 1 + (x) + xdim1_revert_kernel * (y))
#define OPS_ACC2(x, y)                                                         \
  (n_x * 1 + n_y * xdim2_revert_kernel * 1 + (x) + xdim2_revert_kernel * (y))
#define OPS_ACC3(x, y)                                                         \
  (n_x * 1 + n_y * xdim3_revert_kernel * 1 + (x) + xdim3_revert_kernel * (y))

// user function

// Sweeps an x_size * y_size index space and, at every point, copies
// density0 -> density1 and energy0 -> energy1. The outer (y) loop is
// distributed with OpenMP.
//   density0, energy0 - source fields
//   density1, energy1 - destination fields
//   x_size, y_size    - iteration extents in each direction
void revert_kernel_c_wrapper(const double *restrict density0,
                             double *restrict density1,
                             const double *restrict energy0,
                             double *restrict energy1, int x_size, int y_size) {
#pragma omp parallel for
  for (int n_y = 0; n_y < y_size; ++n_y) {
    for (int n_x = 0; n_x < x_size; ++n_x) {
      const double density_value = density0[OPS_ACC0(0, 0)];
      density1[OPS_ACC1(0, 0)] = density_value;

      const double energy_value = energy0[OPS_ACC2(0, 0)];
      energy1[OPS_ACC3(0, 0)] = energy_value;
    }
  }
}
#undef OPS_ACC0
#undef OPS_ACC1
#undef OPS_ACC2
#undef OPS_ACC3
// host stub function
// Runs the advec_mom_kernel1_z_nonvector loop body over the 3D index range
// stored in the descriptor.
//   desc - kernel descriptor supplying the block, dimension count, iteration
//          range and the five dataset arguments:
//            args[0] node_flux     (read)
//            args[1] node_mass_pre (read)
//            args[2] mom_flux      (written)
//            args[3] celldz        (read)
//            args[4] vel1          (read)
// Diagnostics for this loop are accumulated in OPS_kernels[137] whenever
// OPS_diags > 1.
void ops_par_loop_advec_mom_kernel1_z_nonvector_execute(
    ops_kernel_descriptor *desc) {
  // NOTE(review): `block` is not referenced anywhere in the visible body.
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg arg3 = desc->args[3];
  ops_arg arg4 = desc->args[4];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4};

#ifdef CHECKPOINTING
  // Skip the loop entirely when the checkpointing layer returns zero.
  if (!ops_checkpointing_before(args, 5, range, 137))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[137].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  // range is laid out as {start0, end0, start1, end1, start2, end2}; the
  // loops below treat each end as exclusive.
  int start[3];
  int end[3];

  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "advec_mom_kernel1_z_nonvector");
#endif

  // set up initial pointers and exchange halos if necessary
  // Each pointer is the argument's data pointer advanced by the dataset's
  // base_offset. NOTE(review): the offset is applied before the cast to
  // double *, so it is presumably a byte offset - confirm against the type of
  // ops_arg::data.
  int base0 = args[0].dat->base_offset;
  const double *__restrict__ node_flux = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1);

  int base2 = args[2].dat->base_offset;
  double *__restrict__ mom_flux = (double *)(args[2].data + base2);

  int base3 = args[3].dat->base_offset;
  const double *__restrict__ celldz = (double *)(args[3].data + base3);

  int base4 = args[4].dat->base_offset;
  const double *__restrict__ vel1 = (double *)(args[4].data + base4);

  // initialize global variable with the dimension of dats
  // These locals are not used directly below; presumably the OPS_ACCn macros
  // (defined outside this view) expand to expressions that reference them.
  int xdim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[0];
  int ydim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[1];
  int xdim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[0];
  int ydim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[1];
  int xdim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[0];
  int ydim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[1];
  int xdim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[0];
  int ydim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[1];
  int xdim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[0];
  int ydim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[1];

  if (OPS_diags > 1) {
    // Everything between the first timer call and here is booked as mpi_time.
    ops_timers_core(&c1, &t1);
    OPS_kernels[137].mpi_time += t1 - t2;
  }

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldz, vel1)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {

        double sigma, wind, width;
        double vdiffuw, vdiffdw, auw, adw, limiter;
        int upwind, donor, downwind, dif;
        double advec_vel_temp;

        // Choose the z-offsets of the upwind, donor and downwind points from
        // the sign of the flux at the current point; `dif` selects which
        // celldz entry is used in the limiter below.
        if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) {
          upwind = 2;
          donor = 1;
          downwind = 0;
          dif = donor;
        } else {
          upwind = -1;
          donor = 0;
          downwind = 1;
          dif = upwind;
        }

        // sigma: flux magnitude relative to the donor point's mass.
        sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) /
                node_mass_pre[OPS_ACC1(0, 0, donor)];
        width = celldz[OPS_ACC3(0, 0, 0)];
        // Velocity differences donor-upwind and downwind-donor.
        vdiffuw = vel1[OPS_ACC4(0, 0, donor)] - vel1[OPS_ACC4(0, 0, upwind)];
        vdiffdw = vel1[OPS_ACC4(0, 0, downwind)] - vel1[OPS_ACC4(0, 0, donor)];
        limiter = 0.0;
        // The limiter is non-zero only when both differences have the same
        // sign (their product is positive).
        if (vdiffuw * vdiffdw > 0.0) {
          auw = fabs(vdiffuw);
          adw = fabs(vdiffdw);
          wind = 1.0;
          if (vdiffdw <= 0.0)
            wind = -1.0;
          limiter =
              wind *
              MIN(width * ((2.0 - sigma) * adw / width +
                           (1.0 + sigma) * auw / celldz[OPS_ACC3(0, 0, dif)]) /
                      6.0,
                  MIN(auw, adw));
        }
        // Advected velocity = donor velocity plus limited correction; the
        // momentum flux is that velocity times the node flux.
        advec_vel_temp = vel1[OPS_ACC4(0, 0, donor)] + (1.0 - sigma) * limiter;
        mom_flux[OPS_ACC2(0, 0, 0)] =
            advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)];
      }
    }
  }
  if (OPS_diags > 1) {
    // Loop duration is booked as compute time.
    ops_timers_core(&c2, &t2);
    OPS_kernels[137].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    // NOTE(review): this timer call immediately follows the previous one, so
    // the mpi_time increment here only covers the gap between the two calls.
    ops_timers_core(&c1, &t1);
    OPS_kernels[137].mpi_time += t1 - t2;
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3);
    OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4);
  }
}
// Halo update for the two y-velocity fields. For every field whose flag in
// `fields` equals 1, the current point is overwritten with the negated value
// read at relative offset (0, 4, 0).
//   yvel0, yvel1 - velocity fields, modified in place
//   fields       - flag array indexed by FIELD_YVEL0 / FIELD_YVEL1
inline void update_halo_kernel2_yvel_minus_4_bot(double *yvel0, double *yvel1,
                                                 const int *fields) {
  if (fields[FIELD_YVEL0] == 1) {
    const double source = yvel0[OPS_ACC0(0, 4, 0)];
    yvel0[OPS_ACC0(0, 0, 0)] = -source;
  }

  if (fields[FIELD_YVEL1] == 1) {
    const double source = yvel1[OPS_ACC1(0, 4, 0)];
    yvel1[OPS_ACC1(0, 0, 0)] = -source;
  }
}
// Halo update for the two y-velocity fields. For every field whose flag in
// `fields` equals 1, the current point is overwritten with the (unchanged)
// value read at relative offset (0, 0, -2).
//   yvel0, yvel1 - velocity fields, modified in place
//   fields       - flag array indexed by FIELD_YVEL0 / FIELD_YVEL1
inline void update_halo_kernel2_yvel_plus_2_front(double *yvel0, double *yvel1,
                                                  const int *fields) {
  if (fields[FIELD_YVEL0] == 1) {
    const double source = yvel0[OPS_ACC0(0, 0, -2)];
    yvel0[OPS_ACC0(0, 0, 0)] = source;
  }

  if (fields[FIELD_YVEL1] == 1) {
    const double source = yvel1[OPS_ACC1(0, 0, -2)];
    yvel1[OPS_ACC1(0, 0, 0)] = source;
  }
}
// user function
// In-place update u = u + alpha * p at the current point.
//   u     - field updated in place
//   p     - field read at offset (0, 0)
//   alpha - pointer to the scalar multiplier
inline void tea_leaf_axpy_kernel(double *u, const double *p,
                                 const double *alpha) {
  const double u_here = u[OPS_ACC0(0, 0)];
  const double scale = *alpha;
  const double p_here = p[OPS_ACC1(0, 0)];

  u[OPS_ACC0(0, 0)] = u_here + scale * p_here;
}
// Halo update for the two x-velocity fields. For every field whose flag in
// `fields` equals 1, the current point is overwritten with the (unchanged)
// value read at relative offset (0, -2, 0).
//   xvel0, xvel1 - velocity fields, modified in place
//   fields       - flag array indexed by FIELD_XVEL0 / FIELD_XVEL1
inline void update_halo_kernel2_xvel_plus_2_top(double *xvel0, double *xvel1,
                                                const int *fields) {
  if (fields[FIELD_XVEL0] == 1) {
    const double source = xvel0[OPS_ACC0(0, -2, 0)];
    xvel0[OPS_ACC0(0, 0, 0)] = source;
  }

  if (fields[FIELD_XVEL1] == 1) {
    const double source = xvel1[OPS_ACC1(0, -2, 0)];
    xvel1[OPS_ACC1(0, 0, 0)] = source;
  }
}
// Halo update for the y-direction flux fields. For every field whose flag in
// `fields` equals 1, the current point receives the (unchanged) value read at
// relative offset (0, 0, 2).
//   vol_flux_y, mass_flux_y - flux fields, modified in place
//   fields                  - flag array indexed by FIELD_VOL_FLUX_Y /
//                             FIELD_MASS_FLUX_Y
inline void update_halo_kernel4_plus_2_back(double *vol_flux_y,
                                            double *mass_flux_y,
                                            const int *fields) {
  if (fields[FIELD_VOL_FLUX_Y] == 1) {
    const double source = vol_flux_y[OPS_ACC0(0, 0, 2)];
    vol_flux_y[OPS_ACC0(0, 0, 0)] = source;
  }

  if (fields[FIELD_MASS_FLUX_Y] == 1) {
    const double source = mass_flux_y[OPS_ACC1(0, 0, 2)];
    mass_flux_y[OPS_ACC1(0, 0, 0)] = source;
  }
}
// user function
// Initialises energy, density and the two velocity components at the current
// point from the global `states` table (2D variant).
//
// State 0 is written unconditionally as the background. Each further state
// (1 .. number_of_states-1) may override it, depending on the state's
// geometry (g_rect, g_circ or g_point). Two separate membership flags are
// evaluated per state:
//   is_in2 - test at the current offsets only; gates energy0/density0
//   is_in  - true if the test passes for any of the four offset pairs
//            (i1, j1) in {-1,0} x {-1,0}; gates xvel0/yvel0
inline void generate_chunk_kernel(const double *vertexx, const double *vertexy,
                                  double *energy0, double *density0,
                                  double *xvel0, double *yvel0,
                                  const double *cellx, const double *celly) {

  double radius, x_cent, y_cent;
  int is_in = 0;
  int is_in2 = 0;

  // Background state.
  energy0[OPS_ACC2(0, 0)] = states[0].energy;
  density0[OPS_ACC3(0, 0)] = states[0].density;
  xvel0[OPS_ACC4(0, 0)] = states[0].xvel;
  yvel0[OPS_ACC5(0, 0)] = states[0].yvel;

  for (int i = 1; i < number_of_states; i++) {

    // The state's (xmin, ymin) doubles as the centre for the circle and
    // point geometries.
    x_cent = states[i].xmin;
    y_cent = states[i].ymin;
    is_in = 0;
    is_in2 = 0;

    if (states[i].geometry == g_rect) {
      // Overlap test of the vertex interval against [min, max), repeated for
      // the current point and its -1 neighbours in x and y.
      for (int i1 = -1; i1 <= 0; i1++) {
        for (int j1 = -1; j1 <= 0; j1++) {
          if (vertexx[OPS_ACC0(1 + i1, 0)] >= states[i].xmin &&
              vertexx[OPS_ACC0(0 + i1, 0)] < states[i].xmax) {
            if (vertexy[OPS_ACC1(0, 1 + j1)] >= states[i].ymin &&
                vertexy[OPS_ACC1(0, 0 + j1)] < states[i].ymax) {
              is_in = 1;
            }
          }
        }
      }
      // Same test for the current point only.
      if (vertexx[OPS_ACC0(1, 0)] >= states[i].xmin &&
          vertexx[OPS_ACC0(0, 0)] < states[i].xmax) {
        if (vertexy[OPS_ACC1(0, 1)] >= states[i].ymin &&
            vertexy[OPS_ACC1(0, 0)] < states[i].ymax) {
          is_in2 = 1;
        }
      }
      if (is_in2) {
        energy0[OPS_ACC2(0, 0)] = states[i].energy;
        density0[OPS_ACC3(0, 0)] = states[i].density;
      }
      if (is_in) {
        xvel0[OPS_ACC4(0, 0)] = states[i].xvel;
        yvel0[OPS_ACC5(0, 0)] = states[i].yvel;
      }
    } else if (states[i].geometry == g_circ) {
      for (int i1 = -1; i1 <= 0; i1++) {
        for (int j1 = -1; j1 <= 0; j1++) {
          radius = sqrt((cellx[OPS_ACC6(i1, 0)] - x_cent) *
                            (cellx[OPS_ACC6(i1, 0)] - x_cent) +
                        (celly[OPS_ACC7(0, j1)] - y_cent) *
                            (celly[OPS_ACC7(0, j1)] - y_cent));
          if (radius <= states[i].radius) {
            is_in = 1;
          }
        }
      }
      // `radius` still holds the value from the final loop iteration
      // (i1 = 0, j1 = 0), i.e. the distance for the current point itself.
      // This test therefore depends on the loop order above.
      if (radius <= states[i].radius)
        is_in2 = 1;

      if (is_in2) {
        energy0[OPS_ACC2(0, 0)] = states[i].energy;
        density0[OPS_ACC3(0, 0)] = states[i].density;
      }

      if (is_in) {
        xvel0[OPS_ACC4(0, 0)] = states[i].xvel;
        yvel0[OPS_ACC5(0, 0)] = states[i].yvel;
      }
    } else if (states[i].geometry == g_point) {
      // Exact floating-point equality against the state's (xmin, ymin).
      for (int i1 = -1; i1 <= 0; i1++) {
        for (int j1 = -1; j1 <= 0; j1++) {
          if (vertexx[OPS_ACC0(i1, 0)] == x_cent &&
              vertexy[OPS_ACC1(0, j1)] == y_cent) {
            is_in = 1;
          }
        }
      }
      if (vertexx[OPS_ACC0(0, 0)] == x_cent &&
          vertexy[OPS_ACC1(0, 0)] == y_cent)
        is_in2 = 1;

      if (is_in2) {
        energy0[OPS_ACC2(0, 0)] = states[i].energy;
        density0[OPS_ACC3(0, 0)] = states[i].density;
      }

      if (is_in) {
        xvel0[OPS_ACC4(0, 0)] = states[i].xvel;
        yvel0[OPS_ACC5(0, 0)] = states[i].yvel;
      }
    }
  }
}
// Halo update for the x-direction flux fields. For every field whose flag in
// `fields` equals 1, the current point receives the (unchanged) value read at
// relative offset (0, 0, -4).
//   vol_flux_x, mass_flux_x - flux fields, modified in place
//   fields                  - flag array indexed by FIELD_VOL_FLUX_X /
//                             FIELD_MASS_FLUX_X
inline void update_halo_kernel3_plus_4_front(double *vol_flux_x,
                                             double *mass_flux_x,
                                             const int *fields) {
  if (fields[FIELD_VOL_FLUX_X] == 1) {
    const double source = vol_flux_x[OPS_ACC0(0, 0, -4)];
    vol_flux_x[OPS_ACC0(0, 0, 0)] = source;
  }

  if (fields[FIELD_MASS_FLUX_X] == 1) {
    const double source = mass_flux_x[OPS_ACC1(0, 0, -4)];
    mass_flux_x[OPS_ACC1(0, 0, 0)] = source;
  }
}
// Computes the y-direction mass flux and energy flux at the current point
// from the volume flux, using a limited correction to the donor values.
//   vol_flux_y  - input volume flux, read at (0, 0)
//   pre_vol     - input volume, read at the donor offset
//   yy          - integer field, read at offset (0, 1) to detect the upper
//                 end of the range (compared against field.y_max)
//   vertexdy    - input spacing, read at (0, 0) and at the `dif` offset
//   density1    - input density, read at upwind/donor/downwind offsets
//   energy1     - input energy, read at upwind/donor/downwind offsets
//   mass_flux_y - output, written at (0, 0) and read back
//   ener_flux   - output, written at (0, 0)
// Reads the global `field.y_max`; MIN and SIGN are macros defined elsewhere.
inline void advec_cell_kernel3_ydir(const double *vol_flux_y,
                                    const double *pre_vol, const int *yy,
                                    const double *vertexdy,
                                    const double *density1,
                                    const double *energy1, double *mass_flux_y,
                                    double *ener_flux) {

  double sigmat, sigmav, sigmam, sigma3, sigma4;
  double diffuw, diffdw, limiter;
  double one_by_six = 1.0 / 6.0;

  int y_max = field.y_max;

  int upwind, donor, downwind, dif;

  // Pick the y-offsets of the upwind, donor and downwind points from the
  // sign of the volume flux. For non-positive flux the upwind offset is +1
  // unless the yy test fails, in which case upwind collapses onto the donor
  // (offset 0).
  if (vol_flux_y[OPS_ACC0(0, 0)] > 0.0) {
    upwind = -2;
    donor = -1;
    downwind = 0;
    dif = donor;
  } else if (yy[OPS_ACC2(0, 1)] < y_max + 2 - 2) {
    // `y_max + 2 - 2` evaluates to y_max.
    upwind = 1;
    donor = 0;
    downwind = -1;
    dif = upwind;
  } else {
    upwind = 0;
    donor = 0;
    downwind = -1;
    dif = upwind;
  }

  // sigmat: volume flux magnitude relative to the donor volume.
  sigmat = fabs(vol_flux_y[OPS_ACC0(0, 0)]) / pre_vol[OPS_ACC1(0, donor)];
  sigma3 =
      (1.0 + sigmat) * (vertexdy[OPS_ACC3(0, 0)] / vertexdy[OPS_ACC3(0, dif)]);
  sigma4 = 2.0 - sigmat;

  sigmav = sigmat;

  // Density differences donor-upwind and downwind-donor.
  diffuw = density1[OPS_ACC4(0, donor)] - density1[OPS_ACC4(0, upwind)];
  diffdw = density1[OPS_ACC4(0, downwind)] - density1[OPS_ACC4(0, donor)];

  // The limiter is non-zero only when both differences have the same sign.
  if ((diffuw * diffdw) > 0.0)
    limiter = (1.0 - sigmav) * SIGN(1.0, diffdw) *
              MIN(MIN(fabs(diffuw), fabs(diffdw)),
                  one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw)));
  else
    limiter = 0.0;

  mass_flux_y[OPS_ACC6(0, 0)] =
      (vol_flux_y[OPS_ACC0(0, 0)]) * (density1[OPS_ACC4(0, donor)] + limiter);

  // sigmam: mass flux magnitude relative to the donor mass
  // (density * volume); the same limiter form is then applied to energy.
  sigmam = fabs(mass_flux_y[OPS_ACC6(0, 0)]) /
           (density1[OPS_ACC4(0, donor)] * pre_vol[OPS_ACC1(0, donor)]);
  diffuw = energy1[OPS_ACC5(0, donor)] - energy1[OPS_ACC5(0, upwind)];
  diffdw = energy1[OPS_ACC5(0, downwind)] - energy1[OPS_ACC5(0, donor)];

  if ((diffuw * diffdw) > 0.0)
    limiter = (1.0 - sigmam) * SIGN(1.0, diffdw) *
              MIN(MIN(fabs(diffuw), fabs(diffdw)),
                  one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw)));
  else
    limiter = 0.0;

  ener_flux[OPS_ACC7(0, 0)] =
      mass_flux_y[OPS_ACC6(0, 0)] * (energy1[OPS_ACC5(0, donor)] + limiter);
}
// host stub function
// Runs the ideal_gas_kernel loop body over the 2D index range stored in the
// descriptor.
//   desc - kernel descriptor supplying the block, dimension count, iteration
//          range and the four dataset arguments:
//            args[0] density    (read)
//            args[1] energy     (read)
//            args[2] pressure   (written, then read back)
//            args[3] soundspeed (written)
// Diagnostics for this loop are accumulated in OPS_kernels[8] whenever
// OPS_diags > 1.
void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) {
  // NOTE(review): `block` is not referenced anywhere in the visible body.
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg arg3 = desc->args[3];

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[4] = {arg0, arg1, arg2, arg3};

#ifdef CHECKPOINTING
  // Skip the loop entirely when the checkpointing layer returns zero.
  if (!ops_checkpointing_before(args, 4, range, 8))
    return;
#endif

  if (OPS_diags > 1) {
    OPS_kernels[8].count++;
    ops_timers_core(&c2, &t2);
  }

  // compute locally allocated range for the sub-block
  // range is laid out as {start0, end0, start1, end1}; the loops below treat
  // each end as exclusive.
  int start[2];
  int end[2];

  for (int n = 0; n < 2; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }

#ifdef OPS_DEBUG
  ops_register_args(args, "ideal_gas_kernel");
#endif

  // set up initial pointers and exchange halos if necessary
  // Each pointer is the argument's data pointer advanced by the dataset's
  // base_offset. NOTE(review): the offset is applied before the cast to
  // double *, so it is presumably a byte offset - confirm against the type of
  // ops_arg::data.
  int base0 = args[0].dat->base_offset;
  const double *__restrict__ density = (double *)(args[0].data + base0);

  int base1 = args[1].dat->base_offset;
  const double *__restrict__ energy = (double *)(args[1].data + base1);

  int base2 = args[2].dat->base_offset;
  double *__restrict__ pressure = (double *)(args[2].data + base2);

  int base3 = args[3].dat->base_offset;
  double *__restrict__ soundspeed = (double *)(args[3].data + base3);

  // initialize global variable with the dimension of dats
  // These locals are not used directly below; presumably the OPS_ACCn macros
  // (defined outside this view) expand to expressions that reference them.
  int xdim0_ideal_gas_kernel = args[0].dat->size[0];
  int xdim1_ideal_gas_kernel = args[1].dat->size[0];
  int xdim2_ideal_gas_kernel = args[2].dat->size[0];
  int xdim3_ideal_gas_kernel = args[3].dat->size[0];

  if (OPS_diags > 1) {
    // Everything between the first timer call and here is booked as mpi_time.
    ops_timers_core(&c1, &t1);
    OPS_kernels[8].mpi_time += t1 - t2;
  }

#pragma omp parallel for
  for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(density, energy, pressure, soundspeed)
#else
#pragma simd
#endif
    for (int n_x = start[0]; n_x < end[0]; n_x++) {

      double sound_speed_squared, v, pressurebyenergy, pressurebyvolume;

      // v is the reciprocal of density.
      v = 1.0 / density[OPS_ACC0(0, 0)];
      // pressure = (1.4 - 1) * density * energy
      pressure[OPS_ACC2(0, 0)] =
          (1.4 - 1.0) * density[OPS_ACC0(0, 0)] * energy[OPS_ACC1(0, 0)];
      pressurebyenergy = (1.4 - 1.0) * density[OPS_ACC0(0, 0)];
      pressurebyvolume =
          -1 * density[OPS_ACC0(0, 0)] * pressure[OPS_ACC2(0, 0)];
      sound_speed_squared =
          v * v *
          (pressure[OPS_ACC2(0, 0)] * pressurebyenergy - pressurebyvolume);
      soundspeed[OPS_ACC3(0, 0)] = sqrt(sound_speed_squared);
    }
  }
  if (OPS_diags > 1) {
    // Loop duration is booked as compute time.
    ops_timers_core(&c2, &t2);
    OPS_kernels[8].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    // NOTE(review): this timer call immediately follows the previous one, so
    // the mpi_time increment here only covers the gap between the two calls.
    ops_timers_core(&c1, &t1);
    OPS_kernels[8].mpi_time += t1 - t2;
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3);
  }
}
    const double *restrict mass_flux_y, const double *restrict vol_flux_y,
    const double *restrict pre_vol, const double *restrict post_vol,
    double *restrict pre_mass, double *restrict post_mass,
    double *restrict advec_vol, double *restrict post_ener,
    const double *restrict ener_flux, int x_size, int y_size) {
#pragma omp parallel for
  for (int n_y = 0; n_y < y_size; n_y++) {
    for (int n_x = 0; n_x < x_size; n_x++) {

      pre_mass[OPS_ACC6(0, 0)] =
          density1[OPS_ACC0(0, 0)] * pre_vol[OPS_ACC4(0, 0)];
      post_mass[OPS_ACC7(0, 0)] = pre_mass[OPS_ACC6(0, 0)] +
                                  mass_flux_y[OPS_ACC2(0, 0)] -
                                  mass_flux_y[OPS_ACC2(0, 1)];
      post_ener[OPS_ACC9(0, 0)] =
          (energy1[OPS_ACC1(0, 0)] * pre_mass[OPS_ACC6(0, 0)] +
           ener_flux[OPS_ACC10(0, 0)] - ener_flux[OPS_ACC10(0, 1)]) /
          post_mass[OPS_ACC7(0, 0)];
      advec_vol[OPS_ACC8(0, 0)] = pre_vol[OPS_ACC4(0, 0)] +
                                  vol_flux_y[OPS_ACC3(0, 0)] -
                                  vol_flux_y[OPS_ACC3(0, 1)];
      density1[OPS_ACC0(0, 0)] =
          post_mass[OPS_ACC7(0, 0)] / advec_vol[OPS_ACC8(0, 0)];
      energy1[OPS_ACC1(0, 0)] = post_ener[OPS_ACC9(0, 0)];
    }
  }
}
#undef OPS_ACC0
#undef OPS_ACC1
#undef OPS_ACC2
#undef OPS_ACC3
// user function
// Adds the squared difference between u and ref at the current point to the
// accumulator *err.
//   u   - field read at offset (0, 0)
//   ref - reference field read at offset (0, 0)
//   err - running sum, updated in place
inline void poisson_kernel_error(const double *u, const double *ref,
                                 double *err) {
  const double diff = u[OPS_ACC0(0, 0)] - ref[OPS_ACC1(0, 0)];

  *err = *err + diff * diff;
}
// user function
// Updates r = r - alpha * w at the current point, then adds the square of the
// updated r to the accumulator *rnn.
//   r     - field updated in place
//   w     - field read at offset (0, 0)
//   alpha - pointer to the scalar multiplier
//   rnn   - running sum of r * r, updated in place
inline void tea_leaf_cg_calc_ur_r_reduce_kernel(double *r, const double *w,
                                                const double *alpha,
                                                double *rnn) {
  const double r_old = r[OPS_ACC0(0, 0)];
  const double scale = *alpha;
  const double w_here = w[OPS_ACC1(0, 0)];
  r[OPS_ACC0(0, 0)] = r_old - scale * w_here;

  // Re-read the stored value so the accumulated square matches what is in r.
  const double r_new = r[OPS_ACC0(0, 0)];
  *rnn = *rnn + r_new * r_new;
}
// user function
// Copies the value of u2 at the current point into u.
//   u2 - source field, read at offset (0, 0)
//   u  - destination field, written at offset (0, 0)
inline void poisson_kernel_update(const double *u2, double *u) {
  const double value = u2[OPS_ACC0(0, 0)];
  u[OPS_ACC1(0, 0)] = value;
}
// Halo update for the two z-velocity fields. For every field whose flag in
// `fields` equals 1, the current point is overwritten with the negated value
// read at relative offset (0, 0, 2).
//   zvel0, zvel1 - velocity fields, modified in place
//   fields       - flag array indexed by FIELD_ZVEL0 / FIELD_ZVEL1
inline void update_halo_kernel2_zvel_minus_2_back(double *zvel0, double *zvel1,
                                                  const int *fields) {
  if (fields[FIELD_ZVEL0] == 1) {
    const double source = zvel0[OPS_ACC0(0, 0, 2)];
    zvel0[OPS_ACC0(0, 0, 0)] = -source;
  }

  if (fields[FIELD_ZVEL1] == 1) {
    const double source = zvel1[OPS_ACC1(0, 0, 2)];
    zvel1[OPS_ACC1(0, 0, 0)] = -source;
  }
}
   n_z * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * 1 + x +           \
   xdim3_flux_calc_kernelx * (y) +                                             \
   xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * (z))

// user function

// Sweeps an x_size * y_size * z_size index space and computes the x-direction
// volume flux at every point as
//   0.125 * dt * xarea * (sum of xvel0 and xvel1 over the four points at
//                         offsets (0, {0,1}, {0,1}))
// The outermost (z) loop is distributed with OpenMP; `dt` is a global defined
// elsewhere.
//   vol_flux_x   - output field
//   xarea        - input field, read at the current point
//   xvel0, xvel1 - input velocity fields
//   x_size, y_size, z_size - iteration extents in each direction
void flux_calc_kernelx_c_wrapper(double *restrict vol_flux_x,
                                 const double *restrict xarea,
                                 const double *restrict xvel0,
                                 const double *restrict xvel1, int x_size,
                                 int y_size, int z_size) {
#pragma omp parallel for
  for (int n_z = 0; n_z < z_size; ++n_z) {
    for (int n_y = 0; n_y < y_size; ++n_y) {
      for (int n_x = 0; n_x < x_size; ++n_x) {
        // Summation order is kept exactly as in the generated expression.
        const double velocity_sum =
            xvel0[OPS_ACC2(0, 0, 0)] + xvel0[OPS_ACC2(0, 1, 0)] +
            xvel0[OPS_ACC2(0, 0, 1)] + xvel0[OPS_ACC2(0, 1, 1)] +
            xvel1[OPS_ACC3(0, 0, 0)] + xvel1[OPS_ACC3(0, 1, 0)] +
            xvel1[OPS_ACC3(0, 0, 1)] + xvel1[OPS_ACC3(0, 1, 1)];
        const double face_area = xarea[OPS_ACC1(0, 0, 0)];

        vol_flux_x[OPS_ACC0(0, 0, 0)] = 0.125 * dt * face_area * velocity_sum;
      }
    }
  }
}
#undef OPS_ACC0
#undef OPS_ACC1
#undef OPS_ACC2
#undef OPS_ACC3
// Updates vel1 in place at offset (0, 0, 0):
//   vel1 = (vel1 * node_mass_pre + mom_flux(0,-1,0) - mom_flux(0,0,0))
//          / node_mass_post
// where node_mass_pre and node_mass_post are read at offset (0, 0, 0).
inline void advec_mom_kernel2_y(double *vel1, const double *node_mass_post,
                                const double *node_mass_pre,
                                const double *mom_flux) {
  const double numerator =
      vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] +
      mom_flux[OPS_ACC3(0, -1, 0)] - mom_flux[OPS_ACC3(0, 0, 0)];

  vel1[OPS_ACC0(0, 0, 0)] = numerator / node_mass_post[OPS_ACC1(0, 0, 0)];
}
// host stub function
//
// Runs the update_halo_kernel5_plus_4_a loop described by `desc`.
// For every point of the 3D range in desc->range, and for each of
// vol_flux_z / mass_flux_z whose flag in the `fields` argument equals 1, the
// value at offset (0, 4, 0) is copied onto the current point.
// When OPS_diags > 1, the call count, timings and transfer amounts are
// accumulated into OPS_kernels[84].
// NOTE(review): the OPS_ACCn macros appear to reference n_x / n_y / n_z and
// the xdimN_* / ydimN_* locals by name, so those identifiers are kept as-is.
void ops_par_loop_update_halo_kernel5_plus_4_a_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg args[3] = {arg0, arg1, arg2};

  // Scratch values filled in by ops_timers_core
  double c1, c2;
  double t1, t2;

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 3, range, 84)) {
    return;
  }
#endif

  if (OPS_diags > 1) {
    OPS_kernels[84].count++;
    ops_timers_core(&c2, &t2);
  }

  // Iteration bounds: range holds (start, end) pairs, one pair per dimension
  int start[3] = {range[0], range[2], range[4]};
  int end[3] = {range[1], range[3], range[5]};

#ifdef OPS_DEBUG
  ops_register_args(args, "update_halo_kernel5_plus_4_a");
#endif

  // Data pointers, each shifted by the dat's base offset
  int vol_flux_z_offset = args[0].dat->base_offset;
  double *__restrict__ vol_flux_z =
      (double *)(args[0].data + vol_flux_z_offset);

  int mass_flux_z_offset = args[1].dat->base_offset;
  double *__restrict__ mass_flux_z =
      (double *)(args[1].data + mass_flux_z_offset);

  // The third argument is used directly, without any offset
  const int *__restrict__ fields = (int *)args[2].data;

  // Sizes of the first two dimensions of each dat
  int xdim0_update_halo_kernel5_plus_4_a = args[0].dat->size[0];
  int ydim0_update_halo_kernel5_plus_4_a = args[0].dat->size[1];
  int xdim1_update_halo_kernel5_plus_4_a = args[1].dat->size[0];
  int ydim1_update_halo_kernel5_plus_4_a = args[1].dat->size[1];

  if (OPS_diags > 1) {
    // Time elapsed since the first timer reading is booked as mpi_time
    ops_timers_core(&c1, &t1);
    OPS_kernels[84].mpi_time += t1 - t2;
  }

#pragma omp parallel for collapse(2)
  for (int n_z = start[2]; n_z < end[2]; n_z++) {
    for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(vol_flux_z, mass_flux_z)
#else
#pragma simd
#endif
      for (int n_x = start[0]; n_x < end[0]; n_x++) {

        if (fields[FIELD_VOL_FLUX_Z] == 1) {
          vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, 4, 0)];
        }
        if (fields[FIELD_MASS_FLUX_Z] == 1) {
          mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, 4, 0)];
        }
      }
    }
  }

  if (OPS_diags > 1) {
    // Time spent in the loop nest above
    ops_timers_core(&c2, &t2);
    OPS_kernels[84].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record (transfer is only accounted for arg0 and arg1)
    ops_timers_core(&c1, &t1);
    OPS_kernels[84].mpi_time += t1 - t2;
    OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1);
  }
}
// host stub function
//
// Runs the initialise_chunk_kernel_cellx loop described by `desc`.
// For every point of the 2D range in desc->range:
//   cellx  = 0.5 * (vertexx at offset (0, 0) + vertexx at offset (1, 0))
//   celldx = (grid.xmax - grid.xmin) / grid.x_cells
// When OPS_diags > 1, the call count, timings and transfer amounts are
// accumulated into OPS_kernels[12].
// NOTE(review): the OPS_ACCn macros appear to reference n_x / n_y and the
// xdimN_* locals by name, so those identifiers are kept as-is.
void ops_par_loop_initialise_chunk_kernel_cellx_execute(
    ops_kernel_descriptor *desc) {
  ops_block block = desc->block;
  int dim = desc->dim;
  int *range = desc->range;
  ops_arg arg0 = desc->args[0];
  ops_arg arg1 = desc->args[1];
  ops_arg arg2 = desc->args[2];
  ops_arg args[3] = {arg0, arg1, arg2};

  // Scratch values filled in by ops_timers_core
  double c1, c2;
  double t1, t2;

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 3, range, 12)) {
    return;
  }
#endif

  if (OPS_diags > 1) {
    OPS_kernels[12].count++;
    ops_timers_core(&c2, &t2);
  }

  // Iteration bounds: range holds (start, end) pairs, one pair per dimension
  int start[2] = {range[0], range[2]};
  int end[2] = {range[1], range[3]};

#ifdef OPS_DEBUG
  ops_register_args(args, "initialise_chunk_kernel_cellx");
#endif

  // Data pointers, each shifted by the dat's base offset
  int vertexx_offset = args[0].dat->base_offset;
  const double *__restrict__ vertexx =
      (double *)(args[0].data + vertexx_offset);

  int cellx_offset = args[1].dat->base_offset;
  double *__restrict__ cellx = (double *)(args[1].data + cellx_offset);

  int celldx_offset = args[2].dat->base_offset;
  double *__restrict__ celldx = (double *)(args[2].data + celldx_offset);

  // Size of the first dimension of each dat
  int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0];
  int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0];
  int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0];

  if (OPS_diags > 1) {
    // Time elapsed since the first timer reading is booked as mpi_time
    ops_timers_core(&c1, &t1);
    OPS_kernels[12].mpi_time += t1 - t2;
  }

#pragma omp parallel for
  for (int n_y = start[1]; n_y < end[1]; n_y++) {
#ifdef intel
#pragma loop_count(10000)
#pragma omp simd aligned(vertexx, cellx, celldx)
#else
#pragma simd
#endif
    for (int n_x = start[0]; n_x < end[0]; n_x++) {

      // Same value on every iteration: extent of grid in x over x_cells
      const double d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells;

      cellx[OPS_ACC1(0, 0)] =
          0.5 * (vertexx[OPS_ACC0(0, 0)] + vertexx[OPS_ACC0(1, 0)]);
      celldx[OPS_ACC2(0, 0)] = d_x;
    }
  }

  if (OPS_diags > 1) {
    // Time spent in the loop nest above
    ops_timers_core(&c2, &t2);
    OPS_kernels[12].time += t2 - t1;
  }

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c1, &t1);
    OPS_kernels[12].mpi_time += t1 - t2;
    OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2);
  }
}