/**
 * Perform one explicit time step of the two-phase 3D level-set update.
 *
 * For every interior grid point a 3x3x3 window of phi is assembled, first
 * and second finite differences are taken from it, the curvature term K is
 * formed, and the resulting update speed is written through CMP().  After
 * neumann_bc() has been applied to curvature_motion_part, every grid point
 * is advanced with PHI += CMP * dt.
 *
 * PHI(), U0() and CMP() are macros defined elsewhere; presumably they index
 * phi, u0 and curvature_motion_part respectively -- confirm against their
 * definitions.
 *
 * @param phi                   level-set function, updated in place
 * @param u0                    input data compared against c1 and c2
 * @param curvature_motion_part buffer receiving the per-point update speed
 * @param dt                    time step applied in the final update
 * @param c1                    value subtracted from u0 in the lambda1 term
 * @param c2                    value subtracted from u0 in the lambda2 term
 */
void two_phase_3d_op_explicit(double phi[M][N][P],
			      const double u0[M][N][P],
			      double curvature_motion_part[M][N][P],
			      double dt, double c1, double c2)
{
	double mu = TP_MU;
	double lambda1 = TP_LAMBDA1;
	double lambda2 = TP_LAMBDA2;

	double dx = 1.0;
	double dy = 1.0;
	double dz = 1.0;

	double dx2 = dx * 2.0;
	double dy2 = dy * 2.0;
	double dz2 = dz * 2.0;

	double Dx_p, Dx_m;
	double Dy_p, Dy_m;
	double Dz_p, Dz_m;
	double Dx_0, Dy_0, Dz_0;

	double Dxx, Dyy, Dzz;
	double Dxy, Dxz, Dyz;

	double Grad, K;

	/* stencil[a][b][c] holds PHI(i - 1 + a, j - 1 + b, k - 1 + c) */
	double stencil[3][3][3];
#pragma AP array_partition variable=stencil complete dim=0

	double numer, denom;

	uint32_t i, j, k, a, b;
	uint32_t ii, jj;

	for (i = 1; i < M - 1; i++) {
		for (j = 1; j < N - 1; j++) {
			for (k = 1; k < P - 1; k++) {
#pragma AP pipeline
				/*
				 * Sliding window along k: shift the two newest
				 * k-planes down by one and load plane k + 1.
				 */
				for (a = 0; a < 3; a++) {
#pragma AP unroll
					for (b = 0; b < 3; b++) {
#pragma AP unroll
						ii = i + a - 1;
						jj = j + b - 1;

						/*
						 * At the start of every k-row
						 * the window still holds stale
						 * (or never written) data, so
						 * preload planes 0 and 1; the
						 * shift below then leaves
						 * planes 0, 1, 2 in place.
						 */
						if (k == 1) {
							stencil[a][b][1] =
							    PHI(ii, jj, 0);
							stencil[a][b][2] =
							    PHI(ii, jj, 1);
						}

						stencil[a][b][0] =
						    stencil[a][b][1];
						stencil[a][b][1] =
						    stencil[a][b][2];
						stencil[a][b][2] =
						    PHI(ii, jj, k + 1);
					}
				}

				/* one-sided first differences */
				Dx_p =
				    (stencil[2][1][1] - stencil[1][1][1]) / dx;
				Dx_m =
				    (stencil[1][1][1] - stencil[0][1][1]) / dx;
				Dy_p =
				    (stencil[1][2][1] - stencil[1][1][1]) / dy;
				Dy_m =
				    (stencil[1][1][1] - stencil[1][0][1]) / dy;
				Dz_p =
				    (stencil[1][1][2] - stencil[1][1][1]) / dz;
				Dz_m =
				    (stencil[1][1][1] - stencil[1][1][0]) / dz;

				/* central first differences */
				Dx_0 =
				    (stencil[2][1][1] - stencil[0][1][1]) / dx2;
				Dy_0 =
				    (stencil[1][2][1] - stencil[1][0][1]) / dy2;
				Dz_0 =
				    (stencil[1][1][2] - stencil[1][1][0]) / dz2;

				/* pure second differences */
				Dxx = (Dx_p - Dx_m) / dx;
				Dyy = (Dy_p - Dy_m) / dy;
				Dzz = (Dz_p - Dz_m) / dz;

				/*
				 * Mixed second differences: (+,+) - (+,-)
				 * - (-,+) + (-,-) over the four diagonal
				 * neighbours in the respective plane.
				 */
				Dxy =
				    (stencil[2][2][1] - stencil[2][0][1] -
				     stencil[0][2][1] +
				     stencil[0][0][1]) / (4 * dx * dy);
				Dxz =
				    (stencil[2][1][2] - stencil[2][1][0] -
				     stencil[0][1][2] +
				     stencil[0][1][0]) / (4 * dx * dz);
				Dyz =
				    (stencil[1][2][2] - stencil[1][2][0] -
				     stencil[1][0][2] +
				     stencil[1][0][0]) / (4 * dy * dz);

				/* squared gradient magnitude */
				Grad = (SQR(Dx_0) + SQR(Dy_0) + SQR(Dz_0));

				/*
				 * denom = Grad^1.5: cube here, square root
				 * below.
				 *
				 * NOTE(review): q3_sqrt is defined elsewhere
				 * and the call is kept exactly as it was.  If
				 * it returns the root instead of updating its
				 * argument, the result must be assigned back
				 * to denom -- confirm.
				 */
				denom = Grad * Grad * Grad;
				q3_sqrt(denom);

				numer = (Dx_0 * Dx_0 * Dyy -
					 2.0 * Dx_0 * Dy_0 * Dxy +
					 Dy_0 * Dy_0 * Dxx + Dx_0 * Dx_0 * Dzz -
					 2.0 * Dx_0 * Dz_0 * Dxz +
					 Dz_0 * Dz_0 * Dxx + Dy_0 * Dy_0 * Dzz -
					 2.0 * Dy_0 * Dz_0 * Dyz +
					 Dz_0 * Dz_0 * Dyy);

				/*
				 * A vanishing gradient makes numer and denom
				 * both zero; use K = 0 there instead of
				 * producing NaN, which would spread through
				 * phi on later steps.
				 */
				K = (denom != 0.0) ? numer / denom : 0.0;

				CMP(i, j, k) =
				    Grad * (mu * K +
					    lambda1 * (U0(i, j, k) -
						       c1) * (U0(i, j,
								 k) - c1) -
					    lambda2 * (U0(i, j, k) -
						       c2) * (U0(i, j,
								 k) - c2));
			}
		}
	}

	neumann_bc(curvature_motion_part);

	/* explicit update over the whole grid, boundary included */
	for (k = 0; k < P; k++) {
		for (j = 0; j < N; j++) {
			for (i = 0; i < M; i++) {
#pragma AP pipeline
				PHI(i, j, k) += CMP(i, j, k) * dt;
			}
		}
	}
}
/**
  *
  * Compute the bicubic interpolation of a point in an image.
  * Detect if the point goes outside the image domain.
  *
  * The 4x4 neighbourhood around (uu, vv) is gathered from input, with every
  * index first passed through the boundary handler selected by
  * DEFAULT_BICUBIC_BOUNDARY_CONDITION, and then handed to
  * bicubic_interpolation_cell() together with the offsets uu-x and vv-y.
  * An unrecognised boundary condition is treated as Neumann.
  *
  * Returns 0 when border_out is true and the boundary handlers set the
  * "out" flag (presumably because an index fell outside the image -- see
  * the *_bc helpers); otherwise returns the interpolated value.
  *
**/
static float bicubic_interpolation_at(
	const float *input, //image to be interpolated
	const float  uu,    //x component of the vector field
	const float  vv,    //y component of the vector field
	const int    nx,    //image width
	const int    ny,    //image height
	bool         border_out //if true, return zero outside the region
)
{
	const int boundary_condition = DEFAULT_BICUBIC_BOUNDARY_CONDITION;

	//direction in which the neighbourhood extends along each axis
	const int sx = (uu < 0) ? -1: 1;
	const int sy = (vv < 0) ? -1: 1;

	//x/y: base sample, mx/my: one step back,
	//dx/dy: one step forward, ddx/ddy: two steps forward
	int x, y, mx, my, dx, dy, ddx, ddy;
	bool out[1] = {false};

	//apply the corresponding boundary conditions
	switch(boundary_condition)
	{
	case BICUBIC_BOUNDARY_PERIODIC:
		x   = periodic_bc((int) uu, nx, out);
		y   = periodic_bc((int) vv, ny, out);
		mx  = periodic_bc((int) uu - sx, nx, out);
		my  = periodic_bc((int) vv - sy, ny, out);
		dx  = periodic_bc((int) uu + sx, nx, out);
		dy  = periodic_bc((int) vv + sy, ny, out);
		ddx = periodic_bc((int) uu + 2*sx, nx, out);
		ddy = periodic_bc((int) vv + 2*sy, ny, out);
		break;

	case BICUBIC_BOUNDARY_SYMMETRIC:
		x   = symmetric_bc((int) uu, nx, out);
		y   = symmetric_bc((int) vv, ny, out);
		mx  = symmetric_bc((int) uu - sx, nx, out);
		my  = symmetric_bc((int) vv - sy, ny, out);
		dx  = symmetric_bc((int) uu + sx, nx, out);
		dy  = symmetric_bc((int) vv + sy, ny, out);
		ddx = symmetric_bc((int) uu + 2*sx, nx, out);
		ddy = symmetric_bc((int) vv + 2*sy, ny, out);
		break;

	//unknown settings fall back to Neumann so that the indices below
	//are always initialised
	case BICUBIC_BOUNDARY_NEUMANN:
	default:
		x   = neumann_bc((int) uu, nx, out);
		y   = neumann_bc((int) vv, ny, out);
		mx  = neumann_bc((int) uu - sx, nx, out);
		my  = neumann_bc((int) vv - sy, ny, out);
		dx  = neumann_bc((int) uu + sx, nx, out);
		dy  = neumann_bc((int) vv + sy, ny, out);
		ddx = neumann_bc((int) uu + 2*sx, nx, out);
		ddy = neumann_bc((int) vv + 2*sy, ny, out);
		break;
	}

	if(*out && border_out)
		return 0.0;

	else
	{
		//obtain the interpolation points of the image
		//(pRC: R selects the row my/y/dy/ddy, C the column mx/x/dx/ddx)
		const float p11 = input[mx  + nx * my];
		const float p12 = input[x   + nx * my];
		const float p13 = input[dx  + nx * my];
		const float p14 = input[ddx + nx * my];

		const float p21 = input[mx  + nx * y];
		const float p22 = input[x   + nx * y];
		const float p23 = input[dx  + nx * y];
		const float p24 = input[ddx + nx * y];

		const float p31 = input[mx  + nx * dy];
		const float p32 = input[x   + nx * dy];
		const float p33 = input[dx  + nx * dy];
		const float p34 = input[ddx + nx * dy];

		const float p41 = input[mx  + nx * ddy];
		const float p42 = input[x   + nx * ddy];
		const float p43 = input[dx  + nx * ddy];
		const float p44 = input[ddx + nx * ddy];

		//create array: first index runs over columns (x),
		//second index over rows (y)
		double pol[4][4] = {
			{p11, p21, p31, p41},
			{p12, p22, p32, p42},
			{p13, p23, p33, p43},
			{p14, p24, p34, p44}
		};

		//return interpolation
		return bicubic_interpolation_cell(pol, uu-x, vv-y);
	}
}