// Computes first (and optionally second) time derivatives of the polar
// decomposition factors Q and S, given the derivatives of the decomposed matrix
// (see Barbic and Zhao, SIGGRAPH 2011).
//
// All matrices are 3x3, passed as arrays of 9 doubles; omega and omegaDot are 3-vectors.
//
// Inputs:
//   M        : not read by this routine
//   Q, S     : the polar decomposition factors
//   MDot     : first derivative of the decomposed matrix
//   MDotDot  : second derivative of the decomposed matrix; may be NULL, in which case
//              no second-order quantities are computed
// Outputs (always written, must not be NULL):
//   omega    : solution of G * omega = 2 * skew(Q^T * MDot), with G = (tr(S)I - S) * Q^T
//   QDot     : tilde(omega) * Q
//   SDot     : Q^T * (MDot - QDot * S)
// Optional outputs (each may be NULL; only produced when MDotDot != NULL):
//   omegaDot : derivative of omega
//   QDotDot  : (tilde(omegaDot) + tilde(omega)^2) * Q
//              QDotDot may be requested without omegaDot; omegaDot is then computed
//              into a local scratch vector.
void PolarDecompositionGradient::Compute(const double * M, const double * Q, const double * S, const double * MDot, double * omega, double * QDot, double * SDot, const double * MDotDot, double * omegaDot, double * QDotDot)
{
  // compute omega = G^{-1} (2 * skew(Q^T * MDot)), where G = (tr(S)I - S) * Q^T
  // (see Barbic and Zhao, SIGGRAPH 2011)

  // first, construct G, and invert it

  // traceMinusS = tr(S)I - S
  double traceMinusS[9];
  for(int i=0; i<9; i++)
    traceMinusS[i] = -S[i];
  const double traceS = S[0] + S[4] + S[8];
  traceMinusS[0] += traceS;
  traceMinusS[4] += traceS;
  traceMinusS[8] += traceS;

  double G[9]; // G = (tr(S)I - S) * Q^T
  MATRIX_MULTIPLY3X3ABT(traceMinusS, Q, G);
  Mat3d GM(G);
  Mat3d GInvM = inv(GM);
  double GInv[9];
  GInvM.convertToArray(GInv);

  // omega = GInv * (2 * skew(Q^T * MDot))
  double QtMDot[9];
  MATRIX_MULTIPLY3X3ATB(Q, MDot, QtMDot);
  double rhs[3];
  SKEW_PART(QtMDot, rhs);
  VECTOR_SCALE3(rhs, 2.0);
  MATRIX_VECTOR_MULTIPLY3X3(GInv, rhs, omega);

  // compute QDot = tilde(omega) * Q
  double omegaTilde[9];
  SKEW_MATRIX(omega, omegaTilde);
  MATRIX_MULTIPLY3X3(omegaTilde, Q, QDot);

  // compute SDot = Q^T * (MDot - QDot * S)
  // MDotMinusQDotS = MDot - QDot * S
  double MDotMinusQDotS[9];
  MATRIX_MULTIPLY3X3(QDot, S, MDotMinusQDotS);
  for(int i=0; i<9; i++)
    MDotMinusQDotS[i] = MDot[i] - MDotMinusQDotS[i];
  // SDot = Q^T * MDotMinusQDotS
  MATRIX_MULTIPLY3X3ATB(Q, MDotMinusQDotS, SDot);

  // second derivatives are optional: they need MDotDot and at least one requested output
  if ((MDotDot == NULL) || ((omegaDot == NULL) && (QDotDot == NULL)))
    return;

  // QDotDot depends on omegaDot; when the caller did not ask for omegaDot,
  // compute it into a local buffer so that QDotDot can still be produced
  double omegaDotScratch[3];
  double * omegaDotOut = (omegaDot != NULL) ? omegaDot : omegaDotScratch;

  // compute omegaDot = GInv * ( 2 skew(Q^T (MDotDot - omegaTilde * MDot)) - (tr(SDot) I - SDot) * Q^T * omega )
  // (see Barbic and Zhao, SIGGRAPH 2011)

  // MDotDotCorrected = MDotDot - omegaTilde * MDot
  double MDotDotCorrected[9];
  MATRIX_MULTIPLY3X3(omegaTilde, MDot, MDotDotCorrected);
  for(int i=0; i<9; i++)
    MDotDotCorrected[i] = MDotDot[i] - MDotDotCorrected[i];

  // omegaDotRhs = 2 * skew(Q^T * MDotDotCorrected)
  double QtMDotDotCorrected[9];
  MATRIX_MULTIPLY3X3ATB(Q, MDotDotCorrected, QtMDotDotCorrected);

  double omegaDotRhs[3];
  SKEW_PART(QtMDotDotCorrected, omegaDotRhs);
  VECTOR_SCALE3(omegaDotRhs, 2.0);

  // traceMinusSDot = tr(SDot)I - SDot
  double traceMinusSDot[9];
  for(int i=0; i<9; i++)
    traceMinusSDot[i] = -SDot[i];
  const double traceSDot = SDot[0] + SDot[4] + SDot[8];
  traceMinusSDot[0] += traceSDot;
  traceMinusSDot[4] += traceSDot;
  traceMinusSDot[8] += traceSDot;

  // SDotTermOmega = (traceMinusSDot * Q^T) * omega
  double SDotTerm[9];
  MATRIX_MULTIPLY3X3ABT(traceMinusSDot, Q, SDotTerm);
  double SDotTermOmega[3];
  MATRIX_VECTOR_MULTIPLY3X3(SDotTerm, omega, SDotTermOmega);

  // omegaDotRhs -= SDotTermOmega
  VECTOR_SUBTRACTEQUAL3(omegaDotRhs, SDotTermOmega);

  // omegaDot = GInv * omegaDotRhs
  MATRIX_VECTOR_MULTIPLY3X3(GInv, omegaDotRhs, omegaDotOut);

  if (QDotDot != NULL)
  {
    // QDotDot = (tilde(omegaDot) + tilde(omega) * tilde(omega)) * Q
    double angularTerm[9];
    SKEW_MATRIX(omegaDotOut, angularTerm);
    double omegaTildeSquared[9];
    MATRIX_MULTIPLY3X3(omegaTilde, omegaTilde, omegaTildeSquared);
    for(int i=0; i<9; i++)
      angularTerm[i] += omegaTildeSquared[i];
    MATRIX_MULTIPLY3X3(angularTerm, Q, QDotDot);
  }
}
// Computes the internal forces and/or the stiffness matrix contributed by the
// tets with indices elementLo <= el < elementHi.
//
//   u               : vertex displacements, 3 doubles per vertex (read only)
//   f               : if not NULL, all 3 * numVertices entries are zeroed and the
//                     element forces are then accumulated into it
//   stiffnessMatrix : if not NULL, it is reset to zero and the 12x12 element
//                     stiffness matrices are then accumulated into it
//   warp            : 0 (or negative): no warping; element force is K u with the
//                       undeformed element stiffness matrix K
//                     > 0: each element is warped by the rotation R obtained from the
//                       polar decomposition of F = P * Inverse(M)
//                     2: in addition, the terms involving the derivative of R with
//                       respect to the vertex positions are added, giving the exact
//                       stiffness matrix
//   elementLo, elementHi : half-open range of elements to process
//
// Work that only feeds an output that was not requested (f == NULL or
// stiffnessMatrix == NULL) is skipped; the values written to the requested
// outputs are not affected by this.
void CorotationalLinearFEM::ComputeForceAndStiffnessMatrixOfSubmesh(double * u, double * f, SparseMatrix * stiffnessMatrix, int warp, int elementLo, int elementHi)
{
  // clear f to zero
  if (f != NULL)
    memset(f, 0, sizeof(double) * 3 * numVertices);

  // clear stiffness matrix to zero
  if (stiffnessMatrix != NULL)
    stiffnessMatrix->ResetToZero();

  for (int el=elementLo; el < elementHi; el++)
  {
    int vtxIndex[4];
    for (int vtx=0; vtx<4; vtx++)
      vtxIndex[vtx] = tetMesh->getVertexIndex(el, vtx);

    double KElement[144]; // element stiffness matrix, to be computed below; row-major

    if (warp > 0)
    {
      double P[16]; // the current world-coordinate positions (row-major)
      /*
         P = [ v0   v1   v2   v3 ]
             [  1    1    1    1 ]
      */
      // rows 1,2,3
      for(int i=0; i<3; i++)
        for(int j=0; j<4; j++)
          P[4 * i + j] = undeformedPositions[3 * vtxIndex[j] + i] + u[3 * vtxIndex[j] + i];
      // row 4
      for(int j=0; j<4; j++)
        P[12 + j] = 1;

      // F = P * Inverse(M)
      double F[9]; // upper-left 3x3 block
      for(int i=0; i<3; i++)
        for(int j=0; j<3; j++)
        {
          F[3 * i + j] = 0;
          for(int k=0; k<4; k++)
            F[3 * i + j] += P[4 * i + k] * MInverse[el][4 * k + j];
        }

      double R[9]; // rotation (row-major)
      double S[9]; // symmetric (row-major)
      double tolerance = 1E-6;
      int forceRotation = 1;
      PolarDecomposition::Compute(F, R, S, tolerance, forceRotation);

      // RK = R * K
      // KElement = R * K * R^T
      double RK[144]; // row-major
      WarpMatrix(KElementUndeformed[el], R, RK, KElement);

      // the element force is only needed when the caller requested f
      if (f != NULL)
      {
        // fElement = RK (R^T x - x0) = KElement * x - RK * x0
        double fElement[12];
        for(int i=0; i<12; i++)
        {
          fElement[i] = 0;
          for(int j=0; j<4; j++)
            for(int l=0; l<3; l++)
              fElement[i] += KElement[12 * i + 3 * j + l] * P[4 * l + j] - RK[12 * i + 3 * j + l] * undeformedPositions[3 * vtxIndex[j] + l];
        }

        // add fElement into the global f
        for(int j=0; j<4; j++)
          for(int l=0; l<3; l++)
            f[3 * vtxIndex[j] + l] += fElement[3 * j + l];
      }

      // compute exact stiffness matrix
      // (the correction below only modifies KElement, which from here on is only
      // used for the stiffness matrix assembly, so skip it if no matrix was requested)
      if ((warp == 2) && (stiffnessMatrix != NULL))
      {
        // compute G = (tr(S) I - S) R^T
        double G[9];
        double tr = S[0] + S[4] + S[8];
        double trSMinusS[9]; // tr(S) I - S
        for(int i=0; i<9; i++)
          trSMinusS[i] = -S[i];
        trSMinusS[0] += tr;
        trSMinusS[4] += tr;
        trSMinusS[8] += tr;
        // G = trSMinusS * R^T
        MATRIX_MULTIPLY3X3ABT(trSMinusS, R, G);

        double invG[9]; // invG = G^{-1}
        inverse3x3(G, invG);

        double rhs[27]; // 3 x 9 matrix (column-major)
        for(int i=0; i<3; i++)
          for(int j=0; j<3; j++)
          {
            double rowInColumn[9];
            for(int k=0; k<9; k++)
              rowInColumn[k] = 0.0;
            // copy i-th row of R into column j of rowInColumn
            for(int k=0; k<3; k++)
              rowInColumn[3 * k + j] = R[3 * i + k];
            // extract the skew-symmetric part
            SKEW_PART(rowInColumn, &rhs[3 * (3 * i + j)]);
          }
        // must undo division by 2 from inside the SKEW_PART macro
        for(int i=0; i<27; i++)
          rhs[i] *= 2.0;

        // solve G * omega = rhs
        double omega[27]; // column-major
        for(int i=0; i<9; i++)
        {
          MATRIX_VECTOR_MULTIPLY3X3(invG, &rhs[3 * i], &omega[3 * i]);
        }

        double dRdF[81]; // each column is skew(omega) * R ; column-major
        for(int i=0; i<9; i++)
        {
          double skew[9];
          SKEW_MATRIX(&omega[3 * i], skew);
          MATRIX_MULTIPLY3X3(skew, R, &dRdF[9 * i]);
        }

        double B[3][3][9];
        // re-arrange dRdF into B, for easier dRdF * dFdx multiplication (to exploit sparsity of dFdx)
        for(int i=0; i<3; i++)
          for(int j=0; j<3; j++)
            for(int k=0; k<3; k++)
              for(int l=0; l<3; l++)
              {
                int row = 3 * i + k;
                int column = 3 * j + l;
                B[i][j][3 * k + l] = dRdF[9 * column + row];
              }

        // four pointers to a 3-vector
        double * minv[4] = { &MInverse[el][0], &MInverse[el][4], &MInverse[el][8], &MInverse[el][12] }; // the four rows of MInverse (last column ignored)

        double dRdx[108]; // derivative of the element rotation matrix with respect to the positions of the tet vertices; column-major
        for(int k=0; k<4; k++)
          for(int i=0; i<3; i++)
            for(int j=0; j<3; j++)
            {
              double product[3]; // B[i][j] * minv[k]
              MATRIX_VECTOR_MULTIPLY3X3(B[i][j], minv[k], product);
              int row = 3 * i;
              int column = 3 * k + j;
              VECTOR_SET3(&dRdx[9 * column + row], product);
            }

        // add contribution of dRdx to KElement

        // term 1: \hat{dR/dxl} K (R^T x - m)

        // compute K (R^T x - m)
        double tempVec[12]; // R^T x - m
        for(int vtx=0; vtx<4; vtx++)
        {
          double pos[3];
          for(int i=0; i<3; i++)
            pos[i] = P[4 * i + vtx];
          MATRIX_VECTOR_MULTIPLY3X3T(R, pos, &tempVec[3*vtx]);
          // subtract m
          for(int i=0; i<3; i++)
            tempVec[3*vtx+i] -= undeformedPositions[3 * vtxIndex[vtx] + i];
        }
        double a[12]; // a = K * tempVec
        for (int i=0; i<12; i++)
        {
          a[i] = 0.0;
          for (int j=0; j<12; j++)
            a[i] += KElementUndeformed[el][12 * i + j] * tempVec[j];
        }

        // add [\hat{dR/dxl} K (R^T x - m)]_l, l=1 to 12
        for(int column=0; column<12; column++)
        {
          double b[12]; // b = \hat{dR/dxl} * a
          for(int j=0; j<4; j++)
          {
            MATRIX_VECTOR_MULTIPLY3X3(&dRdx[9 * column], &a[3*j], &b[3*j]);
          }
          // add b to the given column of KElement
          for(int row=0; row<12; row++)
            KElement[12 * row + column] += b[row]; // KElement is row-major
        }

        // term 2: (R K \hat{dRdxl}^T)x

        // re-write positions into a
        for(int vtx=0; vtx<4; vtx++)
        {
          for(int i=0; i<3; i++)
            a[3 * vtx + i] = P[4 * i + vtx];
        }

        // compute [\hat{dRdxl}^T x)]_l, l=1 to 12
        for(int column=0; column<12; column++)
        {
          double b[12]; // b = \hat{dRdxl}^T * a
          for(int j=0; j<4; j++)
          {
            MATRIX_VECTOR_MULTIPLY3X3T(&dRdx[9 * column], &a[3*j], &b[3*j]);
          }

          // add RK * b to column of KElement
          int rowStart = 0;
          for (int row=0; row<12; row++)
          {
            double contrib = 0.0;
            for (int j=0; j<12; j++)
              contrib += RK[rowStart + j] * b[j];
            KElement[rowStart + column] += contrib;
            rowStart += 12;
          }
        }
      }
    }
    else
    {
      // no warp
      memcpy(KElement, KElementUndeformed[el], sizeof(double) * 144);

      // the element force is only needed when the caller requested f
      if (f != NULL)
      {
        // f = K u
        double fElement[12];
        for(int i=0; i<12; i++)
        {
          fElement[i] = 0;
          for(int j=0; j<4; j++)
          {
            fElement[i] +=
              KElement[12 * i + 3 * j + 0] * u[3 * vtxIndex[j] + 0] +
              KElement[12 * i + 3 * j + 1] * u[3 * vtxIndex[j] + 1] +
              KElement[12 * i + 3 * j + 2] * u[3 * vtxIndex[j] + 2];
          }
        }

        // add fElement into the global f
        for(int j=0; j<4; j++)
        {
          f[3 * vtxIndex[j] + 0] += fElement[3 * j + 0];
          f[3 * vtxIndex[j] + 1] += fElement[3 * j + 1];
          f[3 * vtxIndex[j] + 2] += fElement[3 * j + 2];
        }
      }
    }

    if (stiffnessMatrix != NULL)
    {
      int * rowIndex = rowIndices[el];
      int * columnIndex = columnIndices[el];

      // add KElement to the global stiffness matrix
      // (3x3 block (i,j) of KElement goes to block row rowIndex[i], block column columnIndex[4 * i + j])
      for (int i=0; i<4; i++)
        for (int j=0; j<4; j++)
          for(int k=0; k<3; k++)
            for(int l=0; l<3; l++)
              stiffnessMatrix->AddEntry(3 * rowIndex[i] + k, 3 * columnIndex[4 * i + j] + l, KElement[12 * (3 * i + k) + 3 * j + l]);
    }
  }
}