// Reads `count` doubles (stored little-endian in the stream) from `fin` into a
// freshly malloc'ed array that is handed back through `*data`.
// When `bigEndianMachine` is non-zero every value is byte-swapped via little2big.
// Returns 0 on success. Returns non-zero on a negative count, an allocation
// failure or a short read; in that case nothing is leaked and `*data` is NULL.
static int StVKReducedInternalForces_ReadDoubles(FILE * fin, int count, int bigEndianMachine, double ** data)
{
  *data = NULL;

  if (count < 0)
    return 1;

  double * values = (double*) malloc (sizeof(double) * count);
  if ((values == NULL) && (count > 0))
    return 1;

  // nothing to read (and possibly a NULL buffer) when count == 0
  if ((count > 0) && ((int)(fread(values, sizeof(double), count, fin)) < count))
  {
    free(values);
    return 1;
  }

  if (bigEndianMachine)
  {
    double swapped;
    for(int i=0; i<count; i++)
    {
      little2big(&values[i], &swapped, sizeof(double));
      values[i] = swapped;
    }
  }

  *data = values;
  return 0;
}

// Loads the linear, quadratic and cubic coefficient arrays from a binary stream.
//
// Stream layout, as consumed here: four ints (r, linearSize, quadraticSize,
// cubicSize) followed by r*linearSize, r*quadraticSize and r*cubicSize doubles.
// Values are byte-swapped when `bigEndianMachine` is non-zero.
//
// fin:              stream to read from
// rTarget:          if >= 0, only the coefficients involving the first rTarget
//                   modes are kept; if negative, everything in the stream is kept
// bigEndianMachine: non-zero if the values must be converted with little2big
//
// Returns 0 on success.
// Throws (int) 1 if the stream is too short, its header is invalid or memory
// cannot be allocated; throws (int) 2 if rTarget exceeds the r in the stream.
int StVKReducedInternalForces::LoadFromStream(FILE * fin, int rTarget, int bigEndianMachine) 
{
  if (verbose)
    printf("Loading polynomials assuming little endian machine: %s.", (!bigEndianMachine) ? "TRUE" : "FALSE");

  int header[4];

  if ((int)(fread(header, sizeof(int), 4, fin)) < 4)
  {
    printf("Error: couldn't read from input cubic polynomial file.\n");
    throw 1;
  }

  if (bigEndianMachine)
  {
    for(int i=0; i<4; i++)
    {
      int swapped;
      little2big(&header[i], &swapped, sizeof(int));
      header[i] = swapped;
    }
  }

  r = header[0];

  if (rTarget > r)
  {
    printf("Error: the input cubic polynomial file has r=%d, but you requested %d > %d.\n", r, rTarget, r);
    throw 2;
  }

  // first read in the coefficients as if all modes requested
  if (verbose)
    printf(" r=%d\n", r);
  
  r2 = r * r;

  linearSize = header[1];
  quadraticSize = header[2];
  cubicSize = header[3];

  // a corrupt header must not be turned into bogus allocation / read sizes
  if ((r < 0) || (linearSize < 0) || (quadraticSize < 0) || (cubicSize < 0))
  {
    printf("Error: invalid header in input cubic polynomial file.\n");
    throw 1;
  }

  linearCoef_ = NULL;
  quadraticCoef_ = NULL;
  cubicCoef_ = NULL;

  // short-circuit evaluation keeps the reads in stream order and stops at the first failure
  if (StVKReducedInternalForces_ReadDoubles(fin, r * linearSize, bigEndianMachine, &linearCoef_) ||
      StVKReducedInternalForces_ReadDoubles(fin, r * quadraticSize, bigEndianMachine, &quadraticCoef_) ||
      StVKReducedInternalForces_ReadDoubles(fin, r * cubicSize, bigEndianMachine, &cubicCoef_))
  {
    // release whatever was read before the failure; the failed array is already NULL
    free(linearCoef_);
    free(quadraticCoef_);
    free(cubicCoef_);
    linearCoef_ = NULL;
    quadraticCoef_ = NULL;
    cubicCoef_ = NULL;

    printf("Error: couldn't read from input cubic polynomial file.\n");
    throw 1;
  }

  if (rTarget >= 0)
  {
    // keep only the coefficients that involve the first rTarget modes
    int linearSizeTarget, quadraticSizeTarget, cubicSizeTarget;
    GetSizes(rTarget, &linearSizeTarget, &quadraticSizeTarget, &cubicSizeTarget);

    double * linearCoefTemp_ = 
      (double*) malloc (sizeof(double) * rTarget * linearSizeTarget);

    double * quadraticCoefTemp_ = 
      (double*) malloc (sizeof(double) * rTarget * quadraticSizeTarget);

    double * cubicCoefTemp_ = 
      (double*) malloc (sizeof(double) * rTarget * cubicSizeTarget);

    // The *CoefPos helpers are evaluated once after SetSizes(rTarget) (position
    // in the truncated array) and once after SetSizes(r) (position in the array
    // that was read); `r` still holds the value from the stream at this point.
    for(int output=0; output<rTarget; output++)
      for(int i=0; i<rTarget; i++)
      {
        SetSizes(rTarget);
        int positionTarget = linearCoefPos(output, i); 
        SetSizes(r);
        int position = linearCoefPos(output, i); 
        linearCoefTemp_[positionTarget] = linearCoef_[position];
      }
 
    for(int output=0; output<rTarget; output++)
      for(int i=0; i<rTarget; i++)
        for(int j=i; j<rTarget; j++)
        {
          SetSizes(rTarget);
          int positionTarget = quadraticCoefPos(output, i, j); 
          SetSizes(r);
          int position = quadraticCoefPos(output, i, j); 
          quadraticCoefTemp_[positionTarget] = quadraticCoef_[position];
        }

    for(int output=0; output<rTarget; output++)
      for(int i=0; i<rTarget; i++)
        for(int j=i; j<rTarget; j++)
          for(int k=j; k<rTarget; k++)
          {
            SetSizes(rTarget);
            int positionTarget = cubicCoefPos(output, i, j, k); 
            SetSizes(r);
            int position = cubicCoefPos(output, i, j, k); 
            cubicCoefTemp_[positionTarget] = cubicCoef_[position];
          }

    r = rTarget;
    r2 = r * r; // keep r2 consistent with the truncated r
    SetSizes(r);

    free(linearCoef_);
    free(quadraticCoef_);
    free(cubicCoef_);

    linearCoef_ = linearCoefTemp_;
    quadraticCoef_ = quadraticCoefTemp_;
    cubicCoef_ = cubicCoefTemp_;
  }

  // an object loaded from a stream carries no mesh / basis data
  // NOTE(review): lambdaLame is read next to muLame in ProcessElements but is
  // not reset here -- confirm it is initialized elsewhere before it is freed or used.
  volumetricMesh = NULL;
  U = NULL;
  reducedGravityForce = NULL;
  precomputedIntegrals = NULL;
  numElementVertices = 0;
  muLame = NULL;

  InitBuffers();

  addGravity = false;

  useSingleThread = 0;
  shallowCopy = 0;
  g=9.81; 

  return 0;
}
// Builds the reduced stiffness matrix polynomials by differentiating the
// reduced internal force polynomials held by `stVKReducedInternalForces`.
// For every stored matrix entry (output, i) with i >= output it fills in:
//   - a free term, copied from the linear force coefficient,
//   - r linear coefficients, derived from the quadratic force coefficients,
//   - quadratic coefficients (one per pair j <= k), derived from the cubic ones.
// Progress is printed to stdout when `verbose` is non-zero.
StVKReducedStiffnessMatrix::StVKReducedStiffnessMatrix(StVKReducedInternalForces * stVKReducedInternalForces, int verbose) : shallowCopy(0)
{
  r = stVKReducedInternalForces->Getr();
  r2 = r*r;

  if (verbose)
  {
    printf("Building the reduced stiffness matrix quadratic polynomials... r is %d\n",r);
    printf("Building free terms:");
  }

  // ---- free terms: one coefficient per stored entry (r*(r+1)/2 entries) ----
  freeCoef_ = (double*) malloc (sizeof(double) * r * (r+1) / 2);

  for(int row=0; row<r; row++)
  {
    if (verbose)
      printf(" %d",row);

    // derivative of the linear force term is the linear coefficient itself
    for(int col=row; col<r; col++)
      freeCoef_[freeCoefPos(row,col)] = stVKReducedInternalForces->linearCoef(row,col);
  }

  if (verbose)
    printf("\nBuilding linear terms:");

  // ---- linear terms: linearSize coefficients per stored entry ----
  linearSize = StVKReducedInternalForces::GetLinearSize(r);
  linearCoef_ = (double*) malloc (sizeof(double) * r * (r+1) / 2 * linearSize);

  for(int row=0; row<r; row++)
  {
    if (verbose)
      printf(" %d",row);

    for(int col=row; col<r; col++)
    {
      for(int a=0; a<r; a++)
      {
        // the force polynomial is indexed by the ascending pair (lo,hi)
        const int lo = (a < col) ? a : col;
        const int hi = (a < col) ? col : a;

        double value = stVKReducedInternalForces->quadraticCoef(row,lo,hi);

        // d(q^2)/dq contributes a factor of two
        if (col == a)
          value *= 2;

        linearCoef_[linearCoefPos(row,col,a)] = value;
      }
    }
  }

  if (verbose)
    printf("\nBuilding quadratic terms:");

  // ---- quadratic terms: quadraticSize coefficients per stored entry ----
  quadraticSize = StVKReducedInternalForces::GetQuadraticSize(r);
  quadraticCoef_ = (double*) malloc (sizeof(double) * r * (r+1) / 2 * quadraticSize);

  for(int row=0; row<r; row++)
  {
    if (verbose)
      printf(" %d",row);

    for(int col=row; col<r; col++)
    {
      for(int a=0; a<r; a++)
      {
        for(int b=a; b<r; b++)
        {
          // (s0,s1,s2) becomes (col,a,b) sorted in ascending order
          int s0 = col;
          int s1 = a;
          int s2 = b;

          int buffer; // scratch variable referenced by the SWAP macro
          #define SWAP(i,j)\
             buffer = i;\
             i = j;\
             j = buffer;

          // three compare-and-swap passes sort the triple
          if (s1 < s0) 
          {
            SWAP(s0,s1);
          }

          if (s2 < s1) 
          {
            SWAP(s1,s2);
          }

          if (s1 < s0)
          {
            SWAP(s0,s1);
          }

          double value = stVKReducedInternalForces->cubicCoef(row,s0,s1,s2);

          // power of q_col in the monomial: 3 for q^3, 2 for q^2 * q_other, else 1
          const int power = 1 + ((col == a) ? 1 : 0) + ((col == b) ? 1 : 0);
          if (power > 1)
            value *= power;

          quadraticCoef_[quadraticCoefPos(row,col,a,b)] = value;
        }
      }
    }
  }

  if (verbose)
    printf("\n");
  
  InitBuffers();
}
void StVKReducedInternalForces::ProcessElements(int startElement, int endElement, double ** target)
{
  double * linearCoef_ = this->linearCoef_;
  double * quadraticCoef_ = this->quadraticCoef_;
  double * cubicCoef_ = this->cubicCoef_;

  if (target != NULL)
  {
    linearCoef_ = target[0];
    quadraticCoef_ = target[1];
    cubicCoef_ = target[2];
  }

  if (verbose >= 1)
    printf("Generating element data: element %d to %d...\n", startElement, endElement-1);

  int numVertices_ = volumetricMesh->getNumVertices();

  // make auxiliary vectors
  double * qiqjBuffer = (double*) calloc(r2,sizeof(double));
  double * qkBuffer = (double*) calloc(r2,sizeof(double));
  double * coefs = (double*) calloc(r*r*r*r,sizeof(double));

  void * elIter;
  precomputedIntegrals->AllocateElementIterator(&elIter);

  // Linear terms
  //if (verbose >= 1)
    //printf("Building linear terms:");

  for(int el=startElement; el < endElement; el++)
  {
    precomputedIntegrals->PrepareElement(el, elIter);

    if (verbose >= 1)
    {
      if (el % 100 == 1)
        printf("%d ",el); fflush(NULL);
    }

    double lambda = lambdaLame[el];
    double mu = muLame[el];

    for(int i=0; i<r; i++)
    {
      for (int c=0; c<numElementVertices; c++)
      {
        Vec3d force(0.0,0.0,0.0);

        int vc = volumetricMesh->getVertexIndex(el, c);
        for (int a=0; a<numElementVertices; a++)
        {
          int va = volumetricMesh->getVertexIndex(el, a);

          Vec3d ua(U[ELT(3*numVertices_,3*va+0,i)],
                   U[ELT(3*numVertices_,3*va+1,i)],
                   U[ELT(3*numVertices_,3*va+2,i)]);

          force += lambda * (precomputedIntegrals->A(elIter,c,a) * ua) +
                   (mu * precomputedIntegrals->B(elIter,a,c)) * ua +
                   mu * (precomputedIntegrals->A(elIter,a,c) * ua);
        }

        // multiply Uc^T * force
        for(int output=0; output<r; output++)
        {
          linearCoef_[linearCoefPos(output, i)] +=
                  U[ELT(3*numVertices_,3*vc+0,output)] * force[0] +
                  U[ELT(3*numVertices_,3*vc+1,output)] * force[1] +
                  U[ELT(3*numVertices_,3*vc+2,output)] * force[2];
        }
      }
    }
  }

  // Quadratic terms
  //if (verbose >= 1)
    //printf("\nBuilding quadratic terms:");

  double ** forceBuffer = (double**) malloc (sizeof(double*) * numElementVertices);
  for(int c=0; c<numElementVertices; c++)
    forceBuffer[c] = (double*) calloc (3*r2,sizeof(double));

  memset(quadraticCoef_, 0, sizeof(double) * r * quadraticSize);

  int * vertices = (int*) malloc (sizeof(int) * numElementVertices);

  for(int el=startElement; el < endElement; el++)
  {
    precomputedIntegrals->PrepareElement(el, elIter);

    if (verbose >= 1)
    {
      if (el % 100 == 1)
        printf("%d ",el); fflush(NULL);
    }

    double lambda = lambdaLame[el];
    double mu = muLame[el];

    for(int ver=0; ver<numElementVertices ;ver++)
      vertices[ver] = volumetricMesh->getVertexIndex(el, ver);

    for(int c=0; c<numElementVertices; c++)
      memset(forceBuffer[c],0,sizeof(double)*3*r2);

    for(int a=0; a<numElementVertices; a++)
    {
      for(int b=0; b<numElementVertices; b++)
      {
        // compute ua*ub for all possible i,j
        cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                      r, r, 3,
                      1.0,
                      &U[ELT(3*numVertices_,3*vertices[a],0)], 3*numVertices_,
                      &U[ELT(3*numVertices_,3*vertices[b],0)], 3*numVertices_,
                      0.0,
                      qiqjBuffer, r);

        for(int c=0; c<numElementVertices; c++)
        {
          Vec3d vec1 = 0.5 * lambda * precomputedIntegrals->C(elIter,c,a,b) +
                       mu * precomputedIntegrals->C(elIter,a,b,c);

          Vec3d C = lambda * precomputedIntegrals->C(elIter,a,b,c) +
                    mu * (precomputedIntegrals->C(elIter,c,a,b) + precomputedIntegrals->C(elIter,b,a,c)); 

          for(int i=0; i<r; i++)
          {
            double * posa = &(U[ELT(3*numVertices_,3*vertices[a]+0,i)]);
            double Cdotua = C[0] * posa[0] + C[1] * posa[1] + C[2] * posa[2];

            for(int j=0; j<r; j++)
            {
              double buffer = qiqjBuffer[ELT(r,i,j)];
              double * posb = &(U[ELT(3*numVertices_,3*vertices[b]+0,j)]);

              int index = ELT(3,0,ELT(r,i,j));

              forceBuffer[c][index+0] += buffer * vec1[0] + Cdotua * posb[0];
              forceBuffer[c][index+1] += buffer * vec1[1] + Cdotua * posb[1];
              forceBuffer[c][index+2] += buffer * vec1[2] + Cdotua * posb[2];
            }
          }
        } // end c
      } // end b
    } // end a

    // generate unpacked coefficients for this element
    memset(coefs,0,sizeof(double)*r*r*r);
    for(int c=0; c<numElementVertices; c++)
    {
      // multiply Uc^T * forcesBuffer[c]
      cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                    r, r2, 3,
                    1.0,
                    &U[ELT(3*numVertices_,3*vertices[c],0)], 3*numVertices_,
                    forceBuffer[c], 3,
                    1.0,
                    coefs, r);
    }

    // pack and add
    for(int output=0; output<r; output++)
    {
      for(int i=0; i<r; i++)
        for(int j=0; j<r; j++)
        {
          int i1 = i;
          int j1 = j;

          if (j < i)
          {
            i1 = j;
            j1 = i;
          }

          quadraticCoef_[quadraticCoefPos(output,i1, j1)] += coefs[ELT(r,output,ELT(r,i,j))];
        }
    }
  } // end el

  free(vertices);

  for(int c=0; c<numElementVertices; c++)
    free(forceBuffer[c]);
  free(forceBuffer);

  // cubic terms
  //if (verbose >= 1)
    //printf("\nBuilding cubic terms:\n");

  memset(coefs,0,sizeof(double)*r*r*r*r);

  for(int el=startElement; el < endElement; el++)
  {
    precomputedIntegrals->PrepareElement(el, elIter);

    if (verbose >= 1)
    {
      if ((el % 50 == 1) || ((r > 30) && (el % 25 == 1)))
        printf("%d ",el); fflush(NULL);
    }

    double lambda = lambdaLame[el];
    double mu = muLame[el];

    for (int a=0; a<numElementVertices; a++)
    {
      int va = volumetricMesh->getVertexIndex(el, a);
      for(int b=0; b<numElementVertices; b++)
      {
        int vb = volumetricMesh->getVertexIndex(el, b);

        // fill up the buffers
        cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                         r, r, 3,
                         1.0,
                         &U[ELT(3*numVertices_,3*va,0)], 3*numVertices_,
                         &U[ELT(3*numVertices_,3*vb,0)], 3*numVertices_,
                         0.0,
                         qiqjBuffer, r);

        for(int i=0; i<r2; i++)
          qkBuffer[i] = 0;

        for(int c=0; c<numElementVertices; c++)
        {
          int vc = volumetricMesh->getVertexIndex(el, c);
          for(int d=0; d<numElementVertices; d++)
          {
            int vd = volumetricMesh->getVertexIndex(el, d);

            double factor = 0.5 * lambda * precomputedIntegrals->D(elIter,a,b,c,d) +
                            mu * precomputedIntegrals->D(elIter,a,c,b,d);
            cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                            r, r, 3,
                            factor,
                            &U[ELT(3*numVertices_,3*vc,0)], 3*numVertices_,
                            &U[ELT(3*numVertices_,3*vd,0)], 3*numVertices_,
                            1.0,
                            qkBuffer, r);
          }
        }

        // multiply qiqjBuffer * qkBuffer^T (tensor product)
        // both vectors are r^2 vectors

        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,
                          r2, r2, 1,
                          1.0,
                          qiqjBuffer, r2,
                          qkBuffer, r2,
                          1.0,
                          coefs, r2);

      } // over b
    } // over a
  }

  for(int i=0; i < r*cubicSize; i++)
    cubicCoef_[i] = 0.0;

  // unpack
  for(int i=0; i<r; i++)
    for(int j=0; j<r; j++)
      for(int k=0; k<r; k++)
        for(int l=0; l<r; l++)
        {
          // sort the indices
          int i1=i;
          int j1=j;
          int k1=k;
          tripleSort(i1,j1,k1);

          //int pos = cubicCoefPos(l, i1, j1, k1);
          //int pos1 = ELT(r*r,ELT(r,i,j),ELT(r,l,k));
          cubicCoef_[cubicCoefPos(l, i1, j1, k1)] += coefs[ELT(r*r,ELT(r,i,j),ELT(r,l,k))];
        }

  free(qiqjBuffer);
  free(qkBuffer);
  free(coefs);

  precomputedIntegrals->ReleaseElementIterator(elIter);
}