Exemple #1
0
/*
 * Driver: fills two NB x NB grids of block pointers with genmat(), runs
 * lu_serial() on the first and lu_dependencies() on the second, then
 * compares the two results with check_matrices().
 *
 * Always returns 0; a mismatch is only reported on stdout.
 */
int main(int argc, char* argv[])
{
    double *serial_blocks[NB][NB];   /* input/output of the serial version     */
    double *tasked_blocks[NB][NB];   /* input/output of the dependency version */

    /* Generate both inputs with the same routine. */
    genmat(serial_blocks);
    genmat(tasked_blocks);

    /* Reference run. */
    lu_serial( serial_blocks );

    /*
     * Dependency-based run: a parallel region is opened and exactly one
     * thread of the team makes the call to lu_dependencies().
     */
#pragma omp parallel
    {
#pragma omp single
        {
            lu_dependencies( tasked_blocks );
        }
    }

    /* A non-zero result from check_matrices() means nothing to report. */
    if( check_matrices( serial_blocks, tasked_blocks ) )
        return 0;

    printf("Execution of dependencies version has failed\n");
    return 0;
}
Exemple #2
0
// S_op - assemble the block tensors [[A]] (LHS) and [[B]] (RHS) of a compact
// scheme, cut one block row/column off them and return
//     [S]      = [A2]^-1 [B2]
//     [f1_mod] = [A2]^-1 [f1]
// where [f1] is the first block column of [[B]] (last block row removed).
//
// Parameters
//   colength      : number of blocks per direction as seen by the caller; it
//                   is incremented by one internally because one block row and
//                   one block column are removed again before solving.
//   z             : size of every (diagonal) sub-block, must be >= 1.
//   NBSCHTYPE     : near-boundary scheme type for boundary row i (i = 0 ..
//                   len_NBSCHTYPE-1); each entry must be in 1..8.
//   len_NBSCHTYPE : number of entries in NBSCHTYPE, at most 4 (the table holds
//                   four boundary points). Values <= 0 mean "no boundary rows".
//   INTSCHTYPE    : interior scheme type, 0..4; also the half-width of the
//                   interior stencil.
//   S             : out - receives a freshly calloc'ed row-major
//                   (colength*z) x (colength*z) matrix (caller's colength).
//                   The caller must free() it.
//   f1_mod        : out - receives a freshly calloc'ed row-major
//                   (colength*z) x z matrix. The caller must free() it.
//   filter        : when non-zero, entries of [S] and [f1_mod] whose magnitude
//                   is <= 1.e-16 are set to exactly zero.
//
// Returns 0 on success and -1 when memory could not be allocated (in that case
// *S and *f1_mod are set to NULL). Invalid arguments are fatal: a message is
// printed and the process exits with EXIT_FAILURE.
//
// NOTE(review): lu_serial(A2, P, n) is presumed to factor A2 in place and to
// record pivoting in P, and solve_lu_serial(P, A2, x, b, n) to solve with that
// factorization - confirm against their definitions.
int S_op(int colength,int z, int *NBSCHTYPE, int len_NBSCHTYPE, int INTSCHTYPE, double **S, double **f1_mod, short filter)
{
  //increasing the length because we will cut one column and one row later
  colength++;

  //local indices
  int i, tt, j;
  int glb_i, glb_j;
  int nbs_width = 0;   // number of block columns touched by a near-boundary row
  int status = 0;
  double Theta1 = 0.;

  // COEFFICIENTS DATABASE (ACQUIRED USING MAPLE CODE)
  // -----------------------------------------------
  // NOTE: THESE ARE ONE BASED!!! (index 0 is never used)
  // -----------------------------------------------
  // Interior schemes: Lambdai[TYPE][j] is the jth coefficient Lambda of the
  // interior scheme of the given TYPE.
  double Lambdai[5][5] = {{0}};
  // Near boundary schemes: Lambdanbs[POINT][TYPE][j] is the jth coefficient
  // Lambda of the near boundary scheme of the given TYPE at point = POINT.
  double Lambdanbs[5][9][10] = {{{0}}};
  // The Theta coefficients are the same for every scheme type: -1 at the first
  // boundary point and -1/2 at every other point. They are applied directly
  // while [[A]] is built, so no lookup table is kept for them.

  //------------------------------------------------------------------------
  //                Argument validation (nothing is allocated yet).
  //------------------------------------------------------------------------
  // Checking if the depth of the column operand is correctly specified.
  if (z < 1)
    {
      printf("\nfatal : The depth of discrete space is invalid. Use z >= 1. exit ... \n");
      exit(EXIT_FAILURE);
    }

  // Evaluating the length of the column operand (already incremented).
  if (colength < 3 )
    {
      printf("\nfatal : The length of discrete space is not valid. Use colength >= 3 \n");
      exit(EXIT_FAILURE);
    }

  // The interior table only has types 1..4; type 0 produces no interior rows.
  if (INTSCHTYPE < 0 || INTSCHTYPE > 4)
    {
      printf("\nfatal : The interior scheme type is invalid. Use 0 <= INTSCHTYPE <= 4. exit ... \n");
      exit(EXIT_FAILURE);
    }

  // The near boundary table only has points 1..4, and every boundary row must
  // exist in the grid.
  if (len_NBSCHTYPE > 4 || len_NBSCHTYPE > colength)
    {
      printf("\nfatal : Too many near boundary schemes. Use len_NBSCHTYPE <= 4 and not larger than the grid. exit ... \n");
      exit(EXIT_FAILURE);
    }

  if (len_NBSCHTYPE > 0)
    {
      if (NBSCHTYPE == NULL)
        {
          printf("\nfatal : The near boundary scheme array is missing. exit ... \n");
          exit(EXIT_FAILURE);
        }

      // The near boundary table only has types 1..8.
      for (i = 0; i < len_NBSCHTYPE; i++)
        if (NBSCHTYPE[i] < 1 || NBSCHTYPE[i] > 8)
          {
            printf("\nfatal : A near boundary scheme type is invalid. Use 1 <= NBSCHTYPE[i] <= 8. exit ... \n");
            exit(EXIT_FAILURE);
          }

      // Evaluated once here instead of in every inner-loop condition.
      nbs_width = max_array_int(NBSCHTYPE, len_NBSCHTYPE) + 1;

      // A boundary row writes block columns 0 .. nbs_width-1 and reads
      // Lambdanbs[..][..][1 .. nbs_width]; both must stay in range.
      if (nbs_width > 9 || nbs_width > colength)
        {
          printf("\nfatal : The near boundary stencil is wider than the discrete space. exit ... \n");
          exit(EXIT_FAILURE);
        }
    }

  //------------------------------------------------------------------------
  //                Allocating arrays and matrices.
  //------------------------------------------------------------------------
  int tot = colength * z;          // full matrix dimension
  int tot2 = (colength-1) * z;     // dimension after cutting one block row/column
  size_t n_full = (size_t)tot * (size_t)tot;
  size_t n_cut  = (size_t)tot2 * (size_t)tot2;
  size_t n_f1   = (size_t)tot2 * (size_t)z;

  double *A = (double *)calloc( n_full , sizeof(double) );   // [[A]] in sub-block form
  double *B = (double *)calloc( n_full , sizeof(double) );   // [[B]] in sub-block form
  double *A1 = (double *)calloc( n_full , sizeof(double) );  // [[A]] as full matrix
  double *B1 = (double *)calloc( n_full , sizeof(double) );  // [[B]] as full matrix
  double *A2 = (double *)calloc( n_cut , sizeof(double) );   // sliced A1
  double *B2 = (double *)calloc( n_cut , sizeof(double) );   // sliced B1
  double *f1 = (double *)calloc( n_f1 , sizeof(double) );    // first block column of B1
  int *P = (int *)calloc( n_cut , sizeof(int) );             //permutation matrix
  double *b = (double *)calloc( (size_t)tot2 , sizeof(double) ); //temp b array
  double *x = (double *)calloc( (size_t)tot2 , sizeof(double) ); //temp solution array

  (*S) = (double *)calloc( n_cut , sizeof(double) );
  (*f1_mod) = (double *)calloc( n_f1 , sizeof(double) );

  // All sizes are strictly positive here, so NULL always means failure.
  if (A == NULL || B == NULL || A1 == NULL || B1 == NULL || A2 == NULL ||
      B2 == NULL || f1 == NULL || P == NULL || b == NULL || x == NULL ||
      (*S) == NULL || (*f1_mod) == NULL)
    {
      free(*S);
      (*S) = NULL;
      free(*f1_mod);
      (*f1_mod) = NULL;
      status = -1;
      goto cleanup;
    }

  //------------------------------------------------------------------------
  //                Filling the coefficient tables.
  //------------------------------------------------------------------------
  // The Interior Scheme.
  // NOTE : STARTING FROM SECOND-ORDER ACCURACY
  Lambdai[1][1]=-1./4.;
  Lambdai[1][2]=0.;
  Lambdai[1][3]=0.;
  Lambdai[1][4]=0.;
  Lambdai[2][1]=-7./24.;
  Lambdai[2][2]=1./48.;
  Lambdai[2][3]=0.;
  Lambdai[2][4]=0.;
  Lambdai[3][1]=-179./576.;
  Lambdai[3][2]=13./360.;
  Lambdai[3][3]=-11./2880.;
  Lambdai[3][4]=0.;
  Lambdai[4][1]=-5561./17280.;
  Lambdai[4][2]=163./3456.;
  Lambdai[4][3]=-23./2688.;
  Lambdai[4][4]=191./241920.;
  // Near Boundary Schemes.
  // NOTE : STARTING FROM SECOND-ORDER ACCURACY
  Lambdanbs[1][1][1]=-1./2.;
  Lambdanbs[1][1][2]=-1./2.;
  Lambdanbs[1][1][3]=0.;
  Lambdanbs[1][1][4]=0.;
  Lambdanbs[1][1][5]=0.;
  Lambdanbs[1][1][6]=0.;
  Lambdanbs[1][1][7]=0.;
  Lambdanbs[1][1][8]=0.;
  Lambdanbs[1][1][9]=0.;
  Lambdanbs[1][2][1]=-5./12.;
  Lambdanbs[1][2][2]=-2./3.;
  Lambdanbs[1][2][3]=1./12.;
  Lambdanbs[1][2][4]=0.;
  Lambdanbs[1][2][5]=0.;
  Lambdanbs[1][2][6]=0.;
  Lambdanbs[1][2][7]=0.;
  Lambdanbs[1][2][8]=0.;
  Lambdanbs[1][2][9]=0.;
  Lambdanbs[1][3][1]=-3./8.;
  Lambdanbs[1][3][2]=-19./24.;
  Lambdanbs[1][3][3]=5./24.;
  Lambdanbs[1][3][4]=-1./24.;
  Lambdanbs[1][3][5]=0.;
  Lambdanbs[1][3][6]=0.;
  Lambdanbs[1][3][7]=0.;
  Lambdanbs[1][3][8]=0.;
  Lambdanbs[1][3][9]=0.;
  Lambdanbs[1][4][1]=-251./720.;
  Lambdanbs[1][4][2]=-323./360.;
  Lambdanbs[1][4][3]=11./30.;
  Lambdanbs[1][4][4]=-53./360.;
  Lambdanbs[1][4][5]=19./720.;
  Lambdanbs[1][4][6]=0.;
  Lambdanbs[1][4][7]=0.;
  Lambdanbs[1][4][8]=0.;
  Lambdanbs[1][4][9]=0.;
  Lambdanbs[1][5][1]=-95./288.;
  Lambdanbs[1][5][2]=-1427./1440.;
  Lambdanbs[1][5][3]=133./240.;
  Lambdanbs[1][5][4]=-241./720.;
  Lambdanbs[1][5][5]=173./1440.;
  Lambdanbs[1][5][6]=-3./160.;
  Lambdanbs[1][5][7]=0.;
  Lambdanbs[1][5][8]=0.;
  Lambdanbs[1][5][9]=0.;
  Lambdanbs[1][6][1]=-19087./60480.;
  Lambdanbs[1][6][2]=-2713./2520.;
  Lambdanbs[1][6][3]=15487./20160.;
  Lambdanbs[1][6][4]=-586./945.;
  Lambdanbs[1][6][5]=6737./20160.;
  Lambdanbs[1][6][6]=-263./2520.;
  Lambdanbs[1][6][7]=863./60480.;
  Lambdanbs[1][6][8]=0.;
  Lambdanbs[1][6][9]=0.;
  Lambdanbs[1][7][1]=-5257./17280.;
  Lambdanbs[1][7][2]=-139849./120960.;
  Lambdanbs[1][7][3]=4511./4480.;
  Lambdanbs[1][7][4]=-123133./120960.;
  Lambdanbs[1][7][5]=88547./120960.;
  Lambdanbs[1][7][6]=-1537./4480.;
  Lambdanbs[1][7][7]=11351./120960.;
  Lambdanbs[1][7][8]=-275./24192.;
  Lambdanbs[1][7][9]=0.;
  Lambdanbs[1][8][1]=-1070017./3628800.;
  Lambdanbs[1][8][2]=-2233547./1814400.;
  Lambdanbs[1][8][3]=2302297./1814400.;
  Lambdanbs[1][8][4]=-2797679./1814400.;
  Lambdanbs[1][8][5]=31457./22680.;
  Lambdanbs[1][8][6]=-1573169./1814400.;
  Lambdanbs[1][8][7]=645607./1814400.;
  Lambdanbs[1][8][8]=-156437./1814400.;
  Lambdanbs[1][8][9]=33953./3628800.;
  Lambdanbs[2][1][1]=1./2.;
  Lambdanbs[2][1][2]=-1./2.;
  Lambdanbs[2][1][3]=0.;
  Lambdanbs[2][1][4]=0.;
  Lambdanbs[2][1][5]=0.;
  Lambdanbs[2][1][6]=0.;
  Lambdanbs[2][1][7]=0.;
  Lambdanbs[2][1][8]=0.;
  Lambdanbs[2][1][9]=0.;
  Lambdanbs[2][2][1]=1./4.;
  Lambdanbs[2][2][2]=0.;
  Lambdanbs[2][2][3]=-1./4.;
  Lambdanbs[2][2][4]=0.;
  Lambdanbs[2][2][5]=0.;
  Lambdanbs[2][2][6]=0.;
  Lambdanbs[2][2][7]=0.;
  Lambdanbs[2][2][8]=0.;
  Lambdanbs[2][2][9]=0.;
  Lambdanbs[2][3][1]=5./24.;
  Lambdanbs[2][3][2]=1./8.;
  Lambdanbs[2][3][3]=-3./8.;
  Lambdanbs[2][3][4]=1./24.;
  Lambdanbs[2][3][5]=0.;
  Lambdanbs[2][3][6]=0.;
  Lambdanbs[2][3][7]=0.;
  Lambdanbs[2][3][8]=0.;
  Lambdanbs[2][3][9]=0.;
  Lambdanbs[2][4][1]=3./16.;
  Lambdanbs[2][4][2]=5./24.;
  Lambdanbs[2][4][3]=-1./2.;
  Lambdanbs[2][4][4]=1./8.;
  Lambdanbs[2][4][5]=-1./48.;
  Lambdanbs[2][4][6]=0.;
  Lambdanbs[2][4][7]=0.;
  Lambdanbs[2][4][8]=0.;
  Lambdanbs[2][4][9]=0.;
  Lambdanbs[2][5][1]=251./1440.;
  Lambdanbs[2][5][2]=79./288.;
  Lambdanbs[2][5][3]=-91./144.;
  Lambdanbs[2][5][4]=37./144.;
  Lambdanbs[2][5][5]=-25./288.;
  Lambdanbs[2][5][6]=19./1440.;
  Lambdanbs[2][5][7]=0.;
  Lambdanbs[2][5][8]=0.;
  Lambdanbs[2][5][9]=0.;
  Lambdanbs[2][6][1]=95./576.;
  Lambdanbs[2][6][2]=119./360.;
  Lambdanbs[2][6][3]=-445./576.;
  Lambdanbs[2][6][4]=4./9.;
  Lambdanbs[2][6][5]=-131./576.;
  Lambdanbs[2][6][6]=5./72.;
  Lambdanbs[2][6][7]=-3./320.;
  Lambdanbs[2][6][8]=0.;
  Lambdanbs[2][6][9]=0.;
  Lambdanbs[2][7][1]=19087./120960.;
  Lambdanbs[2][7][2]=1315./3456.;
  Lambdanbs[2][7][3]=-1771./1920.;
  Lambdanbs[2][7][4]=2399./3456.;
  Lambdanbs[2][7][5]=-1649./3456.;
  Lambdanbs[2][7][6]=421./1920.;
  Lambdanbs[2][7][7]=-205./3456.;
  Lambdanbs[2][7][8]=863./120960.;
  Lambdanbs[2][7][9]=0.;
  Lambdanbs[2][8][1]=5257./34560.;
  Lambdanbs[2][8][2]=1145./2688.;
  Lambdanbs[2][8][3]=-18689./17280.;
  Lambdanbs[2][8][4]=3499./3456.;
  Lambdanbs[2][8][5]=-7./8.;
  Lambdanbs[2][8][6]=9289./17280.;
  Lambdanbs[2][8][7]=-755./3456.;
  Lambdanbs[2][8][8]=101./1920.;
  Lambdanbs[2][8][9]=-275./48384.;
  Lambdanbs[3][1][1]=1./2.;
  Lambdanbs[3][1][2]=-1./2.;
  Lambdanbs[3][1][3]=0.;
  Lambdanbs[3][1][4]=0.;
  Lambdanbs[3][1][5]=0.;
  Lambdanbs[3][1][6]=0.;
  Lambdanbs[3][1][7]=0.;
  Lambdanbs[3][1][8]=0.;
  Lambdanbs[3][1][9]=0.;
  Lambdanbs[3][2][1]=-1./4.;
  Lambdanbs[3][2][2]=1.;
  Lambdanbs[3][2][3]=-3./4.;
  Lambdanbs[3][2][4]=0.;
  Lambdanbs[3][2][5]=0.;
  Lambdanbs[3][2][6]=0.;
  Lambdanbs[3][2][7]=0.;
  Lambdanbs[3][2][8]=0.;
  Lambdanbs[3][2][9]=0.;
  Lambdanbs[3][3][1]=-1./24.;
  Lambdanbs[3][3][2]=3./8.;
  Lambdanbs[3][3][3]=-1./8.;
  Lambdanbs[3][3][4]=-5./24.;
  Lambdanbs[3][3][5]=0.;
  Lambdanbs[3][3][6]=0.;
  Lambdanbs[3][3][7]=0.;
  Lambdanbs[3][3][8]=0.;
  Lambdanbs[3][3][9]=0.;
  Lambdanbs[3][4][1]=-1./48.;
  Lambdanbs[3][4][2]=7./24.;
  Lambdanbs[3][4][3]=0.;
  Lambdanbs[3][4][4]=-7./24.;
  Lambdanbs[3][4][5]=1./48.;
  Lambdanbs[3][4][6]=0.;
  Lambdanbs[3][4][7]=0.;
  Lambdanbs[3][4][8]=0.;
  Lambdanbs[3][4][9]=0.;
  Lambdanbs[3][5][1]=-19./1440.;
  Lambdanbs[3][5][2]=73./288.;
  Lambdanbs[3][5][3]=11./144.;
  Lambdanbs[3][5][4]=-53./144.;
  Lambdanbs[3][5][5]=17./288.;
  Lambdanbs[3][5][6]=-11./1440.;
  Lambdanbs[3][5][7]=0.;
  Lambdanbs[3][5][8]=0.;
  Lambdanbs[3][5][9]=0.;
  Lambdanbs[3][6][1]=-3./320.;
  Lambdanbs[3][6][2]=83./360.;
  Lambdanbs[3][6][3]=77./576.;
  Lambdanbs[3][6][4]=-4./9.;
  Lambdanbs[3][6][5]=67./576.;
  Lambdanbs[3][6][6]=-11./360.;
  Lambdanbs[3][6][7]=11./2880.;
  Lambdanbs[3][6][8]=0.;
  Lambdanbs[3][6][9]=0.;
  Lambdanbs[3][7][1]=-863./120960.;
  Lambdanbs[3][7][2]=3713./17280.;
  Lambdanbs[3][7][3]=347./1920.;
  Lambdanbs[3][7][4]=-1807./3456.;
  Lambdanbs[3][7][5]=673./3456.;
  Lambdanbs[3][7][6]=-149./1920.;
  Lambdanbs[3][7][7]=337./17280.;
  Lambdanbs[3][7][8]=-271./120960.;
  Lambdanbs[3][7][9]=0.;
  Lambdanbs[3][8][1]=-275./48384.;
  Lambdanbs[3][8][2]=24587./120960.;
  Lambdanbs[3][8][3]=85./384.;
  Lambdanbs[3][8][4]=-10439./17280.;
  Lambdanbs[3][8][5]=8./27.;
  Lambdanbs[3][8][6]=-61./384.;
  Lambdanbs[3][8][7]=1039./17280.;
  Lambdanbs[3][8][8]=-335./24192.;
  Lambdanbs[3][8][9]=13./8960.;
  Lambdanbs[4][1][1]=1./2.;
  Lambdanbs[4][1][2]=-1./2.;
  Lambdanbs[4][1][3]=0.;
  Lambdanbs[4][1][4]=0.;
  Lambdanbs[4][1][5]=0.;
  Lambdanbs[4][1][6]=0.;
  Lambdanbs[4][1][7]=0.;
  Lambdanbs[4][1][8]=0.;
  Lambdanbs[4][1][9]=0.;
  Lambdanbs[4][2][1]=-3./4.;
  Lambdanbs[4][2][2]=2.;
  Lambdanbs[4][2][3]=-5./4.;
  Lambdanbs[4][2][4]=0.;
  Lambdanbs[4][2][5]=0.;
  Lambdanbs[4][2][6]=0.;
  Lambdanbs[4][2][7]=0.;
  Lambdanbs[4][2][8]=0.;
  Lambdanbs[4][2][9]=0.;
  Lambdanbs[4][3][1]=5./24.;
  Lambdanbs[4][3][2]=-7./8.;
  Lambdanbs[4][3][3]=13./8.;
  Lambdanbs[4][3][4]=-23./24.;
  Lambdanbs[4][3][5]=0.;
  Lambdanbs[4][3][6]=0.;
  Lambdanbs[4][3][7]=0.;
  Lambdanbs[4][3][8]=0.;
  Lambdanbs[4][3][9]=0.;
  Lambdanbs[4][4][1]=1./48.;
  Lambdanbs[4][4][2]=-1./8.;
  Lambdanbs[4][4][3]=1./2.;
  Lambdanbs[4][4][4]=-5./24.;
  Lambdanbs[4][4][5]=-3./16.;
  Lambdanbs[4][4][6]=0.;
  Lambdanbs[4][4][7]=0.;
  Lambdanbs[4][4][8]=0.;
  Lambdanbs[4][4][9]=0.;
  Lambdanbs[4][5][1]=11./1440.;
  Lambdanbs[4][5][2]=-17./288.;
  Lambdanbs[4][5][3]=53./144.;
  Lambdanbs[4][5][4]=-11./144.;
  Lambdanbs[4][5][5]=-73./288.;
  Lambdanbs[4][5][6]=19./1440.;
  Lambdanbs[4][5][7]=0.;
  Lambdanbs[4][5][8]=0.;
  Lambdanbs[4][5][9]=0.;
  Lambdanbs[4][6][1]=11./2880.;
  Lambdanbs[4][6][2]=-13./360.;
  Lambdanbs[4][6][3]=179./576.;
  Lambdanbs[4][6][4]=0.;
  Lambdanbs[4][6][5]=-179./576.;
  Lambdanbs[4][6][6]=13./360.;
  Lambdanbs[4][6][7]=-11./2880.;
  Lambdanbs[4][6][8]=0.;
  Lambdanbs[4][6][9]=0.;
  Lambdanbs[4][7][1]=271./120960.;
  Lambdanbs[4][7][2]=-433./17280.;
  Lambdanbs[4][7][3]=533./1920.;
  Lambdanbs[4][7][4]=191./3456.;
  Lambdanbs[4][7][5]=-1265./3456.;
  Lambdanbs[4][7][6]=133./1920.;
  Lambdanbs[4][7][7]=-257./17280.;
  Lambdanbs[4][7][8]=191./120960.;
  Lambdanbs[4][7][9]=0.;
  Lambdanbs[4][8][1]=13./8960.;
  Lambdanbs[4][8][2]=-2267./120960.;
  Lambdanbs[4][8][3]=883./3456.;
  Lambdanbs[4][8][4]=191./1920.;
  Lambdanbs[4][8][5]=-91./216.;
  Lambdanbs[4][8][6]=1961./17280.;
  Lambdanbs[4][8][7]=-71./1920.;
  Lambdanbs[4][8][8]=191./24192.;
  Lambdanbs[4][8][9]=-191./241920.;
  // coefficients completed up to here

  //------------------------------------------------------------------------
  //                Generating the LHS tensor [[A]].
  //------------------------------------------------------------------------
  // Sub-block (bi,bj) starts at offset (bi*colength+bj)*z*z; only the diagonal
  // entries tt*z+tt of a sub-block are ever written.

  //Defining Near Boundary Schemes for [[A]] at point i=1.
  Theta1 = -1.;
  for( tt = 0; tt < z; tt++)
    {
      // A(1,1) = {eye(z)};
      A[(0*colength+0)*z*z + tt*z + tt] = 1.;
      // A(1,2) = {Theta1*eye(z)};
      A[(0*colength+1)*z*z + tt*z + tt] = Theta1;
    }

  // Defining Interior Schemes for [[A]].
  Theta1 = -1./2.;
  for(i = 1; i < (colength-1); i++)
    for( tt = 0; tt < z; tt++)
      {
        // A[i][i-1] = {Theta1*eye(z)}
        A[(i*colength+(i-1))*z*z + tt*z + tt] = Theta1;
        // A[i][i] = {eye(z)};
        A[(i*colength+i)*z*z + tt*z + tt] = 1.;
        // A[i][i+1] = {Theta1*eye(z)};
        A[(i*colength+(i+1))*z*z + tt*z + tt] = Theta1;
      }

  // Defining Near Boundary Schemes for [[A]] at point i="colength".
  Theta1 = -1.;
  for( tt = 0; tt < z; tt++)
    {
      // A(colength,colength) = {eye(z)};
      A[((colength-1)*colength+(colength-1))*z*z + tt*z + tt] = 1.;
      // A(colength,colength-1) = {Theta1*eye(z)};
      A[((colength-1)*colength+ (colength-2))*z*z + tt*z + tt] = Theta1;
    }

  //------------------------------------------------------------------------
  //            Generating the RHS tensor [[B]].
  // ------------------------------------------------------------------------

  // Defining Interior Schemes for [[B]].
  for ( i = INTSCHTYPE; i < (colength-INTSCHTYPE); i++)
    for ( j = 1; j <= INTSCHTYPE; j++)
      for( tt = 0; tt < z; tt++)
        {
          //B(i,i-j) = {-Lambdai(INTSCHTYPE,j)*eye(z)};
          B[(i*colength+i-j)*z*z + tt*z + tt] = -Lambdai[INTSCHTYPE][j];
          //B(i,i+j) = {Lambdai(INTSCHTYPE,j)*eye(z)};
          B[(i*colength+i+j)*z*z + tt*z + tt] = Lambdai[INTSCHTYPE][j];
        }

  // The boundary rows are written after the interior rows, so they overwrite
  // any interior entry that falls on the same position.

  //Defining Near Boundary Schemes for [[B]] at point i=1.
  for(i = 0; i < len_NBSCHTYPE; i++)
    for(j = 0; j < nbs_width; j++)
      for( tt = 0; tt < z; tt++)
        B[(i*colength+j)*z*z + tt*z + tt] = Lambdanbs[i+1][NBSCHTYPE[i]][j+1];

  // Defining Near Boundary Schemes for [[B]] at points near i="colength".
  // Same coefficients, mirrored in row and column and with opposite sign.
  for(i = 0; i < len_NBSCHTYPE; i++)
    for(j = 0; j < nbs_width; j++)
      for( tt = 0; tt < z; tt++)
        B[((colength-i-1)*colength+(colength-j-1))*z*z + tt*z + tt] = -Lambdanbs[i+1][NBSCHTYPE[i]][j+1];

  //converting [[A]] and [[B]] from subblock form to full matrix[i*n+j] form
  for(i = 0; i < colength; i++) //loop over row blocks
    for(j = 0; j < colength; j++) //loop over column blocks
      for( tt = 0; tt < z; tt++)
        {
          glb_i = i*z + tt;
          glb_j = j*z + tt;
          A1[glb_i * tot + glb_j] = A[(i*colength+j)*z*z + tt*z + tt];
          B1[glb_i * tot + glb_j] = B[(i*colength+j)*z*z + tt*z + tt];
        }

  // slicing the matrices A1 and B1
  for(i = 0; i < tot2; i++) //cut the last row block
    for(j = z; j < tot; j++)   //cut the first column block
      {
        A2[i * tot2 + j-z] = A1[i * tot + j];
        B2[i * tot2 + j-z] = B1[i * tot + j];
      }

  //------------------------------------------------------------------------
  //            Solving with the LU factors of [A2].
  //------------------------------------------------------------------------
  lu_serial(A2, P, tot2);

  // copy first column BLOCK of B1 (excluding the last row block) to [f1]
  for (j = 0; j < z; j++)
    for(i = 0; i < tot2; i++)
      f1[i*z+j] = B1[i * tot + j];

  // solve A2^-1*f1 to obtain f1_mod, one column at a time
  for (j = 0; j < z; j++)
    {
      for( i = 0; i < tot2; i++)
        b[i] = f1[i*z+j];
      solve_lu_serial(P, A2, x, b, tot2);
      for( i = 0; i < tot2; i++)
        (*f1_mod)[i*z + j] = x[i];
    }

  // loop over columns of B2 and solve using the already stored LU to compute
  // [S] = [A2]^-1 * [B2] without forming the real inverse.
  for (j = 0; j < tot2; j++)
    {
      for( i = 0; i < tot2; i++)
        b[i] = B2[i*tot2 + j];
      solve_lu_serial(P, A2, x, b, tot2);
      for( i = 0; i < tot2; i++)
        (*S)[i*tot2 + j] = x[i];
    }

  // machine epsilon filter
  // this is safe for colength < 5. for big colengths we need to use function S_safe()
  if(filter)
    {
      for(i = 0; i < tot2; i++)
        for(j = 0; j < tot2; j++)
          if(fabs((*S)[i*tot2 + j]) <= 1.e-16) //freeze it!
            (*S)[i*tot2 + j] = 0.;

      for(i = 0; i < tot2; i++)
        for(j = 0; j < z; j++)
          if(fabs((*f1_mod)[i*z + j]) <= 1.e-16) //freeze it!
            (*f1_mod)[i*z + j] = 0.;
    }

  //f1_mod and S are now computed. preparing to exit ...

 cleanup:
  // free(NULL) is a no-op, so this is also safe on the allocation-failure path.
  // (*S) and (*f1_mod) are NOT freed here: they belong to the caller.
  free(A);
  free(B);
  free(A1);
  free(B1);
  free(A2);
  free(B2);
  free(f1);
  free(P);
  free(x);
  free(b);

  return status;
}
/*
 * Iteratively solves a block-sparse linear system, using an LU factorization
 * of every diagonal block.
 *
 * Each sweep computes, for every block row i,
 *     x_star_i = rhs_i - sum_{j != diag} A_ij * xn_j
 * and then solves the factorized diagonal block for xn1_i. Every off-diagonal
 * product uses the previous iterate xn. Sweeps repeat until the root mean
 * square of (xn1 - xn) drops below GS_RES_EPS; there is no iteration limit.
 *
 * Parameters
 *   nnodes  : number of block rows.
 *   nnz     : not used by this routine.
 *   ia      : block row i owns the stored blocks ia[i] .. ia[i+1]-1.
 *   ja      : block column index of every stored block.
 *   iau     : iau[i] is the storage index of the diagonal block of row i.
 *   A       : stored blocks, neqs*neqs doubles each. The diagonal blocks are
 *             overwritten by lu_serial().
 *   rhs     : right-hand side, nnodes*neqs entries.
 *   neqs    : dimension of one block.
 *   x_star  : work array, nnodes*neqs entries.
 *   xn1     : work array for the new iterate, nnodes*neqs entries.
 *   xn      : in/out - starting guess (overwritten first when init is
 *             non-zero) and final solution.
 *   init    : when non-zero, xn is seeded with rhs divided by the diagonal
 *             entries of the diagonal blocks.
 *
 * Returns 0 on success, -1 if the pivot storage could not be allocated.
 *
 * NOTE(review): neg_matrix_vec__mult(M, v, y, n) is presumed to accumulate
 * y -= M*v, and lu_serial()/solve_lu_serial() to factor/solve in place with
 * pivot data in P - confirm against their definitions.
 */
int gauss_seidel_solve_pivoting(int nnodes, int nnz, int *ia, int *ja,  int *iau, double *A, double *rhs, int neqs, double *x_star, double *xn1, double *xn, short init)
{
    const int blk = neqs * neqs;            /* doubles (or pivots) per block */
    const int n_unknowns = nnodes * neqs;   /* length of every vector        */
    size_t n_perm = 0;
    int i, j, k;
    double GS_res = 0.;
    int *P;

    (void)nnz;   /* kept for interface compatibility only */

    /* Size the pivot storage in size_t so the product cannot overflow int. */
    if (nnodes > 0 && neqs > 0)
        n_perm = (size_t)nnodes * (size_t)neqs * (size_t)neqs;

    /* One neqs*neqs pivot block per diagonal block. */
    P = (int *)malloc( n_perm * sizeof(int) );
    if (P == NULL && n_perm != 0)
        return -1;

    /* Replace the diagonal blocks with their LU, in place, and keep the
       matching pivot data in [P]. */
    for ( i = 0; i < nnodes; i++)
        lu_serial((A + iau[i]*blk), (P + i*blk), neqs);

    /* Seed [xn] with [rhs] / diag. The diagonal blocks were already passed to
       lu_serial() above, so the divisor is whatever that routine left on the
       diagonal of the block. */
    if (init)
        for ( i = 0; i < nnodes; i++)
        {
            const double *diag_blk = A + iau[i]*blk;
            for( j = 0; j < neqs; j++)
                xn[i*neqs + j] = rhs[i*neqs + j] / diag_blk[j*neqs + j];
        }

    do {
        /* resetting [x_star] */
        for ( k = 0; k < n_unknowns; k++)
            x_star[k] = 0.;

        /* [x_star] = [rhs] - [O] [xn], [O] being the off-diagonal blocks */
        for( i = 0; i < nnodes; i++)
        {
            const int jstart = ia[i];
            const int jend = ia[i+1]-1;

            for ( j = jstart; j <= jend; j++)
            {
                if( j == iau[i] ) /* this is the diagonal block, skip it */
                    continue;
                neg_matrix_vec__mult( (A + j*blk) , (xn + ja[j]*neqs) , (x_star + i*neqs) , neqs);
            }

            for ( j = 0; j < neqs; j++)
                x_star[i*neqs + j] += rhs[i*neqs + j];
        }

        /* solving the diagonal, i.e. [xn+1] = [D]^-1 [x_star] */
        for( i = 0; i < nnodes; i++)
            solve_lu_serial((P + i*blk), (A + iau[i]*blk), (xn1 + i*neqs), (x_star + i*neqs), neqs);

        /* RMS of the update, accumulated in the same order as the vectors are
           stored, then [xn] <- [xn+1]. */
        GS_res = 0.;
        for ( k = 0; k < n_unknowns; k++)
        {
            const double diff = xn1[k] - xn[k];
            GS_res += diff * diff;
            xn[k] = xn1[k];
        }
        GS_res /= (neqs * nnodes);
        GS_res = sqrt(GS_res);

        /* A NaN residual also ends the loop because the comparison is false. */
    } while( GS_res >= GS_RES_EPS );

    /* clean-up */
    free(P);

    /* completed successfully */
    return 0;

}