void sgemm( int m_a, int n_a, float *A, float *B, float *C ) {
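  /* Round m_a up to a multiple of STEPM and n_a up to a multiple of STEPN so
     that the 4x4 SSE kernel below never has to handle edge cases.  STEPM,
     STEPN, transposeA, transposeB and move are assumed to be defined
     elsewhere in this file. */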

  int mpad = m_a%STEPM ? (m_a/STEPM+1)*STEPM:m_a;
  int npad = n_a%STEPN ? (n_a/STEPN+1)*STEPN:n_a;
  int mbackup = m_a;

  float* Apad=malloc(mpad*npad*sizeof(float));
  transposeA(n_a, m_a, npad, mpad, Apad, A);
  A=Apad;
  float* Bpad=malloc(npad*mpad*sizeof(float));
  transposeB(n_a, m_a, npad, mpad, Bpad, B);
  B=Bpad;            
 
  float* Cpad = malloc(mpad*mpad*sizeof(float));
  float* backup = C;
  C = Cpad;

  m_a = mpad;
  n_a = npad;
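  /* From here on m_a and n_a are the padded dimensions; the original row
     count and output pointer are kept in mbackup and backup for the final
     copy-out. */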

#pragma omp parallel 
  {
    __m128 a1, a2, a3, a4, c0;
    __m128 c11, c12, c13, c14;
    __m128 c21, c22, c23, c24;
    __m128 c31, c32, c33, c34;
    __m128 c41, c42, c43, c44;

    __m128 b1, b2, b3, b4;
    int ii=0;
    int jj=0;

    int jjna=0;
    int jjma=0;
    int iina=0;
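
    /* The outer j loop is divided among the OpenMP threads; all variables
       declared inside the parallel region are thread-private, and different
       j values write disjoint parts of C. */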

#pragma omp for
    for( int j = 0; j < m_a; j+=4 ) {
      jj=j;

      jjma=jj*m_a;
      jjna=jj*n_a;

      for( int i = 0; i < m_a; i+=4 ) {
	ii=i;

	iina=ii*n_a;

	c11 = c12 = c13 = c14 =
	c21 = c22 = c23 = c24 =
	c31 = c32 = c33 = c34 =
	c41 = c42 = c43 = c44 = _mm_setzero_ps();
 
	for( int k = 0; k < n_a; k+=4 ) {
	  float* tempA=A+k+iina;
	  float* tempB=B+k+jjna;

	  b1 = _mm_loadu_ps(tempB);
	  b2 = _mm_loadu_ps(tempB+n_a);
	  b3 = _mm_loadu_ps(tempB+2*n_a);
	  b4 = _mm_loadu_ps(tempB+3*n_a);
	  /////////////////////////////////////////
	  a1 = _mm_loadu_ps(tempA);
	  a2 = _mm_loadu_ps(tempA+n_a);
	  a3 = _mm_loadu_ps(tempA+n_a*2);
	  a4 = _mm_loadu_ps(tempA+n_a*3);

	  c11=_mm_add_ps(c11,  _mm_mul_ps(a1, b1));
	  c21 = _mm_add_ps(c21, _mm_mul_ps(a2, b1));

	  c12 = _mm_add_ps(c12, _mm_mul_ps(a1, b2));
	  c22 = _mm_add_ps(c22, _mm_mul_ps(a2, b2));

	  c13= _mm_add_ps(c13,  _mm_mul_ps(a1, b3));
	  c23 = _mm_add_ps(c23, _mm_mul_ps(a2, b3));

	  c14 = _mm_add_ps(c14, _mm_mul_ps(a1, b4));
	  c24 = _mm_add_ps(c24, _mm_mul_ps(a2, b4));

	  c31=_mm_add_ps(c31,  _mm_mul_ps(a3, b1));
	  c41 = _mm_add_ps(c41, _mm_mul_ps(a4, b1));

	  c32 = _mm_add_ps(c32, _mm_mul_ps(a3, b2));
	  c42 = _mm_add_ps(c42, _mm_mul_ps(a4, b2));

	  c33= _mm_add_ps(c33,  _mm_mul_ps(a3, b3));
	  c43 = _mm_add_ps(c43, _mm_mul_ps(a4, b3));

	  c34 = _mm_add_ps(c34, _mm_mul_ps(a3, b4));
	  c44 = _mm_add_ps(c44, _mm_mul_ps(a4, b4));
	}
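
	/* Each cXY register now holds four partial sums; the horizontal adds
	   below reduce them to a single float that is stored into one entry
	   of the padded C. */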

	c0=  _mm_hadd_ps(c11,c11);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c12,c12);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c13,c13);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c14,c14);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c21,c21);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c22,c22);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c23,c23);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c24,c24);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c31,c31);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c32,c32);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c33,c33);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c34,c34);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c41,c41);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c42,c42);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c43,c43);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c44,c44);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*3] = _mm_cvtss_f32(c0);
      }
    } 
  }
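
  /* Copy the original-sized result out of the padded buffer back into the
     caller's C, then release the padded copies (A, B and C point at Apad,
     Bpad and Cpad by now). */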

  move(mbackup, mpad, C, backup);
  free(A);
  free(B);
  free(C);
}             
//
//  Amesos_TestMultiSolver.cpp reads in a matrix in Harwell-Boeing format, 
//  calls one of the sparse direct solvers, using blocked right hand sides
//  and computes the error and residual.  
//
//  TestSolver ignores the Harwell-Boeing right hand sides, creating
//  random right hand sides instead.  
//
//  Amesos_TestMultiSolver can test either A x = b or A^T x = b.
//  This can be a bit confusing because sparse direct solvers 
//  use compressed column storage - the transpose of Trilinos'
//  sparse row storage.
//
//  Matrices:
//    readA - Serial.  As read from the file.
//    transposeA - Serial.  The transpose of readA.
//    serialA - if (transpose) then transposeA else readA 
//    distributedA - readA distributed to all processes
//    passA - if ( distributed ) then distributedA else serialA
//
//
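//
//  A minimal, hypothetical driver sketch (the matrix file and solver choice
//  are illustrative only, not part of this test):
//
//    Epetra_MpiComm Comm( MPI_COMM_WORLD );
//    int ierr = Amesos_TestMultiSolver( Comm, (char *) "ImpcolA.rua", 4,
//                                       KLU, false, 0, AMESOS_Distributed );
//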
int Amesos_TestMultiSolver( Epetra_Comm &Comm, char *matrix_file, int numsolves, 
		      SparseSolverType SparseSolver, bool transpose,
		      int special, AMESOS_MatrixType matrix_type ) {


  int iam = Comm.MyPID() ;

  
  Comm.Barrier();


  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA; 
  Epetra_Vector * readx; 
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
   
  std::string FileName = matrix_file ;
  int FN_Size = FileName.size() ; 
  std::string LastFiveBytes = FileName.substr( EPETRA_MAX(0,FN_Size-5), FN_Size );
  std::string LastFourBytes = FileName.substr( EPETRA_MAX(0,FN_Size-4), FN_Size );
  bool NonContiguousMap = false; 

  if ( LastFiveBytes == ".triU" ) { 
    NonContiguousMap = true; 
    // Call routine to read in unsymmetric Triplet matrix
    EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, false, Comm, readMap, readA, readx, 
						      readb, readxexact, NonContiguousMap ) );
  } else {
    if ( LastFiveBytes == ".triS" ) { 
      NonContiguousMap = true; 
      // Call routine to read in symmetric Triplet matrix
      EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, true, Comm, 
							readMap, readA, readx, 
							readb, readxexact, NonContiguousMap ) );
    } else {
      if (  LastFourBytes == ".mtx" ) { 
	EPETRA_CHK_ERR( Trilinos_Util_ReadMatrixMarket2Epetra( matrix_file, Comm, readMap, 
							       readA, readx, readb, readxexact) );
      } else {
	// Call routine to read in HB problem
	Trilinos_Util_ReadHb2Epetra( matrix_file, Comm, readMap, readA, readx, 
						     readb, readxexact) ;
      }
    }
  }

  Epetra_CrsMatrix transposeA(Copy, *readMap, 0);
  Epetra_CrsMatrix *serialA ; 

  if ( transpose ) {
    assert( CrsMatrixTranspose( readA, &transposeA ) == 0 ); 
    serialA = &transposeA ; 
  } else {
    serialA = readA ; 
  }

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);
  Epetra_Map* map_;

  if( NonContiguousMap ) {
    //
    //  map gives us NumMyElements and MyFirstElement;
    //
    int NumGlobalElements =  readMap->NumGlobalElements();
    int NumMyElements = map.NumMyElements();
    int MyFirstElement = map.MinMyGID();
    std::vector<int> MapMap_( NumGlobalElements );
    readMap->MyGlobalElements( &MapMap_[0] ) ;
    Comm.Broadcast( &MapMap_[0], NumGlobalElements, 0 ) ; 
    map_ = new Epetra_Map( NumGlobalElements, NumMyElements, &MapMap_[MyFirstElement], 0, Comm);
  } else {
    map_ = new Epetra_Map( map ) ; 
  }
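
  //  For .triU/.triS files the global IDs read from the file need not be
  //  0..N-1, so map_ reuses readMap's own GID list (redistributed in
  //  uniform chunks) instead of the default contiguous map.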


  // Create Exporter to distribute read-in matrix and vectors
  Epetra_Export exporter(*readMap, *map_);
  Epetra_CrsMatrix A(Copy, *map_, 0);

  Epetra_RowMatrix * passA = 0; 
  Epetra_MultiVector * passx = 0; 
  Epetra_MultiVector * passb = 0;
  Epetra_MultiVector * passxexact = 0;
  Epetra_MultiVector * passresid = 0;
  Epetra_MultiVector * passtmp = 0;

  Epetra_MultiVector x(*map_,numsolves);
  Epetra_MultiVector b(*map_,numsolves);
  Epetra_MultiVector xexact(*map_,numsolves);
  Epetra_MultiVector resid(*map_,numsolves);
  Epetra_MultiVector tmp(*map_,numsolves);

  Epetra_MultiVector serialx(*readMap,numsolves);
  Epetra_MultiVector serialb(*readMap,numsolves);
  Epetra_MultiVector serialxexact(*readMap,numsolves);
  Epetra_MultiVector serialresid(*readMap,numsolves);
  Epetra_MultiVector serialtmp(*readMap,numsolves);

  bool distribute_matrix = ( matrix_type == AMESOS_Distributed ) ; 
  if ( distribute_matrix ) { 
    //
    //  Export the matrix read on process 0 (serialA) onto the uniformly
    //  distributed map and complete its fill
    //
    
    A.Export(*serialA, exporter, Add);
    Comm.Barrier();

    assert(A.FillComplete()==0);    
    Comm.Barrier();

    passA = &A; 
    passx = &x; 
    passb = &b;
    passxexact = &xexact;
    passresid = &resid;
    passtmp = &tmp;
  } else { 
    passA = serialA; 
    passx = &serialx; 
    passb = &serialb;
    passxexact = &serialxexact;
    passresid = &serialresid;
    passtmp = &serialtmp;
  }

  passxexact->SetSeed(131) ; 
  passxexact->Random();
  passx->SetSeed(11231) ; 
  passx->Random();

  passb->PutScalar( 0.0 );
  passA->Multiply( transpose, *passxexact, *passb ) ; 
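  //  b is manufactured as (A or A^T) * xexact, so the exact solution is
  //  known and the error of each computed solution can be measured below.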

  Epetra_MultiVector CopyB( *passb ) ;

  double Anorm = passA->NormInf() ; 
  SparseDirectTimingVars::SS_Result.Set_Anorm(Anorm) ;

  Epetra_LinearProblem Problem(  (Epetra_RowMatrix *) passA, 
				 (Epetra_MultiVector *) passx, 
				 (Epetra_MultiVector *) passb );

  double max_resid = 0.0;
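  //  Repeat the solve special+1 times; each pass is timed separately and
  //  max_resid keeps the largest residual seen across the passes.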
  for ( int j = 0 ; j < special+1 ; j++ ) { 
    
    Epetra_Time TotalTime( Comm ) ; 
    if ( false ) { 
#ifdef TEST_UMFPACK

      unused code

    } else if ( SparseSolver == UMFPACK ) { 
      UmfpackOO umfpack( (Epetra_RowMatrix *) passA, 
			 (Epetra_MultiVector *) passx, 
			 (Epetra_MultiVector *) passb ) ; 
    
      umfpack.SetTrans( transpose ) ; 
      umfpack.Solve() ; 
#endif
#ifdef TEST_SUPERLU
    } else if ( SparseSolver == SuperLU ) { 
      SuperluserialOO superluserial( (Epetra_RowMatrix *) passA, 
				     (Epetra_MultiVector *) passx, 
				     (Epetra_MultiVector *) passb ) ; 

      superluserial.SetPermc( SuperLU_permc ) ; 
      superluserial.SetTrans( transpose ) ; 
      superluserial.SetUseDGSSV( special == 0 ) ; 
      superluserial.Solve() ; 
#endif
#ifdef HAVE_AMESOS_SLUD
    } else if ( SparseSolver == SuperLUdist ) { 
      SuperludistOO superludist( Problem ) ; 
      superludist.SetTrans( transpose ) ; 
      EPETRA_CHK_ERR( superludist.Solve( true ) ) ;
#endif 
#ifdef HAVE_AMESOS_SLUD2
    } else if ( SparseSolver == SuperLUdist2 ) { 
      Superludist2_OO superludist2( Problem ) ; 
      superludist2.SetTrans( transpose ) ; 
      EPETRA_CHK_ERR( superludist2.Solve( true ) ) ;
#endif 
#ifdef TEST_SPOOLES
    } else if ( SparseSolver == SPOOLES ) { 
      SpoolesOO spooles( (Epetra_RowMatrix *) passA, 
			 (Epetra_MultiVector *) passx, 
			 (Epetra_MultiVector *) passb ) ; 
    
      spooles.SetTrans( transpose ) ; 
      spooles.Solve() ; 
#endif
#ifdef HAVE_AMESOS_DSCPACK
    } else if ( SparseSolver == DSCPACK ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Dscpack dscpack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( dscpack.SetParameters( ParamList ) ); 
    
      EPETRA_CHK_ERR( dscpack.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_UMFPACK
    } else if ( SparseSolver == UMFPACK ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Umfpack umfpack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( umfpack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( umfpack.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( umfpack.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_KLU
    } else if ( SparseSolver == KLU ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Klu klu( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( klu.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( klu.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( klu.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( klu.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( klu.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_PARAKLETE
    } else if ( SparseSolver == PARAKLETE ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Paraklete paraklete( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( paraklete.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( paraklete.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( paraklete.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( paraklete.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( paraklete.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_SLUS
    } else if ( SparseSolver == SuperLU ) { 
      Epetra_SLU superluserial( &Problem ) ; 
      EPETRA_CHK_ERR( superluserial.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( superluserial.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( superluserial.NumericFactorization(  ) ); 

      EPETRA_CHK_ERR( superluserial.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_LAPACK
    } else if ( SparseSolver == LAPACK ) { 
      Teuchos::ParameterList ParamList ;
      ParamList.set( "MaxProcs", -3 );
      Amesos_Lapack lapack( Problem ) ; 
      EPETRA_CHK_ERR( lapack.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( lapack.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( lapack.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( lapack.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_TAUCS
    } else if ( SparseSolver == TAUCS ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Taucs taucs( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( taucs.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( taucs.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( taucs.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( taucs.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( taucs.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_PARDISO
    } else if ( SparseSolver == PARDISO ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Pardiso pardiso( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( pardiso.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( pardiso.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( pardiso.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( pardiso.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( pardiso.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_PARKLETE
    } else if ( SparseSolver == PARKLETE ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Parklete parklete( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( parklete.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( parklete.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( parklete.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( parklete.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( parklete.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_MUMPS
    } else if ( SparseSolver == MUMPS ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Mumps mumps( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( mumps.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( mumps.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( mumps.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( mumps.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( mumps.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_SCALAPACK
    } else if ( SparseSolver == SCALAPACK ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Scalapack scalapack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( scalapack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( scalapack.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( scalapack.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( scalapack.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( scalapack.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_SUPERLUDIST
    } else if ( SparseSolver == SUPERLUDIST ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Superludist superludist( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( superludist.SetParameters( ParamList ) ); 

      EPETRA_CHK_ERR( superludist.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( superludist.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( superludist.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( superludist.Solve( ) ); 
#endif
#ifdef HAVE_AMESOS_SUPERLU
    } else if ( SparseSolver == SUPERLU ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Superlu superlu( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( superlu.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( superlu.SetUseTranspose( transpose ) ); 
    
      EPETRA_CHK_ERR( superlu.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( superlu.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( superlu.Solve( ) ); 
#endif
#ifdef TEST_SPOOLESSERIAL 
    } else if ( SparseSolver == SPOOLESSERIAL ) { 
      SpoolesserialOO spoolesserial( (Epetra_RowMatrix *) passA, 
				     (Epetra_MultiVector *) passx, 
				     (Epetra_MultiVector *) passb ) ; 
    
      spoolesserial.Solve() ;
#endif
    } else { 
      SparseDirectTimingVars::log_file << "Solver not implemented yet" << std::endl ;
      std::cerr << "\n\n####################  Requested solver not available (Or not tested with blocked RHS) on this platform #####################\n" << std::endl ;
    }

    SparseDirectTimingVars::SS_Result.Set_Total_Time( TotalTime.ElapsedTime() ); 
    //    SparseDirectTimingVars::SS_Result.Set_First_Time( 0.0 ); 
    //    SparseDirectTimingVars::SS_Result.Set_Middle_Time( 0.0 ); 
    //    SparseDirectTimingVars::SS_Result.Set_Last_Time( 0.0 ); 

    //
    //  Compute the error = norm(xcomp - xexact )
    //
    std::vector <double> error(numsolves) ; 
    double max_error = 0.0;
  
    passresid->Update(1.0, *passx, -1.0, *passxexact, 0.0);

    passresid->Norm2(&error[0]);
    for ( int i = 0 ; i< numsolves; i++ ) 
      if ( error[i] > max_error ) max_error = error[i] ; 
    SparseDirectTimingVars::SS_Result.Set_Error(max_error) ;

    //  passxexact->Norm2(&error[0] ) ; 
    //  passx->Norm2(&error ) ; 

    //
    //  Compute the residual = norm(Ax - b)
    //
    std::vector <double> residual(numsolves) ; 
  
    passtmp->PutScalar(0.0);
    passA->Multiply( transpose, *passx, *passtmp);
    passresid->Update(1.0, *passtmp, -1.0, *passb, 0.0); 
    //    passresid->Update(1.0, *passtmp, -1.0, CopyB, 0.0); 
    passresid->Norm2(&residual[0]);

    for ( int i = 0 ; i< numsolves; i++ ) 
      if ( residual[i] > max_resid ) max_resid = residual[i] ; 


    SparseDirectTimingVars::SS_Result.Set_Residual(max_resid) ;
    
    std::vector <double> bnorm(numsolves); 
    passb->Norm2( &bnorm[0] ) ; 
    SparseDirectTimingVars::SS_Result.Set_Bnorm(bnorm[0]) ;

    std::vector <double> xnorm(numsolves); 
    passx->Norm2( &xnorm[0] ) ; 
    SparseDirectTimingVars::SS_Result.Set_Xnorm(xnorm[0]) ;


    if ( false && iam == 0 ) { 

      std::cout << " Amesos_TestMultiSolver.cpp " << std::endl ; 
      for ( int i = 0 ; i< numsolves && i < 10 ; i++ ) {
	std::cout << "i=" << i 
	     << " error = " << error[i] 
	     << " xnorm = " << xnorm[i] 
	     << " residual = " << residual[i] 
	     << " bnorm = " << bnorm[i] 
	     << std::endl ; 
      
      }
    
      std::cout << std::endl << " max_resid = " << max_resid ; 
      std::cout << " max_error = " << max_error << std::endl ; 
      std::cout << " Get_residual() again = " << SparseDirectTimingVars::SS_Result.Get_Residual() << std::endl ;

    }
  }
  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;
  delete map_;
  
  Comm.Barrier();

return 0 ;
}
Example #3
void sgemm( int m_a, int n_a, float *A, float *B, float *C ) {

  int mpad = (m_a%STEPM ? (m_a/STEPM+1)*STEPM : m_a);
  int npad = (n_a%STEPN ? (n_a/STEPN+1)*STEPN : n_a);
  int mbackup = m_a;
  // padding and transpose happen all together
  float* Apad=malloc(mpad*npad*sizeof(float));
  transposeA(n_a, m_a, npad, mpad, Apad, A);
  A=Apad;
  float* Bpad=malloc(npad*mpad*sizeof(float));
  transposeB(n_a, m_a, npad, mpad, Bpad, B);
  B=Bpad;            
 
  float* Cpad = malloc(mpad*mpad*sizeof(float));
  float* backup = C;
  C = Cpad;
  // We do not need to free the original A/B/C, so we simply point A/B/C
  // at the padded copies and let the code below work on them.
  m_a = mpad;
  n_a = npad;
  // Calibrate the block size to match the loop stride.
  int Blocki=STEPM;

  int IB=m_a/Blocki;
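  // The i loop below is expressed as IB blocks of Blocki iterations each;
  // ii = i + l*Blocki recovers the absolute row index.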
#pragma omp parallel 
  {
    __m128 a1, a2, a3, a4, c0;
    __m128 c11, c12, c13, c14, c15, c16, c17, c18;
    __m128 c21, c22, c23, c24, c25, c26, c27, c28;
    __m128 c31, c32, c33, c34, c35, c36, c37, c38;
    __m128 c41, c42, c43, c44, c45, c46, c47, c48;

    __m128 b1, b2, b3, b4, b5, b6, b7, b8;
    // Index variables, adjusted for blocking in the loops below.
    int jj=0;

    int jjna=0;
    int jjma=0;
    int iina=0;
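
    /* As in the 4x4 kernel above, the outer j loop (stride 8 here) is
       divided among the OpenMP threads. */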

#pragma omp for 
    for( int j = 0; j < m_a; j+=8 ) {
      jj=j;
      jjma=jj*m_a;
      jjna=jj*n_a;

       int ii=0;

  for (int l=0;l<IB;l++){
      for( int i = 0; i < Blocki; i+=4 ) {
	ii=i+l*Blocki; // adjusted for blocking

	iina=ii*n_a;   // precomputed for the products in inner loop

	c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 =
	c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 =
	c31 = c32 = c33 = c34 = c35 = c36 = c37 = c38 =
	c41 = c42 = c43 = c44 = c45 = c46 = c47 = c48 = _mm_setzero_ps();
 
	for( int k = 0; k < n_a; k+=4 ) {
	  float* tempA=A+k+iina;
	  float* tempB=B+k+jjna;
	  // Load 8 rows of B per iteration; this governs the stride of index j.
	  b1 = _mm_loadu_ps(tempB);
	  b2 = _mm_loadu_ps(tempB+n_a);
	  b3 = _mm_loadu_ps(tempB+2*n_a);
	  b4 = _mm_loadu_ps(tempB+3*n_a);
	  b5 = _mm_loadu_ps(tempB+4*n_a);
	  b6 = _mm_loadu_ps(tempB+5*n_a);
	  b7 = _mm_loadu_ps(tempB+6*n_a);
	  b8 = _mm_loadu_ps(tempB+7*n_a);
	  
	  /////////////////////////////////////////
	  // Load 4 rows of A; this governs the stride of index i.
	  a1 = _mm_loadu_ps(tempA);
	  a2 = _mm_loadu_ps(tempA+n_a);
	  a3 = _mm_loadu_ps(tempA+n_a*2);
	  a4 = _mm_loadu_ps(tempA+n_a*3);
	  // Accumulate the products of each a row with each b row.
	  c11=_mm_add_ps(c11,  _mm_mul_ps(a1, b1));
	  c21 = _mm_add_ps(c21, _mm_mul_ps(a2, b1));
    c31=_mm_add_ps(c31,  _mm_mul_ps(a3, b1));
	  c41 = _mm_add_ps(c41, _mm_mul_ps(a4, b1));
    
	  c12 = _mm_add_ps(c12, _mm_mul_ps(a1, b2));
	  c22 = _mm_add_ps(c22, _mm_mul_ps(a2, b2));
    c32 = _mm_add_ps(c32, _mm_mul_ps(a3, b2));
	  c42 = _mm_add_ps(c42, _mm_mul_ps(a4, b2));
   
	  c13= _mm_add_ps(c13,  _mm_mul_ps(a1, b3));
	  c23 = _mm_add_ps(c23, _mm_mul_ps(a2, b3));
    c33= _mm_add_ps(c33,  _mm_mul_ps(a3, b3));
	  c43 = _mm_add_ps(c43, _mm_mul_ps(a4, b3));

	
	  c14 = _mm_add_ps(c14, _mm_mul_ps(a1, b4));
	  c24 = _mm_add_ps(c24, _mm_mul_ps(a2, b4));
	  c34 = _mm_add_ps(c34, _mm_mul_ps(a3, b4));
	  c44 = _mm_add_ps(c44, _mm_mul_ps(a4, b4));
    /////////////////////////////////////////
    c15 = _mm_add_ps(c15, _mm_mul_ps(a1, b5));
	  c25 = _mm_add_ps(c25, _mm_mul_ps(a2, b5));
	  c35 = _mm_add_ps(c35, _mm_mul_ps(a3, b5));
	  c45 = _mm_add_ps(c45, _mm_mul_ps(a4, b5));
 
    c16 = _mm_add_ps(c16, _mm_mul_ps(a1, b6));
	  c26 = _mm_add_ps(c26, _mm_mul_ps(a2, b6));
	  c36 = _mm_add_ps(c36, _mm_mul_ps(a3, b6));
	  c46 = _mm_add_ps(c46, _mm_mul_ps(a4, b6));

 
    /////////////////////////////////////////
    c17 = _mm_add_ps(c17, _mm_mul_ps(a1, b7));
	  c27 = _mm_add_ps(c27, _mm_mul_ps(a2, b7));
	  c37 = _mm_add_ps(c37, _mm_mul_ps(a3, b7));
	  c47 = _mm_add_ps(c47, _mm_mul_ps(a4, b7));
 
    c18 = _mm_add_ps(c18, _mm_mul_ps(a1, b8));
	  c28 = _mm_add_ps(c28, _mm_mul_ps(a2, b8));
	  c38 = _mm_add_ps(c38, _mm_mul_ps(a3, b8));
	  c48 = _mm_add_ps(c48, _mm_mul_ps(a4, b8));   
     
    }
	// Horizontally sum the 4 floats in each c register and store the
	// scalar into one entry of C.
	c0=  _mm_hadd_ps(c11,c11);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c12,c12);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c13,c13);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c14,c14);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*3] = _mm_cvtss_f32(c0);

  c0=  _mm_hadd_ps(c15,c15);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*4] = _mm_cvtss_f32(c0);
 
 	c0=  _mm_hadd_ps(c16,c16);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*5] = _mm_cvtss_f32(c0);
 
  	c0=  _mm_hadd_ps(c17,c17);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*6] = _mm_cvtss_f32(c0);
  
   	c0=  _mm_hadd_ps(c18,c18);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+m_a*7] = _mm_cvtss_f32(c0);
  

  //
	c0=  _mm_hadd_ps(c21,c21);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c22,c22);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c23,c23);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c24,c24);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*3] = _mm_cvtss_f32(c0);
  
	c0=  _mm_hadd_ps(c25,c25);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*4] = _mm_cvtss_f32(c0);
 
	c0=  _mm_hadd_ps(c26,c26);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*5] = _mm_cvtss_f32(c0);
 
  
	c0=  _mm_hadd_ps(c27,c27);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*6] = _mm_cvtss_f32(c0);
                        
   
	c0=  _mm_hadd_ps(c28,c28);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+1+m_a*7] = _mm_cvtss_f32(c0);
                        
   
  //

	c0=  _mm_hadd_ps(c31,c31);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c32,c32);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c33,c33);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c34,c34);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*3] = _mm_cvtss_f32(c0);
  
   c0=  _mm_hadd_ps(c35,c35);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*4] = _mm_cvtss_f32(c0);
  
    c0=  _mm_hadd_ps(c36,c36);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*5] = _mm_cvtss_f32(c0);
  
   
   c0=  _mm_hadd_ps(c37,c37);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*6] = _mm_cvtss_f32(c0);
  
    c0=  _mm_hadd_ps(c38,c38);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+2+m_a*7] = _mm_cvtss_f32(c0);
                   
  
  //
	c0=  _mm_hadd_ps(c41,c41);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c42,c42);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c43,c43);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*2] = _mm_cvtss_f32(c0);

	c0=  _mm_hadd_ps(c44,c44);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*3] = _mm_cvtss_f32(c0);
  
   	c0=  _mm_hadd_ps(c45,c45);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*4] = _mm_cvtss_f32(c0);
	
   	c0=  _mm_hadd_ps(c46,c46);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*5] = _mm_cvtss_f32(c0);
	
  
   	c0=  _mm_hadd_ps(c47,c47);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*6] = _mm_cvtss_f32(c0);
	
   	c0=  _mm_hadd_ps(c48,c48);
	c0=  _mm_hadd_ps(c0,c0); 
	C[ii+jjma+3+m_a*7] = _mm_cvtss_f32(c0);
	                          
   
      }
    }
    } 
    }
  // Copy the result from the padded buffer back into the original matrix.
  move(mbackup, mpad, C, backup);
  // A, B and C now point at the padded copies, so free them here.
  free(A);
  free(B);
  free(C);
}