void sgemm( int m_a, int n_a, float *A, float *B, float *C ) { int mpad = m_a%STEPM ? (m_a/STEPM+1)*STEPM:m_a; int npad = n_a%STEPN ? (n_a/STEPN+1)*STEPN:n_a; int mbackup = m_a; float* Apad=malloc(mpad*npad*sizeof(float)); transposeA(n_a, m_a, npad, mpad, Apad, A); A=Apad; float* Bpad=malloc(npad*mpad*sizeof(float)); transposeB(n_a, m_a, npad, mpad, Bpad, B); B=Bpad; float* Cpad = malloc(mpad*mpad*sizeof(float)); float* backup = C; C = Cpad; m_a = mpad; n_a = npad; #pragma omp parallel { // __m128 c0,a1, c1, a2, c2, a3, c3, a4, c4; __m128 a1, a2, a3, a4, c0; __m128 c11, c12, c13, c14; __m128 c21, c22, c23, c24; __m128 c31, c32, c33, c34; __m128 c41, c42, c43, c44; __m128 b1, b2, b3, b4; float temp0,temp1,temp2,temp3,temp4, temp5, temp6, temp7, temp8; int ii=0; int jj=0; int kk=0; int kkma=0; int jjna=0; int jjma=0; int iina=0; #pragma omp for for( int j = 0; j < m_a; j+=4 ) { jj=j; jjma=jj*m_a; jjna=jj*n_a; for( int i = 0; i < m_a; i+=4 ) { ii=i; iina=ii*n_a; c31=c32=c33=c34=c41=c42=c43=c44=c11=c12=c13=c14=c21=c22=c23=c24 = _mm_setzero_ps(); for( int k = 0; k < n_a; k+=4 ) { float* tempA=A+k+iina; float* tempB=B+k+jjna; b1 = _mm_loadu_ps(tempB); b2 = _mm_loadu_ps(tempB+n_a); b3 = _mm_loadu_ps(tempB+2*n_a); b4 = _mm_loadu_ps(tempB+3*n_a); ///////////////////////////////////////// a1 = _mm_loadu_ps(tempA); a2 = _mm_loadu_ps(tempA+n_a); a3 = _mm_loadu_ps(tempA+n_a*2); a4 = _mm_loadu_ps(tempA+n_a*3); c11=_mm_add_ps(c11, _mm_mul_ps(a1, b1)); c21 = _mm_add_ps(c21, _mm_mul_ps(a2, b1)); c12 = _mm_add_ps(c12, _mm_mul_ps(a1, b2)); c22 = _mm_add_ps(c22, _mm_mul_ps(a2, b2)); c13= _mm_add_ps(c13, _mm_mul_ps(a1, b3)); c23 = _mm_add_ps(c23, _mm_mul_ps(a2, b3)); c14 = _mm_add_ps(c14, _mm_mul_ps(a1, b4)); c24 = _mm_add_ps(c24, _mm_mul_ps(a2, b4)); c31=_mm_add_ps(c31, _mm_mul_ps(a3, b1)); c41 = _mm_add_ps(c41, _mm_mul_ps(a4, b1)); c32 = _mm_add_ps(c32, _mm_mul_ps(a3, b2)); c42 = _mm_add_ps(c42, _mm_mul_ps(a4, b2)); c33= _mm_add_ps(c33, _mm_mul_ps(a3, b3)); c43 = _mm_add_ps(c43, 
_mm_mul_ps(a4, b3)); c34 = _mm_add_ps(c34, _mm_mul_ps(a3, b4)); c44 = _mm_add_ps(c44, _mm_mul_ps(a4, b4)); } c0= _mm_hadd_ps(c11,c11); c0= _mm_hadd_ps(c0,c0); C[ii+jjma] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c12,c12); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c13,c13); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c14,c14); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c21,c21); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c22,c22); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c23,c23); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c24,c24); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c31,c31); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c32,c32); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c33,c33); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c34,c34); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c41,c41); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c42,c42); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c43,c43); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c44,c44); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*3] = _mm_cvtss_f32(c0); /* c0= _mm_hadd_ps(c11,c11); c0= _mm_hadd_ps(c0,c0); C[ii+jjma] = _mm_cvtss_f32(c0); c11= _mm_hadd_ps(c12,c12); c11= _mm_hadd_ps(c11,c11); C[ii+jjma+m_a]= _mm_cvtss_f32(c11); c12= _mm_hadd_ps(c13,c13); c12= _mm_hadd_ps(c12,c12); C[ii+jjma+m_a*2]=_mm_cvtss_f32(c12); c13= _mm_hadd_ps(c14,c14); c13= _mm_hadd_ps(c13,c13); C[ii+jjma+m_a*3]= _mm_cvtss_f32(c13); c14= _mm_hadd_ps(c21,c21); c14= _mm_hadd_ps(c14,c14); C[ii+jjma+1] = _mm_cvtss_f32(c14); c21= _mm_hadd_ps(c22,c22); c21= 
_mm_hadd_ps(c21,c21); C[ii+jjma+m_a+1] = _mm_cvtss_f32(c21); c22= _mm_hadd_ps(c23,c23); c22= _mm_hadd_ps(c22,c22); C[ii+jjma+2*m_a+1]=_mm_cvtss_f32(c22); c23= _mm_hadd_ps(c24,c24); c23= _mm_hadd_ps(c23,c23); C[ii+jjma+3*m_a+1]= _mm_cvtss_f32(c23); */ } } } move(mbackup, mpad, C, backup); free(A); free(B); free(C); }
//
//  Amesos_TestMultiSolver.cpp reads in a matrix in Harwell-Boeing format,
//  calls one of the sparse direct solvers, using blocked right hand sides
//  and computes the error and residual.
//
//  TestSolver ignores the Harwell-Boeing right hand sides, creating
//  random right hand sides instead.
//
//  Amesos_TestMultiSolver can test either A x = b or A^T x = b.
//  This can be a bit confusing because sparse direct solvers
//  use compressed column storage - the transpose of Trilinos'
//  sparse row storage.
//
//  Matrices:
//    readA - Serial.  As read from the file.
//    transposeA - Serial.  The transpose of readA.
//    serialA - if (transpose) then transposeA else readA
//    distributedA - readA distributed to all processes
//    passA - if ( distributed ) then distributedA else serialA
//
// Runs one sparse direct solver (selected by SparseSolver, subject to the
// compile-time HAVE_AMESOS_* / TEST_* guards) `special+1` times on blocked
// right hand sides, recording timing, error and residual statistics in
// SparseDirectTimingVars::SS_Result.  Returns 0 on success; the
// EPETRA_CHK_ERR macros may return early with an error code.
int Amesos_TestMultiSolver( Epetra_Comm &Comm, char *matrix_file, int numsolves,
                            SparseSolverType SparseSolver, bool transpose,
                            int special, AMESOS_MatrixType matrix_type )
{
  int iam = Comm.MyPID() ;
  Comm.Barrier();

  // Matrix and vectors exactly as read from the file.  These are allocated
  // by the Trilinos_Util_Read* helpers below and deleted at the bottom of
  // this routine.
  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA;
  Epetra_Vector * readx;
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;

  // Pick the reader from the file-name suffix (.triU / .triS / .mtx / HB).
  std::string FileName = matrix_file ;
  int FN_Size = FileName.size() ;
  std::string LastFiveBytes = FileName.substr( EPETRA_MAX(0,FN_Size-5), FN_Size );
  std::string LastFourBytes = FileName.substr( EPETRA_MAX(0,FN_Size-4), FN_Size );
  bool NonContiguousMap = false;

  if ( LastFiveBytes == ".triU" ) {
    NonContiguousMap = true;
    // Call routine to read in unsymmetric Triplet matrix
    EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, false, Comm, readMap, readA, readx, readb, readxexact, NonContiguousMap ) );
  } else {
    if ( LastFiveBytes == ".triS" ) {
      NonContiguousMap = true;
      // Call routine to read in symmetric Triplet matrix
      EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, true, Comm, readMap, readA, readx, readb, readxexact, NonContiguousMap ) );
    } else {
      if ( LastFourBytes == ".mtx" ) {
        EPETRA_CHK_ERR( Trilinos_Util_ReadMatrixMarket2Epetra( matrix_file, Comm, readMap, readA, readx, readb, readxexact) );
      } else {
        // Call routine to read in HB problem
        Trilinos_Util_ReadHb2Epetra( matrix_file, Comm, readMap, readA, readx, readb, readxexact) ;
      }
    }
  }

  // serialA points at either readA or its explicit transpose, depending on
  // whether we are testing A x = b or A^T x = b.
  Epetra_CrsMatrix transposeA(Copy, *readMap, 0);
  Epetra_CrsMatrix *serialA ;

  if ( transpose ) {
    assert( CrsMatrixTranspose( readA, &transposeA ) == 0 );
    serialA = &transposeA ;
  } else {
    serialA = readA ;
  }

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);
  Epetra_Map* map_;

  if( NonContiguousMap ) {
    //
    //  map gives us NumMyElements and MyFirstElement;
    //  rebuild a distributed map that keeps the (non-contiguous) global
    //  IDs of readMap, broadcast from process 0.
    //
    int NumGlobalElements =  readMap->NumGlobalElements();
    int NumMyElements = map.NumMyElements();
    int MyFirstElement = map.MinMyGID();
    std::vector<int> MapMap_( NumGlobalElements );
    readMap->MyGlobalElements( &MapMap_[0] ) ;
    Comm.Broadcast( &MapMap_[0], NumGlobalElements, 0 ) ;
    map_ = new Epetra_Map( NumGlobalElements, NumMyElements, &MapMap_[MyFirstElement], 0, Comm);
  } else {
    map_ = new Epetra_Map( map ) ;
  }

  // Create Exporter to distribute read-in matrix and vectors
  Epetra_Export exporter(*readMap, *map_);
  Epetra_CrsMatrix A(Copy, *map_, 0);

  // pass* point at either the distributed or the serial objects below,
  // chosen once matrix_type is inspected.
  Epetra_RowMatrix * passA = 0;
  Epetra_MultiVector * passx = 0;
  Epetra_MultiVector * passb = 0;
  Epetra_MultiVector * passxexact = 0;
  Epetra_MultiVector * passresid = 0;
  Epetra_MultiVector * passtmp = 0;

  // Distributed multivectors (one column per right hand side).
  Epetra_MultiVector x(*map_,numsolves);
  Epetra_MultiVector b(*map_,numsolves);
  Epetra_MultiVector xexact(*map_,numsolves);
  Epetra_MultiVector resid(*map_,numsolves);
  Epetra_MultiVector tmp(*map_,numsolves);

  // Serial counterparts, laid out on the map read from the file.
  Epetra_MultiVector serialx(*readMap,numsolves);
  Epetra_MultiVector serialb(*readMap,numsolves);
  Epetra_MultiVector serialxexact(*readMap,numsolves);
  Epetra_MultiVector serialresid(*readMap,numsolves);
  Epetra_MultiVector serialtmp(*readMap,numsolves);

  bool distribute_matrix = ( matrix_type == AMESOS_Distributed ) ;
  if ( distribute_matrix ) {
    //
    //  Initialize x, b and xexact to the values read in from the file
    //
    A.Export(*serialA, exporter, Add);
    Comm.Barrier();
    assert(A.FillComplete()==0);
    Comm.Barrier();

    passA = &A;
    passx = &x;
    passb = &b;
    passxexact = &xexact;
    passresid = &resid;
    passtmp = &tmp;
  } else {
    passA = serialA;
    passx = &serialx;
    passb = &serialb;
    passxexact = &serialxexact;
    passresid = &serialresid;
    passtmp = &serialtmp;
  }

  // Random exact solution and initial guess (fixed seeds for
  // reproducibility); b is then manufactured as A * xexact.
  passxexact->SetSeed(131) ;
  passxexact->Random();
  passx->SetSeed(11231) ;
  passx->Random();

  passb->PutScalar( 0.0 );
  passA->Multiply( transpose, *passxexact, *passb ) ;

  // NOTE(review): CopyB is only referenced by commented-out code below.
  Epetra_MultiVector CopyB( *passb ) ;

  double Anorm = passA->NormInf() ;
  SparseDirectTimingVars::SS_Result.Set_Anorm(Anorm) ;

  Epetra_LinearProblem Problem(  (Epetra_RowMatrix *) passA,
                                 (Epetra_MultiVector *) passx,
                                 (Epetra_MultiVector *) passb );

  double max_resid = 0.0;
  for ( int j = 0 ; j < special+1 ; j++ ) {
    Epetra_Time TotalTime( Comm ) ;
    // The `if ( false )` head lets every solver below be a uniform
    // `} else if` branch, including ones compiled out by #ifdef.
    if ( false ) {
#ifdef TEST_UMFPACK
      unused code
    } else if ( SparseSolver == UMFPACK ) {
      UmfpackOO umfpack( (Epetra_RowMatrix *) passA,
                         (Epetra_MultiVector *) passx,
                         (Epetra_MultiVector *) passb ) ;
      umfpack.SetTrans( transpose ) ;
      umfpack.Solve() ;
#endif
#ifdef TEST_SUPERLU
    } else if ( SparseSolver == SuperLU ) {
      SuperluserialOO superluserial( (Epetra_RowMatrix *) passA,
                                     (Epetra_MultiVector *) passx,
                                     (Epetra_MultiVector *) passb ) ;
      superluserial.SetPermc( SuperLU_permc ) ;
      superluserial.SetTrans( transpose ) ;
      superluserial.SetUseDGSSV( special == 0 ) ;
      superluserial.Solve() ;
#endif
#ifdef HAVE_AMESOS_SLUD
    } else if ( SparseSolver == SuperLUdist ) {
      SuperludistOO superludist( Problem ) ;
      superludist.SetTrans( transpose ) ;
      EPETRA_CHK_ERR( superludist.Solve( true ) ) ;
#endif
#ifdef HAVE_AMESOS_SLUD2
    } else if ( SparseSolver == SuperLUdist2 ) {
      Superludist2_OO superludist2( Problem ) ;
      superludist2.SetTrans( transpose ) ;
      EPETRA_CHK_ERR( superludist2.Solve( true ) ) ;
#endif
#ifdef TEST_SPOOLES
    } else if ( SparseSolver == SPOOLES ) {
      SpoolesOO spooles( (Epetra_RowMatrix *) passA,
                         (Epetra_MultiVector *) passx,
                         (Epetra_MultiVector *) passb ) ;
      spooles.SetTrans( transpose ) ;
      spooles.Solve() ;
#endif
#ifdef HAVE_AMESOS_DSCPACK
    } else if ( SparseSolver == DSCPACK ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Dscpack dscpack( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( dscpack.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( dscpack.Solve( ) );
#endif
#ifdef HAVE_AMESOS_UMFPACK
    } else if ( SparseSolver == UMFPACK ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Umfpack umfpack( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( umfpack.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( umfpack.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( umfpack.Solve( ) );
#endif
#ifdef HAVE_AMESOS_KLU
    } else if ( SparseSolver == KLU ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Klu klu( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( klu.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( klu.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( klu.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( klu.NumericFactorization( ) );
      EPETRA_CHK_ERR( klu.Solve( ) );
#endif
#ifdef HAVE_AMESOS_PARAKLETE
    } else if ( SparseSolver == PARAKLETE ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Paraklete paraklete( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( paraklete.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( paraklete.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( paraklete.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( paraklete.NumericFactorization( ) );
      EPETRA_CHK_ERR( paraklete.Solve( ) );
#endif
#ifdef HAVE_AMESOS_SLUS
    } else if ( SparseSolver == SuperLU ) {
      Epetra_SLU superluserial( &Problem ) ;
      EPETRA_CHK_ERR( superluserial.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( superluserial.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( superluserial.NumericFactorization( ) );
      EPETRA_CHK_ERR( superluserial.Solve( ) );
#endif
#ifdef HAVE_AMESOS_LAPACK
    } else if ( SparseSolver == LAPACK ) {
      // NOTE(review): unlike the other Amesos branches, ParamList is built
      // but never passed via SetParameters here — confirm intentional.
      Teuchos::ParameterList ParamList ;
      ParamList.set( "MaxProcs", -3 );
      Amesos_Lapack lapack( Problem ) ;
      EPETRA_CHK_ERR( lapack.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( lapack.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( lapack.NumericFactorization( ) );
      EPETRA_CHK_ERR( lapack.Solve( ) );
#endif
#ifdef HAVE_AMESOS_TAUCS
    } else if ( SparseSolver == TAUCS ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Taucs taucs( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( taucs.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( taucs.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( taucs.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( taucs.NumericFactorization( ) );
      EPETRA_CHK_ERR( taucs.Solve( ) );
#endif
#ifdef HAVE_AMESOS_PARDISO
    } else if ( SparseSolver == PARDISO ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Pardiso pardiso( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( pardiso.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( pardiso.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( pardiso.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( pardiso.NumericFactorization( ) );
      EPETRA_CHK_ERR( pardiso.Solve( ) );
#endif
#ifdef HAVE_AMESOS_PARKLETE
    } else if ( SparseSolver == PARKLETE ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Parklete parklete( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( parklete.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( parklete.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( parklete.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( parklete.NumericFactorization( ) );
      EPETRA_CHK_ERR( parklete.Solve( ) );
#endif
#ifdef HAVE_AMESOS_MUMPS
    } else if ( SparseSolver == MUMPS ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Mumps mumps( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( mumps.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( mumps.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( mumps.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( mumps.NumericFactorization( ) );
      EPETRA_CHK_ERR( mumps.Solve( ) );
#endif
#ifdef HAVE_AMESOS_SCALAPACK
    } else if ( SparseSolver == SCALAPACK ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Scalapack scalapack( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( scalapack.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( scalapack.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( scalapack.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( scalapack.NumericFactorization( ) );
      EPETRA_CHK_ERR( scalapack.Solve( ) );
#endif
#ifdef HAVE_AMESOS_SUPERLUDIST
    } else if ( SparseSolver == SUPERLUDIST ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Superludist superludist( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( superludist.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( superludist.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( superludist.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( superludist.NumericFactorization( ) );
      EPETRA_CHK_ERR( superludist.Solve( ) );
#endif
#ifdef HAVE_AMESOS_SUPERLU
    } else if ( SparseSolver == SUPERLU ) {
      Teuchos::ParameterList ParamList ;
      Amesos_Superlu superlu( Problem ) ;
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( superlu.SetParameters( ParamList ) );
      EPETRA_CHK_ERR( superlu.SetUseTranspose( transpose ) );
      EPETRA_CHK_ERR( superlu.SymbolicFactorization( ) );
      EPETRA_CHK_ERR( superlu.NumericFactorization( ) );
      EPETRA_CHK_ERR( superlu.Solve( ) );
#endif
#ifdef TEST_SPOOLESSERIAL
    } else if ( SparseSolver == SPOOLESSERIAL ) {
      SpoolesserialOO spoolesserial( (Epetra_RowMatrix *) passA,
                                     (Epetra_MultiVector *) passx,
                                     (Epetra_MultiVector *) passb ) ;
      spoolesserial.Solve() ;
#endif
    } else {
      SparseDirectTimingVars::log_file << "Solver not implemented yet" << std::endl ;
      std::cerr << "\n\n#################### Requested solver not available (Or not tested with blocked RHS) on this platform #####################\n" << std::endl ;
    }

    SparseDirectTimingVars::SS_Result.Set_Total_Time( TotalTime.ElapsedTime() );

    //
    //  Compute the error = norm(xcomp - xexact )
    //
    std::vector <double> error(numsolves) ;
    double max_error = 0.0;

    passresid->Update(1.0, *passx, -1.0, *passxexact, 0.0);

    passresid->Norm2(&error[0]);
    for ( int i = 0 ; i< numsolves; i++ )
      if ( error[i] > max_error ) max_error = error[i] ;
    SparseDirectTimingVars::SS_Result.Set_Error(max_error) ;

    //
    //  Compute the residual = norm(Ax - b)
    //
    std::vector <double> residual(numsolves) ;

    passtmp->PutScalar(0.0);
    passA->Multiply( transpose, *passx, *passtmp);
    passresid->Update(1.0, *passtmp, -1.0, *passb, 0.0);
    //    passresid->Update(1.0, *passtmp, -1.0, CopyB, 0.0);
    passresid->Norm2(&residual[0]);

    for ( int i = 0 ; i< numsolves; i++ )
      if ( residual[i] > max_resid ) max_resid = residual[i] ;
    SparseDirectTimingVars::SS_Result.Set_Residual(max_resid) ;

    // Only the first column's norms are recorded in SS_Result.
    std::vector <double> bnorm(numsolves);
    passb->Norm2( &bnorm[0] ) ;
    SparseDirectTimingVars::SS_Result.Set_Bnorm(bnorm[0]) ;

    std::vector <double> xnorm(numsolves);
    passx->Norm2( &xnorm[0] ) ;
    SparseDirectTimingVars::SS_Result.Set_Xnorm(xnorm[0]) ;

    // Debug dump, disabled by the `false &&` guard.
    if ( false && iam == 0 ) {
      std::cout << " Amesos_TestMutliSolver.cpp " << std::endl ;
      for ( int i = 0 ; i< numsolves && i < 10 ; i++ ) {
        std::cout << "i=" << i
                  << " error = " << error[i]
                  << " xnorm = " << xnorm[i]
                  << " residual = " << residual[i]
                  << " bnorm = " << bnorm[i] << std::endl ;
      }
      std::cout << std::endl << " max_resid = " << max_resid ;
      std::cout << " max_error = " << max_error << std::endl ;
      std::cout << " Get_residual() again = " << SparseDirectTimingVars::SS_Result.Get_Residual() << std::endl ;
    }
  }

  // Release everything allocated by the readers and the map rebuild.
  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;
  delete map_;

  Comm.Barrier();
  return 0 ;
}
void sgemm( int m_a, int n_a, float *A, float *B, float *C ) { int mpad = (m_a%STEPM? (m_a/STEPM+1)*STEPM:m_a); int npad =(n_a%STEPN ? (n_a/STEPN+1)*STEPN:n_a); int mbackup = m_a; // padding and transpose happen all together float* Apad=malloc(mpad*npad*sizeof(float)); transposeA(n_a, m_a, npad, mpad, Apad, A); A=Apad; float* Bpad=malloc(npad*mpad*sizeof(float)); transposeB(n_a, m_a, npad, mpad, Bpad, B); B=Bpad; float* Cpad = malloc(mpad*mpad*sizeof(float)); float* backup = C; C = Cpad; // we are not worried about freeing original A\B\C so we just // overide them with padded ones so that the below code use A\B\C. m_a = mpad; n_a = npad; // calibrate the block size to match the loop strike size int Blocki=STEPM; int Blockj=STEPM; int IB=m_a/Blocki; int JB=m_a/Blockj; #pragma omp parallel { __m128 a1, a2, a3, a4,a5,a6,a7,a8, c0; __m128 c11, c12, c13, c14,c15,c16,c17,c18; __m128 c21, c22, c23, c24,c25,c26,c27,c28; __m128 c31, c32, c33, c34,c35,c36,c37,c38; __m128 c41, c42, c43, c44,c45,c46,c47,c48; __m128 b1, b2, b3, b4,b5,b6,b7,b8; float temp0,temp1,temp2,temp3,temp4, temp5, temp6, temp7, temp8; // Below is just variables adjusted by blocking in for loops int jj=0; int kk=0; int kkma=0; int jjna=0; int jjma=0; int iina=0; #pragma omp for for( int j = 0; j < m_a; j+=8 ) { jj=j; jjma=jj*m_a; jjna=jj*n_a; int ii=0; for (int l=0;l<IB;l++){ for( int i = 0; i < Blocki; i+=4 ) { ii=i+l*Blocki; // adjusted for blocking iina=ii*n_a; // precomputed for the products in inner loop c31=c32=c33=c34=c35=c36=c37=c38\ =c41=c42=c43=c44=c45=c46=c47=c48\ =c11=c12=c13=c14=c15=c16=c17=c18\ =c21=c22=c23=c24=c25=c26=c27=c28\ = _mm_setzero_ps(); for( int k = 0; k < n_a; k+=4 ) { float* tempA=A+k+iina; float* tempB=B+k+jjna; // use 8 B's in one iteration and this affect index j b1 = _mm_loadu_ps(tempB); b2 = _mm_loadu_ps(tempB+n_a); b3 = _mm_loadu_ps(tempB+2*n_a); b4 = _mm_loadu_ps(tempB+3*n_a); b5 = _mm_loadu_ps(tempB+4*n_a); b6 = _mm_loadu_ps(tempB+5*n_a); b7 = _mm_loadu_ps(tempB+6*n_a); 
b8 = _mm_loadu_ps(tempB+7*n_a); ///////////////////////////////////////// // use 4 a's. Affect index i a1 = _mm_loadu_ps(tempA); a2 = _mm_loadu_ps(tempA+n_a); a3 = _mm_loadu_ps(tempA+n_a*2); a4 = _mm_loadu_ps(tempA+n_a*3); // below are multiplications between a's and corresponding b's c11=_mm_add_ps(c11, _mm_mul_ps(a1, b1)); c21 = _mm_add_ps(c21, _mm_mul_ps(a2, b1)); c31=_mm_add_ps(c31, _mm_mul_ps(a3, b1)); c41 = _mm_add_ps(c41, _mm_mul_ps(a4, b1)); c12 = _mm_add_ps(c12, _mm_mul_ps(a1, b2)); c22 = _mm_add_ps(c22, _mm_mul_ps(a2, b2)); c32 = _mm_add_ps(c32, _mm_mul_ps(a3, b2)); c42 = _mm_add_ps(c42, _mm_mul_ps(a4, b2)); c13= _mm_add_ps(c13, _mm_mul_ps(a1, b3)); c23 = _mm_add_ps(c23, _mm_mul_ps(a2, b3)); c33= _mm_add_ps(c33, _mm_mul_ps(a3, b3)); c43 = _mm_add_ps(c43, _mm_mul_ps(a4, b3)); c14 = _mm_add_ps(c14, _mm_mul_ps(a1, b4)); c24 = _mm_add_ps(c24, _mm_mul_ps(a2, b4)); c34 = _mm_add_ps(c34, _mm_mul_ps(a3, b4)); c44 = _mm_add_ps(c44, _mm_mul_ps(a4, b4)); ///////////////////////////////////////// c15 = _mm_add_ps(c15, _mm_mul_ps(a1, b5)); c25 = _mm_add_ps(c25, _mm_mul_ps(a2, b5)); c35 = _mm_add_ps(c35, _mm_mul_ps(a3, b5)); c45 = _mm_add_ps(c45, _mm_mul_ps(a4, b5)); c16 = _mm_add_ps(c16, _mm_mul_ps(a1, b6)); c26 = _mm_add_ps(c26, _mm_mul_ps(a2, b6)); c36 = _mm_add_ps(c36, _mm_mul_ps(a3, b6)); c46 = _mm_add_ps(c46, _mm_mul_ps(a4, b6)); ///////////////////////////////////////// c17 = _mm_add_ps(c17, _mm_mul_ps(a1, b7)); c27 = _mm_add_ps(c27, _mm_mul_ps(a2, b7)); c37 = _mm_add_ps(c37, _mm_mul_ps(a3, b7)); c47 = _mm_add_ps(c47, _mm_mul_ps(a4, b7)); c18 = _mm_add_ps(c18, _mm_mul_ps(a1, b8)); c28 = _mm_add_ps(c28, _mm_mul_ps(a2, b8)); c38 = _mm_add_ps(c38, _mm_mul_ps(a3, b8)); c48 = _mm_add_ps(c48, _mm_mul_ps(a4, b8)); } // below sums 4 numbers in a c and extract it to save to one matrix entry c0= _mm_hadd_ps(c11,c11); c0= _mm_hadd_ps(c0,c0); C[ii+jjma] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c12,c12); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a] = _mm_cvtss_f32(c0); c0= 
_mm_hadd_ps(c13,c13); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c14,c14); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c15,c15); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*4] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c16,c16); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*5] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c17,c17); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*6] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c18,c18); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*7] = _mm_cvtss_f32(c0); // c0= _mm_hadd_ps(c21,c21); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c22,c22); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c23,c23); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c24,c24); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c25,c25); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*4] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c26,c26); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*5] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c27,c27); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*6] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c28,c28); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*7] = _mm_cvtss_f32(c0); // c0= _mm_hadd_ps(c31,c31); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c32,c32); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c33,c33); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c34,c34); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c35,c35); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*4] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c36,c36); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*5] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c37,c37); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*6] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c38,c38); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*7] = _mm_cvtss_f32(c0); // c0= _mm_hadd_ps(c41,c41); c0= 
_mm_hadd_ps(c0,c0); C[ii+jjma+3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c42,c42); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c43,c43); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*2] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c44,c44); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*3] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c45,c45); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*4] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c46,c46); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*5] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c47,c47); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*6] = _mm_cvtss_f32(c0); c0= _mm_hadd_ps(c48,c48); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*7] = _mm_cvtss_f32(c0); } } } } // copy the result to the original matrix move(mbackup, mpad, C, backup); // these are padded matrix already free(A); free(B); free(C); }