clsparseStatus
cg(cldenseVectorPrivate *pX,
   const clsparseCsrMatrixPrivate* pA,
   const cldenseVectorPrivate *pB,
   PTYPE& M,
   clSParseSolverControl solverControl,
   clsparseControl control)
{

    assert( pA->num_cols == pB->num_values );
    assert( pA->num_rows == pX->num_values );
    if( ( pA->num_cols != pB->num_values ) || ( pA->num_rows != pX->num_values ) )
    {
        return clsparseInvalidSystemSize;
    }

    //opaque input parameters with clsparse::array type;
    clsparse::vector<T> x(control, pX->values, pX->num_values);
    clsparse::vector<T> b(control, pB->values, pB->num_values);

    cl_int status;

    T scalarOne = 1;
    T scalarZero = 0;

    //clsparse::vector<T> norm_b(control, 1, 0, CL_MEM_WRITE_ONLY, true);
    clsparse::scalar<T> norm_b(control, 0, CL_MEM_WRITE_ONLY, false);

    //norm of rhs of equation
    status = Norm1<T>(norm_b, b, control);
    CLSPARSE_V(status, "Norm B Failed");

    //norm_b is calculated once
    T h_norm_b = norm_b[0];

#ifndef NDEBUG
    std::cout << "norm_b " << h_norm_b << std::endl;
#endif

    if (h_norm_b == 0) //special case b is zero so solution is x = 0
    {
        solverControl->nIters = 0;
        solverControl->absoluteTolerance = 0.0;
        solverControl->relativeTolerance = 0.0;
        //we can either fill the x with zeros or cpy b to x;
        x = b;
        return clsparseSuccess;
    }


    //continuing "normal" execution of cg algorithm
    const auto N = pA->num_cols;

    //helper containers, all need to be zeroed
    clsparse::vector<T> y(control, N, 0, CL_MEM_READ_WRITE, true);
    clsparse::vector<T> z(control, N, 0, CL_MEM_READ_WRITE, true);
    clsparse::vector<T> r(control, N, 0, CL_MEM_READ_WRITE, true);
    clsparse::vector<T> p(control, N, 0, CL_MEM_READ_WRITE, true);

    clsparse::scalar<T> one(control,  1, CL_MEM_READ_ONLY, true);
    clsparse::scalar<T> zero(control, 0, CL_MEM_READ_ONLY, true);

    // y = A*x
    status = csrmv<T>(one, pA, x, zero, y, control);
    CLSPARSE_V(status, "csrmv Failed");

    //r = b - y
    status = r.sub(b, y, control);
    //status = elementwise_transform<T, EW_MINUS>(r, b, y, control);
    CLSPARSE_V(status, "b - y Failed");

    clsparse::scalar<T> norm_r(control, 0, CL_MEM_WRITE_ONLY, false);
    status = Norm1<T>(norm_r, r, control);
    CLSPARSE_V(status, "norm r Failed");

    //T residuum = 0;
    clsparse::scalar<T> residuum(control, 0, CL_MEM_WRITE_ONLY, false);

    //residuum = norm_r[0] / h_norm_b;
    residuum.div(norm_r, norm_b, control);

    solverControl->initialResidual = residuum[0];
#ifndef NDEBUG
        std::cout << "initial residuum = "
                  << solverControl->initialResidual << std::endl;
#endif
    if (solverControl->finished(solverControl->initialResidual))
    {
        solverControl->nIters = 0;
        return clsparseSuccess;
    }
    //apply preconditioner z = M*r
    M(r, z, control);

    //copy inital z to p
    p = z;

    //rz = <r, z>, here actually should be conjugate(r)) but we do not support complex type.
    clsparse::scalar<T> rz(control, 0, CL_MEM_WRITE_ONLY, false);
    status = dot<T>(rz, r, z, control);
    CLSPARSE_V(status, "<r, z> Failed");

    int iteration = 0;

    bool converged = false;

    clsparse::scalar<T> alpha (control, 0, CL_MEM_READ_WRITE, false);
    clsparse::scalar<T> beta  (control, 0, CL_MEM_READ_WRITE, false);

    //yp buffer for inner product of y and p vectors;
    clsparse::scalar<T> yp(control, 0, CL_MEM_WRITE_ONLY, false);

    clsparse::scalar<T> rz_old(control, 0, CL_MEM_WRITE_ONLY, false);

    while(!converged)
    {
        solverControl->nIters = iteration;

        //y = A*p
        status = csrmv<T>(one, pA, p, zero, y, control);
        CLSPARSE_V(status, "csrmv Failed");


        status = dot<T>(yp, y, p, control);
        CLSPARSE_V(status, "<y,p> Failed");

        // alpha = <r,z> / <y,p>
        //alpha[0] = rz[0] / yp[0];
        alpha.div(rz, yp, control);

#ifndef NDEBUG
            std::cout << "alpha = " << alpha[0] << std::endl;
#endif

        //x = x + alpha*p
        status = axpy<T>(x, alpha, p, x, control);
        CLSPARSE_V(status, "x = x + alpha * p Failed");

        //r = r - alpha * y;
        status = axpy<T, EW_MINUS>(r, alpha, y, r, control);
        CLSPARSE_V(status, "r = r - alpha * y Failed");


        //apply preconditioner z = M*r
        M(r, z, control);

        //store old value of rz
        //improve that by move or swap
        rz_old = rz;

        //rz = <r,z>
        status = dot<T>(rz, r, z, control);
        CLSPARSE_V(status, "<r,z> Failed");

        // beta = <r^(i), r^(i)>/<r^(i-1),r^(i-1)> // i: iteration index;
        // beta is ratio of dot product in current iteration compared
        //beta[0] = rz[0] / rz_old[0];
        beta.div(rz, rz_old, control);
#ifndef NDEBUG
            std::cout << "beta = " << beta[0] << std::endl;
#endif

        //p = z + beta*p;
        status = axpby<T>(p, one, z, beta, p, control );
        CLSPARSE_V(status, "p = z + beta*p Failed");

        //calculate norm of r
        status = Norm1<T>(norm_r, r, control);
        CLSPARSE_V(status, "norm r Failed");

        //residuum = norm_r[0] / h_norm_b;
        status = residuum.div(norm_r, norm_b, control);
        CLSPARSE_V(status, "residuum");

        iteration++;
        converged = solverControl->finished(residuum[0]);

        solverControl->print();
    }
    return clsparseSuccess;
}
int main(int argc, char* argv[])
{
  //
  // Get a default output stream from the Teuchos::VerboseObjectBase
  //
  Teuchos::RCP<Teuchos::FancyOStream>
    out = Teuchos::VerboseObjectBase::getDefaultOStream();
  //
  // Set the parameters for the Belos LOWS Factory and create a parameter list.
  //
  int             blockSize              = 2;
  int             maxIterations          = 400;
  int             maxRestarts            = 25;
  int             gmresKrylovLength      = 25;
  int             outputFrequency        = 1;
  bool            outputMaxResOnly       = true;
  double          maxResid               = 1e-6;
  
  Teuchos::RCP<Teuchos::ParameterList>
    belosLOWSFPL = Teuchos::rcp( new Teuchos::ParameterList() );
  
  belosLOWSFPL->set("Solver Type","Block GMRES");
  
  Teuchos::ParameterList& belosLOWSFPL_solver = 
    belosLOWSFPL->sublist("Solver Types");

  Teuchos::ParameterList& belosLOWSFPL_gmres = 
    belosLOWSFPL_solver.sublist("Block GMRES");

  belosLOWSFPL_gmres.set("Maximum Iterations",int(maxIterations));
  belosLOWSFPL_gmres.set("Convergence Tolerance",double(maxResid));
  belosLOWSFPL_gmres.set("Maximum Restarts",int(maxRestarts));
  belosLOWSFPL_gmres.set("Block Size",int(blockSize));
  belosLOWSFPL_gmres.set("Num Blocks",int(gmresKrylovLength));
  belosLOWSFPL_gmres.set("Output Frequency",int(outputFrequency));
  belosLOWSFPL_gmres.set("Show Maximum Residual Norm Only",bool(outputMaxResOnly));

#ifdef HAVE_BELOS_IFPACK
  //
  // Set the parameters for the Ifpack Preconditioner Factory and create parameter list
  //
  Teuchos::ParameterList &ifpackPFSL = belosLOWSFPL->sublist("IfpackPreconditionerFactory");
  
  ifpackPFSL.set("Overlap",int(2));
  ifpackPFSL.set("Prec Type","ILUT");
#endif

  // Whether the linear solver succeeded.
  // (this will be set during the residual check at the end)
  bool success = true;

  // Number of random right-hand sides we will be solving for.
  int numRhs = 5;

  // Name of input matrix file
  std::string matrixFile = "orsirr1.hb";

  // Read in the matrix file (can be *.mtx, *.hb, etc.)
  Epetra_SerialComm comm;
  Teuchos::RCP<Epetra_CrsMatrix> epetra_A;
  EpetraExt::readEpetraLinearSystem( matrixFile, comm, &epetra_A );
  
  // Create a Thyra linear operator (A) using the Epetra_CrsMatrix (epetra_A).
  Teuchos::RCP<const Thyra::LinearOpBase<double> > 
    A = Thyra::epetraLinearOp(epetra_A);

  // Get the domain space for the Thyra linear operator 
  Teuchos::RCP<const Thyra::VectorSpaceBase<double> > domain = A->domain();

  // Create the Belos LOWS factory.
  Teuchos::RCP<Thyra::LinearOpWithSolveFactoryBase<double> >
    belosLOWSFactory = Teuchos::rcp(new Thyra::BelosLinearOpWithSolveFactory<double>());

#ifdef HAVE_BELOS_IFPACK

  // Set the preconditioner factory for the LOWS factory.
  belosLOWSFactory->setPreconditionerFactory(
					     Teuchos::rcp(new Thyra::IfpackPreconditionerFactory())
					     ,"IfpackPreconditionerFactory"
					     );
#endif

  // Set the parameter list to specify the behavior of the factory.
  belosLOWSFactory->setParameterList( belosLOWSFPL );

  // Set the output stream and the verbosity level (prints to std::cout by defualt)
  belosLOWSFactory->setVerbLevel(Teuchos::VERB_LOW);

  // Create a BelosLinearOpWithSolve object from the Belos LOWS factory.
  Teuchos::RCP<Thyra::LinearOpWithSolveBase<double> >
    nsA = belosLOWSFactory->createOp();

  // Initialize the BelosLinearOpWithSolve object with the Thyra linear operator.
  Thyra::initializeOp<double>( *belosLOWSFactory, A, &*nsA );

  // Create a right-hand side with numRhs vectors in it and randomize it.
  Teuchos::RCP< Thyra::MultiVectorBase<double> > 
    b = Thyra::createMembers(domain, numRhs);
  Thyra::seed_randomize<double>(0);
  Thyra::randomize(-1.0, 1.0, &*b);

  // Create an initial std::vector with numRhs vectors in it and initialize it to zero.
  Teuchos::RCP< Thyra::MultiVectorBase<double> >
    x = Thyra::createMembers(domain, numRhs);
  Thyra::assign(&*x, 0.0);

  // Perform solve using the linear operator to get the approximate solution of Ax=b,
  // where b is the right-hand side and x is the left-hand side.
  Thyra::SolveStatus<double> solveStatus;
  solveStatus = Thyra::solve( *nsA, Thyra::NONCONJ_ELE, *b, &*x );

  // Print out status of solve.
  *out << "\nBelos LOWS Status: "<< solveStatus << std::endl;

  //
  // Compute residual and double check convergence.
  //
  std::vector<double> norm_b(numRhs), norm_res(numRhs);
  Teuchos::RCP< Thyra::MultiVectorBase<double> >
    y = Thyra::createMembers(domain, numRhs);

  // Compute the column norms of the right-hand side b.
  Thyra::norms_2( *b, &norm_b[0] );

  // Compute y=A*x, where x is the solution from the linear solver.
  A->apply( Thyra::NONCONJ_ELE, *x, &*y );
  
  // Compute A*x-b = y-b
  Thyra::update( -1.0, *b, &*y );

  // Compute the column norms of A*x-b.
  Thyra::norms_2( *y, &norm_res[0] );

  // Print out the final relative residual norms.
  double rel_res = 0.0;
  *out << "Final relative residual norms" << std::endl;  
  for (int i=0; i<numRhs; ++i) {
    rel_res = norm_res[i]/norm_b[i];
    if (rel_res > maxResid)
      success = false;
    *out << "RHS " << i+1 << " : " 
         << std::setw(16) << std::right << rel_res << std::endl;
  }

  return ( success ? 0 : 1 );
}
int main(int argc, char* argv[])
{
  //
  // Get a default output stream from the Teuchos::VerboseObjectBase
  //
  Teuchos::RCP<Teuchos::FancyOStream>
    out = Teuchos::VerboseObjectBase::getDefaultOStream();
  
  Teuchos::GlobalMPISession mpiSession(&argc,&argv);

#ifdef HAVE_COMPLEX
  typedef std::complex<double> ST;  // Scalar-type typedef
#elif HAVE_COMPLEX_H
  typedef std::complex<double> ST;     // Scalar-type typedef
#else
  typedef double ST;                // Scalar-type typedef
#endif
  
  typedef Teuchos::ScalarTraits<ST>::magnitudeType MT;  // Magnitude-type typedef
  typedef int OT;                   // Ordinal-type typedef
  ST one = Teuchos::ScalarTraits<ST>::one(); 
  ST zero = Teuchos::ScalarTraits<ST>::zero(); 
  
#ifdef HAVE_MPI
  MPI_Comm mpiComm = MPI_COMM_WORLD;
  const Tpetra::MpiPlatform<OT,OT>  ordinalPlatform(mpiComm);
  const Tpetra::MpiPlatform<OT,ST>   scalarPlatform(mpiComm);
#else
  const Tpetra::SerialPlatform<OT,OT>  ordinalPlatform;
  const Tpetra::SerialPlatform<OT,ST>   scalarPlatform;
#endif
  
  //
  // Get the data from the HB file
  //
  
  // Name of input matrix file
  std::string matrixFile = "mhd1280b.cua";
  
  int info=0;
  int dim,dim2,nnz;
  MT *dvals;
  int *colptr,*rowind;
  ST *cvals;
  nnz = -1;
  info = readHB_newmat_double(matrixFile.c_str(),&dim,&dim2,&nnz,
                              &colptr,&rowind,&dvals);

  if (info == 0 || nnz < 0) {
    *out << "Error reading '" << matrixFile << "'" << std::endl;
  }
#ifdef HAVE_MPI
  MPI_Finalize();
#endif

  // Convert interleaved doubles to std::complex values
  cvals = new ST[nnz];
  for (int ii=0; ii<nnz; ii++) {
    cvals[ii] = ST(dvals[ii*2],dvals[ii*2+1]);
  }
  
  // Declare global dimension of the linear operator
  OT globalDim = dim;
  
  // Create the element space and std::vector space
  const Tpetra::ElementSpace<OT> elementSpace(globalDim,0,ordinalPlatform);
  const Tpetra::VectorSpace<OT,ST> vectorSpace(elementSpace,scalarPlatform);
  
  // Create my implementation of a Tpetra::Operator
  RCP<Tpetra::Operator<OT,ST> >
    tpetra_A = rcp( new MyOperator<OT,ST>(vectorSpace,dim,colptr,nnz,rowind,cvals) );

  // Create a Thyra linear operator (A) using the Tpetra::CisMatrix (tpetra_A).
  RCP<Thyra::LinearOpBase<ST> >
    A = Teuchos::rcp( new Thyra::TpetraLinearOp<OT,ST>(tpetra_A) );

  //
  // Set the parameters for the Belos LOWS Factory and create a parameter list.
  //
  int             blockSize              = 1;
  int             maxIterations          = globalDim;
  int             maxRestarts            = 15;
  int             gmresKrylovLength      = 50;
  int             outputFrequency        = 100;
  bool            outputMaxResOnly       = true;
  MT              maxResid               = 1e-5;

  Teuchos::RCP<Teuchos::ParameterList>
    belosLOWSFPL = Teuchos::rcp( new Teuchos::ParameterList() );
 
  belosLOWSFPL->set("Solver Type","Block GMRES");

  Teuchos::ParameterList& belosLOWSFPL_solver =
    belosLOWSFPL->sublist("Solver Types");

  Teuchos::ParameterList& belosLOWSFPL_gmres =
    belosLOWSFPL_solver.sublist("Block GMRES");

  belosLOWSFPL_gmres.set("Maximum Iterations",int(maxIterations));
  belosLOWSFPL_gmres.set("Convergence Tolerance",MT(maxResid));
  belosLOWSFPL_gmres.set("Maximum Restarts",int(maxRestarts));
  belosLOWSFPL_gmres.set("Block Size",int(blockSize));
  belosLOWSFPL_gmres.set("Num Blocks",int(gmresKrylovLength));
  belosLOWSFPL_gmres.set("Output Frequency",int(outputFrequency));
  belosLOWSFPL_gmres.set("Show Maximum Residual Norm Only",bool(outputMaxResOnly));
 
  // Whether the linear solver succeeded.
  // (this will be set during the residual check at the end)
  bool success = true;

  // Number of random right-hand sides we will be solving for.
  int numRhs = 1;

  // Get the domain space for the Thyra linear operator 
  Teuchos::RCP<const Thyra::VectorSpaceBase<ST> > domain = A->domain();

  // Create the Belos LOWS factory.
  Teuchos::RCP<Thyra::LinearOpWithSolveFactoryBase<ST> >
    belosLOWSFactory = Teuchos::rcp(new Thyra::BelosLinearOpWithSolveFactory<ST>());

  // Set the parameter list to specify the behavior of the factory.
  belosLOWSFactory->setParameterList( belosLOWSFPL );

  // Set the output stream and the verbosity level (prints to std::cout by defualt)
  // NOTE:  Set to VERB_NONE for no output from the solver.
  belosLOWSFactory->setVerbLevel(Teuchos::VERB_LOW);

  // Create a BelosLinearOpWithSolve object from the Belos LOWS factory.
  Teuchos::RCP<Thyra::LinearOpWithSolveBase<ST> >
    nsA = belosLOWSFactory->createOp();

  // Initialize the BelosLinearOpWithSolve object with the Thyra linear operator.
  Thyra::initializeOp<ST>( *belosLOWSFactory, A, &*nsA );

  // Create a right-hand side with numRhs vectors in it.
  Teuchos::RCP< Thyra::MultiVectorBase<ST> > 
    b = Thyra::createMembers(domain, numRhs);

  // Create an initial std::vector with numRhs vectors in it and initialize it to one.
  Teuchos::RCP< Thyra::MultiVectorBase<ST> >
    x = Thyra::createMembers(domain, numRhs);
  Thyra::assign(&*x, one);

  // Initialize the right-hand side so that the solution is a std::vector of ones.
  A->apply( Thyra::NONCONJ_ELE, *x, &*b );
  Thyra::assign(&*x, zero);

  // Perform solve using the linear operator to get the approximate solution of Ax=b,
  // where b is the right-hand side and x is the left-hand side.
  Thyra::SolveStatus<ST> solveStatus;
  solveStatus = Thyra::solve( *nsA, Thyra::NONCONJ_ELE, *b, &*x );

  // Print out status of solve.
  *out << "\nBelos LOWS Status: "<< solveStatus << std::endl;

  //
  // Compute residual and ST check convergence.
  //
  std::vector<MT> norm_b(numRhs), norm_res(numRhs);
  Teuchos::RCP< Thyra::MultiVectorBase<ST> >
    y = Thyra::createMembers(domain, numRhs);

  // Compute the column norms of the right-hand side b.
  Thyra::norms_2( *b, &norm_b[0] );

  // Compute y=A*x, where x is the solution from the linear solver.
  A->apply( Thyra::NONCONJ_ELE, *x, &*y );
  
  // Compute A*x-b = y-b
  Thyra::update( -one, *b, &*y );

  // Compute the column norms of A*x-b.
  Thyra::norms_2( *y, &norm_res[0] );

  // Print out the final relative residual norms.
  MT rel_res = 0.0;
  *out << "Final relative residual norms" << std::endl;  
  for (int i=0; i<numRhs; ++i) {
    rel_res = norm_res[i]/norm_b[i];
    if (rel_res > maxResid)
      success = false;
    *out << "RHS " << i+1 << " : " 
	 << std::setw(16) << std::right << rel_res << std::endl;
  }

  return ( success ? 0 : 1 );
}