returnValue DynamicDiscretization::getResiduum( BlockMatrix &residuum_ ) const{

    int  run1;
    uint run2;

    residuum_.init( N, 1 );

    for( run1 = 0; run1 < N; run1++ ){
        Matrix tmp( residuum.getNumValues(), 1 );
        for( run2 = 0; run2 < residuum.getNumValues(); run2++ )
                tmp( run2, 0 ) = residuum(run1,run2);
        residuum_.setDense(run1,0,tmp);
    }

    return SUCCESSFUL_RETURN;
}
clsparseStatus
cg(cldenseVectorPrivate *pX,
   const clsparseCsrMatrixPrivate* pA,
   const cldenseVectorPrivate *pB,
   PTYPE& M,
   clSParseSolverControl solverControl,
   clsparseControl control)
{

    assert( pA->num_cols == pB->num_values );
    assert( pA->num_rows == pX->num_values );
    if( ( pA->num_cols != pB->num_values ) || ( pA->num_rows != pX->num_values ) )
    {
        return clsparseInvalidSystemSize;
    }

    //opaque input parameters with clsparse::array type;
    clsparse::vector<T> x(control, pX->values, pX->num_values);
    clsparse::vector<T> b(control, pB->values, pB->num_values);

    cl_int status;

    T scalarOne = 1;
    T scalarZero = 0;

    //clsparse::vector<T> norm_b(control, 1, 0, CL_MEM_WRITE_ONLY, true);
    clsparse::scalar<T> norm_b(control, 0, CL_MEM_WRITE_ONLY, false);

    //norm of rhs of equation
    status = Norm1<T>(norm_b, b, control);
    CLSPARSE_V(status, "Norm B Failed");

    //norm_b is calculated once
    T h_norm_b = norm_b[0];

#ifndef NDEBUG
    std::cout << "norm_b " << h_norm_b << std::endl;
#endif

    if (h_norm_b == 0) //special case b is zero so solution is x = 0
    {
        solverControl->nIters = 0;
        solverControl->absoluteTolerance = 0.0;
        solverControl->relativeTolerance = 0.0;
        //we can either fill the x with zeros or cpy b to x;
        x = b;
        return clsparseSuccess;
    }


    //continuing "normal" execution of cg algorithm
    const auto N = pA->num_cols;

    //helper containers, all need to be zeroed
    clsparse::vector<T> y(control, N, 0, CL_MEM_READ_WRITE, true);
    clsparse::vector<T> z(control, N, 0, CL_MEM_READ_WRITE, true);
    clsparse::vector<T> r(control, N, 0, CL_MEM_READ_WRITE, true);
    clsparse::vector<T> p(control, N, 0, CL_MEM_READ_WRITE, true);

    clsparse::scalar<T> one(control,  1, CL_MEM_READ_ONLY, true);
    clsparse::scalar<T> zero(control, 0, CL_MEM_READ_ONLY, true);

    // y = A*x
    status = csrmv<T>(one, pA, x, zero, y, control);
    CLSPARSE_V(status, "csrmv Failed");

    //r = b - y
    status = r.sub(b, y, control);
    //status = elementwise_transform<T, EW_MINUS>(r, b, y, control);
    CLSPARSE_V(status, "b - y Failed");

    clsparse::scalar<T> norm_r(control, 0, CL_MEM_WRITE_ONLY, false);
    status = Norm1<T>(norm_r, r, control);
    CLSPARSE_V(status, "norm r Failed");

    //T residuum = 0;
    clsparse::scalar<T> residuum(control, 0, CL_MEM_WRITE_ONLY, false);

    //residuum = norm_r[0] / h_norm_b;
    residuum.div(norm_r, norm_b, control);

    solverControl->initialResidual = residuum[0];
#ifndef NDEBUG
        std::cout << "initial residuum = "
                  << solverControl->initialResidual << std::endl;
#endif
    if (solverControl->finished(solverControl->initialResidual))
    {
        solverControl->nIters = 0;
        return clsparseSuccess;
    }
    //apply preconditioner z = M*r
    M(r, z, control);

    //copy inital z to p
    p = z;

    //rz = <r, z>, here actually should be conjugate(r)) but we do not support complex type.
    clsparse::scalar<T> rz(control, 0, CL_MEM_WRITE_ONLY, false);
    status = dot<T>(rz, r, z, control);
    CLSPARSE_V(status, "<r, z> Failed");

    int iteration = 0;

    bool converged = false;

    clsparse::scalar<T> alpha (control, 0, CL_MEM_READ_WRITE, false);
    clsparse::scalar<T> beta  (control, 0, CL_MEM_READ_WRITE, false);

    //yp buffer for inner product of y and p vectors;
    clsparse::scalar<T> yp(control, 0, CL_MEM_WRITE_ONLY, false);

    clsparse::scalar<T> rz_old(control, 0, CL_MEM_WRITE_ONLY, false);

    while(!converged)
    {
        solverControl->nIters = iteration;

        //y = A*p
        status = csrmv<T>(one, pA, p, zero, y, control);
        CLSPARSE_V(status, "csrmv Failed");


        status = dot<T>(yp, y, p, control);
        CLSPARSE_V(status, "<y,p> Failed");

        // alpha = <r,z> / <y,p>
        //alpha[0] = rz[0] / yp[0];
        alpha.div(rz, yp, control);

#ifndef NDEBUG
            std::cout << "alpha = " << alpha[0] << std::endl;
#endif

        //x = x + alpha*p
        status = axpy<T>(x, alpha, p, x, control);
        CLSPARSE_V(status, "x = x + alpha * p Failed");

        //r = r - alpha * y;
        status = axpy<T, EW_MINUS>(r, alpha, y, r, control);
        CLSPARSE_V(status, "r = r - alpha * y Failed");


        //apply preconditioner z = M*r
        M(r, z, control);

        //store old value of rz
        //improve that by move or swap
        rz_old = rz;

        //rz = <r,z>
        status = dot<T>(rz, r, z, control);
        CLSPARSE_V(status, "<r,z> Failed");

        // beta = <r^(i), r^(i)>/<r^(i-1),r^(i-1)> // i: iteration index;
        // beta is ratio of dot product in current iteration compared
        //beta[0] = rz[0] / rz_old[0];
        beta.div(rz, rz_old, control);
#ifndef NDEBUG
            std::cout << "beta = " << beta[0] << std::endl;
#endif

        //p = z + beta*p;
        status = axpby<T>(p, one, z, beta, p, control );
        CLSPARSE_V(status, "p = z + beta*p Failed");

        //calculate norm of r
        status = Norm1<T>(norm_r, r, control);
        CLSPARSE_V(status, "norm r Failed");

        //residuum = norm_r[0] / h_norm_b;
        status = residuum.div(norm_r, norm_b, control);
        CLSPARSE_V(status, "residuum");

        iteration++;
        converged = solverControl->finished(residuum[0]);

        solverControl->print();
    }
    return clsparseSuccess;
}