コード例 #1
0
ファイル: gmres.hpp プロジェクト: ettabib/calculParallele
/*
 * Restarted GMRES(m) in which the preconditioner is applied as M * v to
 * both the right-hand side and every operator product.
 *
 * Parameters:
 *   A        - operator; only the product A * vector is used.
 *   x        - initial guess on entry; updated in place with the iterate.
 *   b        - right-hand side.
 *   M        - preconditioner; only the product M * vector is used.
 *   H        - caller-provided workspace addressed as H(row, col); rows
 *              0..m and columns 0..m-1 are written.
 *   m        - restart length (inner iterations per cycle).
 *   max_iter - in: iteration budget. out: 0 if the initial guess already
 *              satisfies tol, otherwise the value of the iteration counter
 *              at the moment convergence is detected. Left untouched when
 *              the solver does not converge.
 *   tol      - in: target for resid = ||M*(b - A*x)|| / ||M*b||.
 *              out: the last residual ratio that was computed.
 *
 * Returns 0 on convergence and 1 if the iteration budget ran out first.
 *
 * NOTE(review): (u,v) is presumably the Vector type's overloaded inner
 * product, and Update / GeneratePlaneRotation / ApplyPlaneRotation are
 * defined elsewhere -- confirm against the rest of the header.
 */
int 
GMRES(const Operator &A, Vector &x, const Vector &b,
      const Preconditioner &M, Matrix &H, int &m, int &max_iter,
      Real &tol)
{
  Real resid;
  int i, j = 1, k;
  Vector s(m+1), cs(m+1), sn(m+1), w,r,Ax;

  // Norm of the preconditioned right-hand side, used to scale residuals.
  r=M*b;
  Real normb = sqrt((r,r));
  
  // Preconditioned initial residual r = M*(b - A*x).
  Ax=b;
  Ax -=A * x;
  r = M*(Ax);
  Real beta = sqrt((r,r));
  
  // Avoid dividing by a (numerically) zero right-hand side norm.
  if ( abs(normb) < 1.e-30)
    normb = 1;
  
  if ((resid = beta / normb) <= tol) {
    tol = resid;
    max_iter = 0;
    return 0;
  }

  // Krylov basis for one restart cycle.
  Vector *v = new Vector[m+1];

  while (j <= max_iter) {
    v[0] = r / beta;    
    s = 0.0;
    s(0) = beta;
    
    for (i = 0; i < m && j <= max_iter; i++, j++) {
      w = M*(Ax=A * v[i]);

      // Orthogonalise w against the basis built so far.
      for (k = 0; k <= i; k++) {
        H(k, i) = (w, v[k]);
        w -= H(k, i) * v[k];
      }
      H(i+1, i) = sqrt((w,w));
      v[i+1] = w  / H(i+1, i) ; 

      // Apply the previous rotations to the new column, then generate and
      // apply the rotation that eliminates H(i+1, i).
      for (k = 0; k < i; k++)
        ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k));
      
      GeneratePlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i));
      ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i));
      ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i));
      
      if ((resid = abs(s(i+1)) / normb) < tol) {
        Update(x, i, H, s, v);
        tol = resid;
        max_iter = j;
        delete [] v;
        return 0;
      }
    }

    // Only i inner steps were completed in this cycle (i == m unless the
    // iteration budget ran out part-way), so only columns 0..i-1 of H and
    // entries 0..i-1 of s are valid for the update. The while condition
    // guarantees at least one step ran, hence i >= 1 here.
    Update(x, i - 1, H, s, v);

    // Recompute the true preconditioned residual for the restart.
    Ax = b;
    Ax -= A*x;
    r = M*(Ax);
    beta = sqrt((r,r));
    if ((resid = beta / normb) < tol) {
      tol = resid;
      max_iter = j;
      delete [] v;
      return 0;
    }
  }
  
  tol = resid;
  delete [] v;
  return 1;
}
コード例 #2
0
ファイル: dfgmres.cpp プロジェクト: maxhutch/magma
/**
    Restarted GMRES for A * x = b in double precision. The preconditioned
    basis vectors are stored in W and the solution update is assembled from
    them (x += sum_j s[j] * W(j)).

    @param[in]     A            system matrix; A.num_rows gives the vector length.
    @param[in]     b            right-hand side (b.dval is read).
    @param[in,out] x            initial guess on entry, updated in place.
    @param[in,out] solver_par   reads restart, rtol, atol, maxiter and verbose;
                                writes solver, numiter, spmv_count, init_res,
                                iter_res, final_res, runtime, info and, when
                                verbose > 0, res_vec / timing.
    @param[in]     precond_par  forwarded to magma_d_applyprecond_left/right.
    @param[in]     queue        forwarded to every MAGMA call.

    @return  the value also stored in solver_par->info: MAGMA_SUCCESS,
             MAGMA_SLOW_CONVERGENCE or MAGMA_DIVERGENCE from the logic below.

    NOTE(review): CHECK and the V(), W(), H() accessors are macros defined
    outside this function. CHECK presumably sets info and jumps to the
    `cleanup` label on failure -- confirm.
    NOTE(review): numiter is incremented once per outer (restart) pass; the
    inner loop tests it but never changes it -- confirm this is intended.
*/
extern "C" magma_int_t
magma_dfgmres(
    magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x,
    magma_d_solver_par *solver_par,
    magma_d_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;
    
    magma_int_t dofs = A.num_rows;

    // prepare solver feedback
    solver_par->solver = Magma_PGMRES;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    
    //Chronometry
    real_Double_t tempo1, tempo2;

    magma_int_t dim = solver_par->restart;
    magma_int_t m1 = dim+1; // used inside H macro
    magma_int_t i, j, k;
    double beta;
    
    double rel_resid, resid0=1, r0=0.0, betanom = 0.0, nom;
    
    // v_t and w_t are non-owning views: their dval is pointed at a column of
    // V or W before each use, so they are never freed.
    magma_d_matrix v_t={Magma_CSR}, w_t={Magma_CSR}, t={Magma_CSR}, t2={Magma_CSR}, V={Magma_CSR}, W={Magma_CSR};
    v_t.memory_location = Magma_DEV;
    v_t.num_rows = dofs;
    v_t.num_cols = 1;
    v_t.dval = NULL;
    v_t.storage_type = Magma_DENSE;

    w_t.memory_location = Magma_DEV;
    w_t.num_rows = dofs;
    w_t.num_cols = 1;
    w_t.dval = NULL;
    w_t.storage_type = Magma_DENSE;
    
    double temp;
    
    double *H={0}, *s={0}, *cs={0}, *sn={0};

    CHECK( magma_dvinit( &t, Magma_DEV, dofs, 1, MAGMA_D_ZERO, queue ));
    CHECK( magma_dvinit( &t2, Magma_DEV, dofs, 1, MAGMA_D_ZERO, queue ));
    
    CHECK( magma_dmalloc_pinned( &H, (dim+1)*dim ));
    CHECK( magma_dmalloc_pinned( &s,  dim+1 ));
    CHECK( magma_dmalloc_pinned( &cs, dim ));
    CHECK( magma_dmalloc_pinned( &sn, dim ));
    
    
    CHECK( magma_dvinit( &V, Magma_DEV, dofs*(dim+1), 1, MAGMA_D_ZERO, queue ));
    CHECK( magma_dvinit( &W, Magma_DEV, dofs*dim, 1, MAGMA_D_ZERO, queue ));
    
    CHECK(  magma_dresidual( A, b, *x, &nom, queue));

    solver_par->init_res = nom;
    
    if ( ( nom * solver_par->rtol) < ATOLERANCE )
        r0 = ATOLERANCE;
    
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    

    tempo1 = magma_sync_wtime( queue );
    do
    {
        solver_par->numiter++;
        // compute initial residual and its norm
        // A.mult(n, 1, x, n, V(0), n);                        // V(0) = A*x
        CHECK( magma_d_spmv( MAGMA_D_ONE, A, *x, MAGMA_D_ZERO, t, queue ));
        solver_par->spmv_count++;
        magma_dcopy( dofs, t.dval, 1, V(0), 1, queue );
        
        temp = MAGMA_D_MAKE(-1.0, 0.0);
        magma_daxpy( dofs,temp, b.dval, 1, V(0), 1, queue );           // V(0) = V(0) - b
        beta = MAGMA_D_MAKE( magma_dnrm2( dofs, V(0), 1, queue ), 0.0 ); // beta = norm(V(0))
        if( magma_d_isnan_inf( beta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }
        
        // First pass only: record the initial residual norm as the reference
        // for the relative residual and leave early if it is already below
        // the threshold. numiter was incremented just above, so the first
        // pass is numiter == 1; the former test against 0 could never be
        // true, which left resid0 at 1 and skipped the early exit that
        // protects the division by beta below.
        if (solver_par->numiter == 1){
            solver_par->init_res = MAGMA_D_REAL( beta );
            resid0 = MAGMA_D_REAL( beta );
        
            r0 = resid0 * solver_par->rtol;
            if ( r0 < ATOLERANCE )
                r0 = ATOLERANCE;
            if ( resid0 < r0 ) {
                solver_par->final_res = solver_par->init_res;
                solver_par->iter_res = solver_par->init_res;
                info = MAGMA_SUCCESS;
                goto cleanup;
            }
        }
        if ( solver_par->verbose > 0 ) {
            solver_par->res_vec[0] = resid0;
            solver_par->timing[0] = 0.0;
        }
        temp = -1.0/beta;
        magma_dscal( dofs, temp, V(0), 1, queue );                 // V(0) = -V(0)/beta

        for (i = 1; i < dim+1; i++)
            s[i] = MAGMA_D_ZERO;
        s[0] = beta;

        i = -1;
        do {
            i++;
            
            // M.apply(n, 1, V(i), n, W(i), n);
            v_t.dval = V(i);
            CHECK( magma_d_applyprecond_left( MagmaNoTrans, A, v_t, &t, precond_par, queue ));
            CHECK( magma_d_applyprecond_right( MagmaNoTrans, A, t, &t2, precond_par, queue ));
            magma_dcopy( dofs, t2.dval, 1, W(i), 1, queue );

            // A.mult(n, 1, W(i), n, V(i+1), n);
            w_t.dval = W(i);
            CHECK( magma_d_spmv( MAGMA_D_ONE, A, w_t, MAGMA_D_ZERO, t, queue ));
            solver_par->spmv_count++;
            magma_dcopy( dofs, t.dval, 1, V(i+1), 1, queue );
            
            // Orthogonalise V(i+1) against V(0..i).
            for (k = 0; k <= i; k++)
            {
                H(k, i) = magma_ddot( dofs, V(k), 1, V(i+1), 1, queue );
                // V(i+1) -= H(k, i) * V(k);
                magma_daxpy( dofs,-H(k,i), V(k), 1, V(i+1), 1, queue );
            }

            H(i+1, i) = MAGMA_D_MAKE( magma_dnrm2( dofs, V(i+1), 1, queue), 0. ); // H(i+1,i) = ||r||
            temp = 1.0 / H(i+1, i);
            // V(i+1) = V(i+1) / H(i+1, i)
            magma_dscal( dofs, temp, V(i+1), 1, queue );    //  (to be fused)
    
            // Apply the earlier rotations to the new column, then generate
            // and apply the rotation that eliminates H(i+1, i).
            for (k = 0; k < i; k++)
                ApplyPlaneRotation(&H(k,i), &H(k+1,i), cs[k], sn[k]);
          
            GeneratePlaneRotation(H(i,i), H(i+1,i), &cs[i], &sn[i]);
            ApplyPlaneRotation(&H(i,i), &H(i+1,i), cs[i], sn[i]);
            ApplyPlaneRotation(&s[i], &s[i+1], cs[i], sn[i]);
            
            betanom = MAGMA_D_ABS( s[i+1] );
            rel_resid = betanom / resid0;
            if ( solver_par->verbose > 0 ) {
                tempo2 = magma_sync_wtime( queue );
                if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                    solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) betanom;
                    solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) tempo2-tempo1;
                }
            }
            if (rel_resid <= solver_par->rtol || betanom <= solver_par->atol ){
                info = MAGMA_SUCCESS;
                break;
            }
        }
        while (i+1 < dim && solver_par->numiter+1 <= solver_par->maxiter);

        // solve upper triangular system in place
        for (j = i; j >= 0; j--)
        {
            s[j] /= H(j,j);
            for (k = j-1; k >= 0; k--)
                s[k] -= H(k,j) * s[j];
        }

        // update the solution
        for (j = 0; j <= i; j++)
        {
            // x = x + s[j] * W(j)
            magma_daxpy( dofs, s[j], W(j), 1, x->dval, 1, queue );
        }
    }
    while (rel_resid > solver_par->rtol
                && solver_par->numiter+1 <= solver_par->maxiter);

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK( magma_dresidual( A, b, *x, &residual, queue ));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    // Classify the outcome from the recorded residuals.
    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }
    
cleanup:
    // free pinned memory
    magma_free_pinned(s);
    magma_free_pinned(cs);
    magma_free_pinned(sn);
    magma_free_pinned(H);

    //free DEV memory
    magma_dmfree( &V, queue);
    magma_dmfree( &W, queue);
    magma_dmfree( &t, queue);
    magma_dmfree( &t2, queue);

    solver_par->info = info;
    return info;
} /* magma_dfgmres */
コード例 #3
0
ファイル: iterativesolvers.cpp プロジェクト: erwi/SpaMtrix
bool IterativeSolvers::gmres(const IRCMatrix &A,
                             Vector &x,
                             const Vector &b,
                             const Preconditioner &M) {
    const idx N = x.getLength();
    idx i, j = 1, k;
    Vector s(maxInnerIter + 1);
    Vector cs(maxInnerIter + 1);
    Vector sn(maxInnerIter + 1);
    Vector w(N);
    real normb = norm(M.solve(b));
    Vector r = M.solve(b - A * x);
    real beta = norm(r);
    if (normb == 0.0)
        normb = 1;
    real res(norm(r) / normb);
    if (res <= toler) {
        toler = res;
        maxIter = 0;
        return true;
    }
    Vector *v = new Vector[maxInnerIter + 1];
    for (idx id = 0; id < maxInnerIter + 1; ++id)
        v[id] = Vector(N);
    // CREATE HESSENBERG MATRIX NEEDED TO STORE INTERMEDIATES
    DenseMatrix H(maxInnerIter + 1, maxInnerIter);
    Vector temp(N);
    Vector temp2(maxInnerIter + 1);
    // MAIN LOOP
    while (j <= maxIter) {
        v[0] = r * (1.0 / beta);
        s = 0.0;
        s(0) = beta;
        // INNER ITERATIONS
        for (i = 0; i < maxInnerIter && j <= maxIter; i++, j++) {
            // CALCULATE w = M^{-1}(A*v[i])
            multiply(A, v[i], temp);
            M.solveMxb(w, temp);
            // PRE-CALCULATE DOT PRODUCTS IN PARALLEL
            // H(k,i) = dot( v[k], w)
#ifdef USES_OPENMP
            #pragma omp parallel for
#endif
            for (k = 0; k <= i ; ++k) {
                register real dp(0);
                for (idx id = 0 ; id < N ; ++id)
                    dp += w[id] * v[k][id];
                H(k, i) = dp; //dot(w,v[k]);
            }
            for (k = 0; k <= i; ++k) {
                // w -= v[k]*H(k,i) without temporaries
                register real tempr = H(k, i);
#ifdef USES_OPENMP
                #pragma omp parallel for // why is this loop so critical??
#endif
                for (idx id = 0 ; id < N ; ++id)
                    w[id] -= v[k][id] * tempr;
            }
            // BELOW PARALLEL REGION CALCULATES:
            // H(i+1,i) = norm(w);
            // v[i+1] = w * (1.0 / H(i+1, i));
            H(i + 1, i) = 0;
            real tempr(0);
#ifdef USES_OPENMP
            #pragma omp parallel shared(tempr)
#endif
            {
#ifdef USES_OPENMP
                #pragma omp for reduction(+:tempr)
#endif
                for (idx id = 0 ; id < N ; ++id)
                    tempr += w[id] * w[id]; //norm(w);
#ifdef USES_OPENMP
                #pragma omp single
#endif
                {
                    H(i + 1, i) = sqrt(tempr);
                    tempr = (1.0 / H(i + 1, i));
                }
#ifdef USES_OPENMP
                #pragma omp for
#endif
                for (idx id = 0 ; id < N ; ++id)
                    v[i + 1][id] = w[id] * tempr;
            }// end for omp parallel
            for (k = 0; k < i; k++)
                ApplyPlaneRotation(H(k, i), H(k + 1, i), cs(k), sn(k));
            GeneratePlaneRotation(H(i, i), H(i + 1, i), cs(i), sn(i));
            ApplyPlaneRotation(H(i, i), H(i + 1, i), cs(i), sn(i));
            ApplyPlaneRotation(s(i), s(i + 1), cs(i), sn(i));
            res = fabs(s(i + 1)) / normb;
            if (res < toler) {
                // COPY S INTO temp WITHOUT RESIZING
                for (idx id = 0 ; id < maxInnerIter + 1 ; ++id)
                    temp2[id] = s[id];
                Update(x, i, H, temp2, v);
                toler = res;
                maxIter = j;
                delete [] v;
                return true;
            }
        }// end for i IINNER ITERATIONS
        // COPY S INTO temp WITHOUT RESIZING
        for (idx id = 0 ; id < maxInnerIter + 1 ; ++id)
            temp2[id] = s[id];
        Update(x, maxInnerIter - 1, H, temp2, v);
        //multiply(A, x, temp);     //r = M.solve(b - A * x);
        M.solveMxb(r, b - A * x);
        beta = norm(r);
        res = beta / normb;
        if (res < toler) {
            toler = res;
            maxIter = j;
            delete [] v;
            return true;
        }
    }
    toler = res;
    delete [] v;
    return false;
}
コード例 #4
0
ファイル: gmres_part.cpp プロジェクト: xflying777/OpenAcc
/*
 * Restarted GMRES on N*N-long vectors, offloaded with OpenACC.
 *
 * Each Krylov step forms w = q + fastpoisson(cublas_gemm(D, q)) and the
 * restart residual is r = M_b - (x + fastpoisson(cublas_gemm(D, x))), where
 * M_b = fastpoisson(b).
 *
 * Parameters:
 *   A           - not referenced by this function.
 *   D           - N*N array passed to cublas_gemm.
 *   x           - N*N array; the computed correction is added to it in place.
 *   b           - N*N right-hand side.
 *   N           - grid dimension; vectors have N*N entries.
 *   max_restart - maximum number of restart cycles.
 *   max_iter    - Krylov steps per cycle.
 *   tol         - convergence threshold on the scaled residual.
 *
 * Prints the residual and the cycle/step at which convergence is detected.
 *
 * NOTE(review): fastpoisson, norm_gpu, dot_gpu, q_subQ_gpu, subQ_v_gpu,
 * cublas_gemm, GeneratePlaneRotation and backsolve are defined elsewhere;
 * their exact contracts are not visible here.
 */
void gmres(double *A, double *D, double *x, double *b, int N, int max_restart, int max_iter, double tol)
{
	int i, j, k, l, m, N2, h_rows;
	double resid, *normb, *beta, *nrm_temp, temp, *M_temp, *r, *q, *v, *M_b, *w, *cs, *sn, *s, *y, *Q, *H;

	normb = (double *) malloc(1*sizeof(double));
	beta = (double *) malloc(1*sizeof(double));
	nrm_temp = (double *) malloc(1*sizeof(double));

	// H is addressed as H[max_iter*row + col] with row running up to
	// max_iter, while the OpenACC clauses below transfer (N+1)*max_iter
	// entries. Allocate enough rows for both so neither overruns the buffer.
	h_rows = (N > max_iter ? N : max_iter) + 1;

	Q = (double *) malloc(N*N*(max_iter+1)*sizeof(double));
	H = (double *) malloc(h_rows*max_iter*sizeof(double));
	M_temp = (double *) malloc(N*N*sizeof(double));
	M_b = (double *) malloc(N*N*sizeof(double));
	r = (double *) malloc(N*N*sizeof(double));
	q = (double *) malloc(N*N*sizeof(double));
	v = (double *) malloc(N*N*sizeof(double));
	w = (double *) malloc(N*N*sizeof(double));
	cs = (double *) malloc((max_iter+1)*sizeof(double));
	sn = (double *) malloc((max_iter+1)*sizeof(double));
	s = (double *) malloc((max_iter+1)*sizeof(double));
	y = (double *) malloc((max_iter+1)*sizeof(double));

	N2 = N*N;

	// Initial residual: r = M_b = fastpoisson(b); normb and beta are its norm.
	#pragma acc data copyin(b[0:N2]) copyout(M_b[0:N2], r[0:N2], normb[0], beta[0])
	{
		fastpoisson(b, M_b, N);
		norm_gpu(M_b, normb, N2);
		#pragma acc parallel loop independent
		for (k=0; k<N2; k++)	r[k] = M_b[k];
		norm_gpu(r, beta, N2);
	}

	// Already converged: zero the step count so the cycle below does no
	// Krylov work and only reports convergence.
	if ((resid = *beta / *normb) <= tol) 
	{
		tol = resid;
		max_iter = 0;
  	}

	for (m=0; m<max_restart; m++)
	{

		// First basis vector and right-hand side of the least-squares problem.
		for (i=0; i<N2; i++)	Q[i] = r[i] / *beta;
		for (i=0; i<max_iter; i++)	s[i+1] = 0.0;
		s[0] = *beta;

		for (i=0; i<max_iter; i++)
		{
			#pragma acc data copyin(D[0:N2]) copy(Q[0:N2*(max_iter+1)], H[0:(N+1)*max_iter], cs[0:max_iter+1], sn[0:max_iter+1], s[0:max_iter+1]) create(q[0:N2], v[0:N2], M_temp[0:N2], w[0:N2], y[0:max_iter+1])
			{
		  		q_subQ_gpu(q, Q, N2, i);
		  		cublas_gemm(N, v, D, q);
				fastpoisson(v, M_temp, N);

				#pragma acc parallel loop independent present(w, q, M_temp)
		  		for (k=0; k<N*N; k++)	w[k] = q[k] + M_temp[k];

	  			// h(k,i) = qk*w
		  		for (k=0; k<=i; k++)
				{
					#pragma acc parallel loop independent
					for (j=0; j<N2; j++)
					{
						q[j] = Q[N2*k+j];
		  			}
					dot_gpu(q, w, nrm_temp, N2);
					H[max_iter*k+i] = *nrm_temp;
				}

				// w -= sum_k h(k,i) * qk
				#pragma acc parallel loop seq
				for (k=0; k<=i; k++)
				{
					#pragma acc loop independent
					for (j=0; j<N2; j++)	w[j] -= H[max_iter*k+i]*Q[N2*k+j];
				}

				norm_gpu(w, nrm_temp, N2);
				H[max_iter*(i+1)+i] = *nrm_temp;
				subQ_v_gpu(Q, w, N2, i+1, H[max_iter*(i+1)+i]);

				#pragma acc kernels
				for (k = 0; k < i; k++)
				{
					//ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k))
					temp = cs[k]*H[max_iter*k+i] + sn[k]*H[max_iter*(k+1)+i];
					H[max_iter*(k+1)+i] = -1.0*sn[k]*H[max_iter*k+i] + cs[k]*H[max_iter*(k+1)+i];
					H[max_iter*k+i] = temp;
				} // end kernels

				GeneratePlaneRotation(H[max_iter*i+i], H[max_iter*(i+1)+i], cs, sn, i);
				#pragma acc kernels
				{
					//ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i))
					H[max_iter*i+i] = cs[i]*H[max_iter*i+i] + sn[i]*H[max_iter*(i+1)+i];
					H[max_iter*(i+1)+i] = 0.0;

					//ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i));
					temp = cs[i]*s[i];
					s[i+1] = -1.0*sn[i]*s[i];
					s[i] = temp;
					resid = fabs(s[i+1] / *beta);
				} // end kernels
			} // end pragma data

			if (resid < tol)
			{
				// Converged inside the cycle: x += Q(:, 0..i) * y.
				backsolve(H, s, y, N, max_iter, i);
				#pragma acc data copyin(Q[0:N2*(max_iter+1)], y[0:max_iter+1]) copy(x[0:N2])
				#pragma acc parallel loop independent
				for(j=0; j<N; j++)
				{
					#pragma acc loop independent
					for (l=0; l<N; l++)
					{
						#pragma acc loop seq
						for(k=0; k<=i; k++)
						{
							x[N*j+l] += Q[N2*k+N*j+l]*y[k];
						}
					}
				}
				break;
			}
		}//end inside for

		if (resid < tol)
		{
			printf(" resid = %e \n", resid);
			printf(" Converges at %d cycle %d step. \n", m, i+1);
			break;
		}

		// Caution : i = i + 1.
		i = i - 1;
		backsolve(H, s, y, N, max_iter, i);

		#pragma acc data copyin(Q[0:N2*(max_iter+1)], y[0:max_iter+1]) copy(x[0:N2])
		#pragma acc parallel loop independent
		for(j=0; j<N; j++)
		{
			#pragma acc loop independent
			for (l=0; l<N; l++)
			{
				#pragma acc loop seq
				for(k=0; k<=i; k++)
				{
					x[N*j+l] += Q[N2*k+N*j+l]*y[k];
				}
			}
		}

		// Recompute the residual for the next restart cycle.
		#pragma acc data copyin(D[0:N2], M_b[0:N2], x[0:N2]) create(v[0:N2], M_temp[0:N2]) copyout(r[0:N2])
		{
			cublas_gemm(N, v, D, x);
			fastpoisson(v, M_temp, N);
			#pragma acc parallel loop independent
			for (j=0; j<N2; j++)	r[j] = M_b[j] - (x[j] + M_temp[j]);
			norm_gpu(r, beta, N2);
		}

		s[i+1] = *beta;
		resid = s[i+1] / *normb;
		if ( resid < tol)
		{
			printf(" resid = %e \n", resid);
			printf(" Converges at %d cycle %d step. \n", m, i);
			break;
		}
	}//end outside for

	// Every exit path above leaves via break, so this is the single place
	// where the work buffers are released.
	free(normb);
	free(beta);
	free(nrm_temp);
	free(Q);
	free(H);
	free(M_temp);
	free(M_b);
	free(r);
	free(q);
	free(v);
	free(w);
	free(cs);
	free(sn);
	free(s);
	free(y);
}
コード例 #5
0
ファイル: gmres_md.cpp プロジェクト: xflying777/OpenAcc
/*
 * Restarted GMRES on N*N-long vectors for the operator
 * q -> A*q + q*A + D*q, where each product is formed with matrix_matrix.
 *
 * Parameters:
 *   A           - N*N array, multiplied from the left and from the right.
 *   D           - N*N array, multiplied from the left.
 *   x           - N*N array; the computed correction is added to it in place.
 *   b           - N*N right-hand side.
 *   N           - matrix dimension; vectors have N*N entries.
 *   max_restart - maximum number of restart cycles.
 *   max_iter    - Krylov steps per cycle.
 *   tol         - convergence threshold on the scaled residual.
 *
 * Prints the residual and the cycle/step at which convergence is detected.
 *
 * NOTE(review): norm, q_subQ, subQ_v, matrix_matrix, inner_product, w_shift,
 * GeneratePlaneRotation and backsolve are defined elsewhere; their exact
 * contracts are not visible here.
 */
void gmres(double *A, double *D, double *x, double *b, int N, int max_restart, int max_iter, double tol)
{
	int i, j, k, l, m, N2, h_rows;
	double resid, *normb, *beta, *temp_nrm, temp, *r, *q, *Aq, *qA, *Dq, *w, *cs, *sn, *s, *y, *Q, *H, *res;
	
	normb = (double *) malloc(1*sizeof(double));
	beta = (double *) malloc(1*sizeof(double));
	temp_nrm = (double *) malloc(1*sizeof(double));

	// H is addressed as H[max_iter*row + col] with row running up to
	// max_iter, so it needs max_iter+1 rows; keep at least the N+1 rows the
	// original allocation provided.
	h_rows = (N > max_iter ? N : max_iter) + 1;
	
	Q = (double *) malloc(N*N*(max_iter+1)*sizeof(double));
	H = (double *) malloc(h_rows*max_iter*sizeof(double));
	r = (double *) malloc(N*N*sizeof(double));
	q = (double *) malloc(N*N*sizeof(double));
	Aq = (double *) malloc(N*N*sizeof(double));
	qA = (double *) malloc(N*N*sizeof(double));
	Dq = (double *) malloc(N*N*sizeof(double));
	w = (double *) malloc(N*N*sizeof(double));
	cs = (double *) malloc((max_iter+1)*sizeof(double));
	sn = (double *) malloc((max_iter+1)*sizeof(double));
	s = (double *) malloc((max_iter+1)*sizeof(double));
	y = (double *) malloc((max_iter+1)*sizeof(double));
	res = (double *) malloc(max_iter*sizeof(double));
	
	N2 = N*N;

	// Initial residual is b itself; normb and beta are its norm.
	norm(b, normb, N2);
	for (k=0; k<N2; k++)	r[k] = b[k];
	norm(r, beta, N2);
	
	// Already converged: zero the step count so the cycle below does no
	// Krylov work and only reports convergence.
	if ((resid = *beta / *normb) <= tol) 
	{
		tol = resid;
		max_iter = 0;
  	}
	
	for (m=0; m<max_restart; m++)
	{

		// First basis vector and right-hand side of the least-squares problem.
		for (i=0; i<N2; i++)	Q[i] = r[i] / *beta;
		for (i=0; i<max_iter; i++)	s[i+1] = 0.0;
		s[0] = *beta;
		
		for (i = 0; i<max_iter; i++) 
		{
			// w = A*q + q*A + D*q for the i-th basis vector q.
	  		q_subQ(q, Q, N2, i);
			matrix_matrix(A, q, Aq, N);
			matrix_matrix(q, A, qA, N);
	  		matrix_matrix(D, q, Dq, N);
	  		for (k=0; k<N2; k++)	w[k] = Aq[k] + qA[k] + Dq[k];
	  		
			// Orthogonalise w against basis vectors 0..i, one at a time.
	  		for (k=0; k<=i; k++) 
			{
				q_subQ(q, Q, N2, k);
				H[max_iter*k+i] = inner_product(q, w, N2);
				w_shift(w, q, H[max_iter*k+i], N2);
	  		}

	  		norm(w, temp_nrm, N2);
			H[max_iter*(i+1)+i] = *temp_nrm;
			subQ_v(Q, w, N2, i+1, H[max_iter*(i+1)+i]);
			
			for (k = 0; k < i; k++)
			{
				//ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k))
				temp = cs[k]*H[max_iter*k+i] + sn[k]*H[max_iter*(k+1)+i];
				H[max_iter*(k+1)+i] = -1.0*sn[k]*H[max_iter*k+i] + cs[k]*H[max_iter*(k+1)+i];
				H[max_iter*k+i] = temp;
			}
			
			GeneratePlaneRotation(H[max_iter*i+i], H[max_iter*(i+1)+i], cs, sn, i);
	      	
			//ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i))
			H[max_iter*i+i] = cs[i]*H[max_iter*i+i] + sn[i]*H[max_iter*(i+1)+i];
			H[max_iter*(i+1)+i] = 0.0;
			
			//ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i));
			temp = cs[i]*s[i];
			s[i+1] = -1.0*sn[i]*s[i];
			s[i] = temp;
			resid = fabs(s[i+1] / *beta);
			res[i] = resid;
	     	
			if (resid < tol) 
			{
				// Solve the leading upper-triangular system H*y = s. The
				// update below consumes y[0..i], so the system has i+1
				// unknowns; H is row-major with leading dimension max_iter.
				for (k=0; k<max_iter+1; k++)	y[k] = s[k];
				cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, i+1, H, max_iter, y, 1);
				for(j=0; j<N; j++)
				{
					for (l=0; l<N; l++)
					{
						for(k=0; k<=i; k++)
						{
							x[N*j+l] += Q[N2*k+N*j+l]*y[k];
						}
					}
				}
				break;
			}
		}//end inside for
		
		if (resid < tol)	
		{
			printf(" resid = %e \n", resid);
			printf(" Converges at %d cycle %d step. \n", m, i+1);
			break;
		}
		
		// Caution : i = i + 1.
		i = i - 1;
		backsolve(H, s, y, N, max_iter, i);
		for(j=0; j<N; j++)
		{
			for (l=0; l<N; l++)
			{
				for(k=0; k<=i; k++)
				{
					x[N*j+l] += Q[N2*k+N*j+l]*y[k];
				}
			}
		}

		// Recompute the residual r = b - (A*x + x*A + D*x) for the restart.
		matrix_matrix(A, x, Aq, N);
		matrix_matrix(x, A, qA, N);
	  	matrix_matrix(D, x, Dq, N);
		for (j=0; j<N2; j++)	r[j] = b[j] - (Aq[j] + qA[j] + Dq[j]);
		norm(r, beta, N2);
		s[i+1] = *beta;
		resid = s[i+1] / *normb;
		if ( resid < tol)	
		{
			printf(" resid = %e \n", resid);
			printf(" Converges at %d cycle %d step. \n", m, i);
			break;
		}
	}//end outside for

	// Every exit path above leaves via break, so this is the single place
	// where the work buffers are released.
	free(normb);
	free(beta);
	free(temp_nrm);
	free(Q);
	free(H);
	free(r);
	free(q);
	free(Aq);
	free(qA);
	free(Dq);
	free(w);
	free(cs);
	free(sn);
	free(s);
	free(y);
	free(res);
}