/*
 * Restarted flexible GMRES(m) in which the Arnoldi/Givens bookkeeping is
 * carried out on hi/lo pairs (LIS_QUAD).
 *
 * Work vectors taken from solver->work:
 *   work[0]          s : right-hand side of the small least-squares problem
 *   work[2..m+1]     z : preconditioned directions z[j] = M^-1 v[j]
 *   work[m+2..]      v : Krylov basis vectors
 *
 * Layout of h (column stride h_dim = m+1):
 *   h[k + (i-1)*h_dim]  entry (k, i-1) of the Hessenberg matrix
 *   h[j + cs]           cosine of the j-th Givens rotation
 *   h[j + sn]           sine of the j-th Givens rotation
 *
 * Returns LIS_SUCCESS when lis_solver_get_initial_residual() reports a
 * non-zero status or when the residual estimate drops to solver->tol,
 * and LIS_MAXITER when the iteration budget is exhausted. On both of the
 * latter exits solver->retcode, iter, resid and ptime are filled in.
 */
LIS_INT lis_fgmres_quad(LIS_SOLVER solver)
{
    LIS_MATRIX A;
    LIS_VECTOR b,x;
    LIS_VECTOR s,*z,*v;
    LIS_QUAD *h;
    LIS_QUAD_PTR aa,bb,rr,a2,b2,t,one,tmp;
    LIS_REAL bnrm2,nrm2,tol;
    LIS_INT iter,maxiter,n,output;
    double time,ptime;
    LIS_REAL rnorm;
    LIS_INT i,j,k,m;
    LIS_INT ii,i1,iiv,i1v,iih,jj;
    LIS_INT h_dim;
    LIS_INT cs,sn;

    LIS_DEBUG_FUNC_IN;

    A       = solver->A;
    b       = solver->b;
    x       = solver->x;
    n       = A->n;
    maxiter = solver->options[LIS_OPTIONS_MAXITER];
    output  = solver->options[LIS_OPTIONS_OUTPUT];
    m       = solver->options[LIS_OPTIONS_RESTART];
    h_dim   = m+1;
    ptime   = 0.0;

    /* work[1] is not referenced by this solver */
    s       = solver->work[0];
    z       = &solver->work[2];
    v       = &solver->work[m+2];

    h       = (LIS_QUAD *)lis_malloc( sizeof(LIS_QUAD)*(h_dim+1)*(h_dim+2),"lis_fgmres_quad::h" );
    cs      = (m+1)*h_dim;
    sn      = (m+2)*h_dim;

    LIS_QUAD_SCALAR_MALLOC(aa,0,1);
    LIS_QUAD_SCALAR_MALLOC(bb,1,1);
    LIS_QUAD_SCALAR_MALLOC(rr,2,1);
    LIS_QUAD_SCALAR_MALLOC(a2,3,1);
    LIS_QUAD_SCALAR_MALLOC(b2,4,1);
    LIS_QUAD_SCALAR_MALLOC(t,5,1);
    LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
    LIS_QUAD_SCALAR_MALLOC(one,7,1);

    one.hi[0] = 1.0;
    one.lo[0] = 0.0;

    /* Initial Residual */
    if( lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2) )
    {
        lis_free(h);
        LIS_DEBUG_FUNC_OUT;
        return LIS_SUCCESS;
    }
    tol   = solver->tol;
    rnorm = 1.0/bnrm2;

    /*
     * Seed the residual estimate with the value that is about to be placed
     * in s[0]; otherwise the MAXITER exit would read an indeterminate nrm2
     * whenever the outer loop is never entered (maxiter <= 0).
     */
    nrm2  = rnorm;

    iter = 0;
    while( iter<maxiter )
    {
        /* first column of V */
        /* v = r / ||r||_2 */
        lis_vector_scaleex_nm(bnrm2,v[0]);

        /* s = ||r||_2 e_1 */
        lis_vector_set_allex_nm(0.0,s);
        s->value[0]    = rnorm;
        s->value_lo[0] = 0.0;

        i = 0;
        do
        {
            iter++;
            i++;
            ii  = i-1;
            i1  = i;
            iiv = i-1;
            i1v = i;
            iih = (i-1)*h_dim;

            /* z = M^-1 * v */
            time = lis_wtime();
            lis_psolve(solver,v[iiv],z[iiv]);
            ptime += lis_wtime()-time;

            /* w = A * z */
            lis_matvec(A,z[iiv], v[i1v]);

            for(k=0;k<i;k++)
            {
                /* h[k,i]   = <w,v[k]>          */
                /* w        = w - h[k,i] * v[k] */
                lis_vector_dotex_mmm(v[i1v],v[k],&t);
                h[k+iih].hi = t.hi[0];
                h[k+iih].lo = t.lo[0];
                lis_quad_minus((LIS_QUAD *)t.hi);
                lis_vector_axpyex_mmm(t,v[k],v[i1v]);
            }

            /* h[i+1,i] = ||w||          */
            /* v[i+1]   = w / h[i+1,i]   */
            lis_vector_nrm2ex_mm(v[i1v],&t);
            h[i1+iih].hi = t.hi[0];
            h[i1+iih].lo = t.lo[0];
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)t.hi);
            lis_vector_scaleex_mm(tmp,v[i1v]);

            /* apply the previously computed rotations to the new column */
            for(k=1;k<=ii;k++)
            {
                jj      = k-1;
                t.hi[0] = h[jj+iih].hi;
                t.lo[0] = h[jj+iih].lo;
                lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)t.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
                lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)t.hi);
                lis_quad_minus((LIS_QUAD *)bb.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)bb.hi,(LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi);
                h[jj+iih].hi = aa.hi[0];
                h[jj+iih].lo = aa.lo[0];
                h[k+iih].hi  = bb.hi[0];
                h[k+iih].lo  = bb.lo[0];
            }

            /* build the new rotation from h[i,i] and h[i+1,i] */
            aa.hi[0] = h[ii+iih].hi;
            aa.lo[0] = h[ii+iih].lo;
            bb.hi[0] = h[i1+iih].hi;
            bb.lo[0] = h[i1+iih].lo;
            lis_quad_sqr((LIS_QUAD *)a2.hi,(LIS_QUAD *)aa.hi);
            lis_quad_sqr((LIS_QUAD *)b2.hi,(LIS_QUAD *)bb.hi);
            lis_quad_add((LIS_QUAD *)rr.hi,(LIS_QUAD *)a2.hi,(LIS_QUAD *)b2.hi);
            lis_quad_sqrt((LIS_QUAD *)rr.hi,(LIS_QUAD *)rr.hi);
            if( rr.hi[0]==0.0 )
            {
                /* avoid dividing by an exact zero below */
                rr.hi[0] = 1.0e-17;
                rr.lo[0] = 0.0;
            }
            lis_quad_div((LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)aa.hi,(LIS_QUAD *)rr.hi);
            lis_quad_div((LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)bb.hi,(LIS_QUAD *)rr.hi);

            /* rotate the right-hand side: s[i+1] = -sn*s[i], s[i] = cs*s[i] */
            tmp.hi[0] = s->value[ii];
            tmp.lo[0] = s->value_lo[ii];
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)tmp.hi);
            lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)tmp.hi);
            lis_quad_minus((LIS_QUAD *)aa.hi);
            s->value[i1]    = aa.hi[0];
            s->value_lo[i1] = aa.lo[0];
            s->value[ii]    = bb.hi[0];
            s->value_lo[ii] = bb.lo[0];

            /* h[i,i] = cs*h[i,i] + sn*h[i+1,i] */
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)&h[ii+iih]);
            lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)&h[i1+iih]);
            lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
            h[ii+iih].hi = aa.hi[0];
            h[ii+iih].lo = aa.lo[0];

            /* convergence check */
            nrm2 = fabs(s->value[i1]);

            if( output )
            {
                if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
                if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter,nrm2);
            }

            if( tol >= nrm2 ) break;
        } while( i<m && iter <maxiter );

        /* Solve H * Y = S for upper Hessenberg matrix H (back substitution) */
        tmp.hi[0] = s->value[ii];
        tmp.lo[0] = s->value_lo[ii];
        lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+iih]);
        s->value[ii]    = tmp.hi[0];
        s->value_lo[ii] = tmp.lo[0];
        for(k=1;k<=ii;k++)
        {
            jj      = ii-k;
            t.hi[0] = s->value[jj];
            t.lo[0] = s->value_lo[jj];
            for(j=jj+1;j<=ii;j++)
            {
                tmp.hi[0] = s->value[j];
                tmp.lo[0] = s->value_lo[j];
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+j*h_dim]);
                lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)tmp.hi);
            }
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)&h[jj+jj*h_dim]);
            s->value[jj]    = tmp.hi[0];
            s->value_lo[jj] = tmp.lo[0];
        }

        /* x = x + y * z */
        for(j=0;j<=ii;j++)
        {
            aa.hi[0] = s->value[j];
            aa.lo[0] = s->value_lo[j];
            lis_vector_axpyex_mmm(aa,z[j],x);
        }

        if( tol >= nrm2 )
        {
            solver->retcode = LIS_SUCCESS;
            solver->iter    = iter;
            solver->resid   = nrm2;
            solver->ptime   = ptime;
            lis_free(h);
            LIS_DEBUG_FUNC_OUT;
            return LIS_SUCCESS;
        }

        /* restart: v[0] = b - A*x with cleared low words, then renormalise */
        lis_matvec(A,x,v[0]);
        lis_vector_xpay(b,-1.0,v[0]);
        memset(v[0]->value_lo,0,n*sizeof(LIS_SCALAR));
        lis_vector_nrm2(v[0],&rnorm);
        bnrm2 = 1.0/rnorm;
    }

    solver->retcode = LIS_MAXITER;
    solver->iter    = iter+1;
    solver->resid   = nrm2;
    /* keep the measured preconditioner time on this exit as well */
    solver->ptime   = ptime;
    lis_free(h);
    LIS_DEBUG_FUNC_OUT;
    return LIS_MAXITER;
}
/*
 * Two-phase restarted GMRES(m).
 *
 * Phase 1 iterates with plain LIS_SCALAR arithmetic (through the hd view of
 * the h buffer) until the residual estimate reaches solver->tol_switch or
 * LIS_OPTIONS_SWITCH_MAXITER iterations have been spent. Phase 2 then
 * continues from the current x using hi/lo pair (LIS_QUAD) arithmetic until
 * solver->tol or LIS_OPTIONS_MAXITER is reached.
 *
 * Work vectors taken from solver->work:
 *   work[0] s, work[1] r, work[2] z, work[3..] v (Krylov basis)
 *
 * Layout of h / hd (column stride h_dim = m+1):
 *   [k + (i-1)*h_dim]  entry (k, i-1) of the Hessenberg matrix
 *   [j + cs]           cosine of the j-th Givens rotation
 *   [j + sn]           sine of the j-th Givens rotation
 *
 * Returns LIS_SUCCESS when lis_solver_get_initial_residual() reports a
 * non-zero status before phase 1 or when phase 2 converges; returns
 * LIS_MAXITER otherwise. solver->iter receives the total iteration count
 * and solver->iter2 the number of phase-1 iterations.
 */
LIS_INT lis_gmres_switch(LIS_SOLVER solver)
{
    LIS_MATRIX A;
    LIS_VECTOR b,x;
    LIS_VECTOR r,s,z,*v;
    LIS_QUAD *h;
    LIS_SCALAR *hd;
    LIS_QUAD_PTR aa,bb,rr,a2,b2,t,one,tmp;
    LIS_QUAD_PTR rnorm;
    LIS_REAL bnrm2,nrm2,tol,tol2;
    LIS_INT iter,maxiter,n,output;
    LIS_INT iter2,maxiter2;
    double time,ptime;
    LIS_INT i,j,k,m;
    LIS_INT ii,i1,iiv,i1v,iih,jj;
    LIS_INT h_dim;
    LIS_INT cs,sn;

    LIS_DEBUG_FUNC_IN;

    A        = solver->A;
    b        = solver->b;
    x        = solver->x;
    n        = A->n;
    maxiter  = solver->options[LIS_OPTIONS_MAXITER];
    maxiter2 = solver->options[LIS_OPTIONS_SWITCH_MAXITER];
    output   = solver->options[LIS_OPTIONS_OUTPUT];
    m        = solver->options[LIS_OPTIONS_RESTART];
    h_dim    = m+1;
    ptime    = 0.0;

    s        = solver->work[0];
    r        = solver->work[1];
    z        = solver->work[2];
    v        = &solver->work[3];

    LIS_QUAD_SCALAR_MALLOC(aa,0,1);
    LIS_QUAD_SCALAR_MALLOC(bb,1,1);
    LIS_QUAD_SCALAR_MALLOC(rr,2,1);
    LIS_QUAD_SCALAR_MALLOC(a2,3,1);
    LIS_QUAD_SCALAR_MALLOC(b2,4,1);
    LIS_QUAD_SCALAR_MALLOC(t,5,1);
    LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
    LIS_QUAD_SCALAR_MALLOC(one,7,1);
    LIS_QUAD_SCALAR_MALLOC(rnorm,8,1);

    h        = (LIS_QUAD *)lis_malloc( sizeof(LIS_QUAD)*(h_dim+1)*(h_dim+2),"lis_gmres_switch::h" );
    /* phase 1 indexes the same storage as an array of LIS_SCALAR */
    hd       = (LIS_SCALAR *)h;
    cs       = (m+1)*h_dim;
    sn       = (m+2)*h_dim;

    one.hi[0] = 1.0;
    one.lo[0] = 0.0;

    z->precision = LIS_PRECISION_DEFAULT;

    /* r = M^-1 * (b - A * x) */
    lis_matvec(A,x,z);
    lis_vector_xpay(b,-1.0,z);
    lis_psolve(solver,z,v[0]);

    /* Initial Residual */
    if( lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2) )
    {
        lis_free(h);
        LIS_DEBUG_FUNC_OUT;
        return LIS_SUCCESS;
    }
    tol2 = solver->tol_switch;

    /* ---------------- phase 1: LIS_SCALAR arithmetic ---------------- */
    iter = 0;
    while( iter<maxiter2 )
    {
        /* first column of V */
        /* v = r / ||r||_2 */
        lis_vector_nrm2(v[0],&rnorm.hi[0]);
        lis_vector_scale(1.0/rnorm.hi[0],v[0]);

        /* s = ||r||_2 e_1 */
        lis_vector_set_all(0,s);
        s->value[0] = rnorm.hi[0];

        i = 0;
        do
        {
            iter++;
            i++;
            ii  = i-1;
            i1  = i;
            iiv = i-1;
            i1v = i;
            iih = (i-1)*h_dim;

            /* z = M^-1 * v */
            time = lis_wtime();
            lis_psolve(solver,v[iiv],z);
            ptime += lis_wtime()-time;

            /* w = A * z */
            lis_matvec(A,z, v[i1v]);

            for(k=0;k<i;k++)
            {
                /* h[k,i]   = <w,v[k]>          */
                /* w        = w - h[k,i] * v[k] */
                lis_vector_dot(v[i1v],v[k],&t.hi[0]);
                hd[k+iih] = t.hi[0];
                lis_vector_axpy(-t.hi[0],v[k],v[i1v]);
            }

            /* h[i+1,i] = ||w||          */
            /* v[i+1]   = w / h[i+1,i]   */
            lis_vector_nrm2(v[i1v],&t.hi[0]);
            hd[i1+iih] = t.hi[0];
            lis_vector_scale(1.0/t.hi[0],v[i1v]);

            /* apply the previously computed rotations to the new column */
            for(k=1;k<=ii;k++)
            {
                jj         = k-1;
                t.hi[0]    = hd[jj+iih];
                aa.hi[0]   = hd[jj+cs]*t.hi[0];
                aa.hi[0]  += hd[jj+sn]*hd[k+iih];
                bb.hi[0]   = -hd[jj+sn]*t.hi[0];
                bb.hi[0]  += hd[jj+cs]*hd[k+iih];
                hd[jj+iih] = aa.hi[0];
                hd[k+iih]  = bb.hi[0];
            }

            /* build the new rotation from h[i,i] and h[i+1,i] */
            aa.hi[0] = hd[ii+iih];
            bb.hi[0] = hd[i1+iih];
            a2.hi[0] = aa.hi[0]*aa.hi[0];
            b2.hi[0] = bb.hi[0]*bb.hi[0];
            rr.hi[0] = sqrt(a2.hi[0]+b2.hi[0]);
            if( rr.hi[0]==0.0 ) rr.hi[0] = 1.0e-17;
            hd[ii+cs] = aa.hi[0]/rr.hi[0];
            hd[ii+sn] = bb.hi[0]/rr.hi[0];

            /* rotate the right-hand side */
            s->value[i1] = -hd[ii+sn]*s->value[ii];
            s->value[ii] = hd[ii+cs]*s->value[ii];

            /* h[i,i] = cs*h[i,i] + sn*h[i+1,i] */
            aa.hi[0]   = hd[ii+cs]*hd[ii+iih];
            aa.hi[0]  += hd[ii+sn]*hd[i1+iih];
            hd[ii+iih] = aa.hi[0];

            /* convergence check */
            nrm2 = fabs(s->value[i1])*bnrm2;

            if( output )
            {
                if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
                if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter,nrm2);
            }

            if( tol2 >= nrm2 ) break;
        } while( i<m && iter <maxiter2 );

        /* Solve H * Y = S for upper Hessenberg matrix H (back substitution) */
        s->value[ii] = s->value[ii]/hd[ii+iih];
        for(k=1;k<=ii;k++)
        {
            jj      = ii-k;
            t.hi[0] = s->value[jj];
            for(j=jj+1;j<=ii;j++)
            {
                t.hi[0] -= hd[jj+j*h_dim]*s->value[j];
            }
            s->value[jj] = t.hi[0]/hd[jj+jj*h_dim];
        }

        /* z = sum_j y[j] * v[j] */
        for(k=0;k<n;k++)
        {
            z->value[k] = s->value[0]*v[0]->value[k];
        }
        for(j=1;j<=ii;j++)
        {
            lis_vector_axpy(s->value[j],v[j],z);
        }

        /* r = M^-1 * z */
        time = lis_wtime();
        lis_psolve(solver,z,r);
        ptime += lis_wtime()-time;

        /* x = x + r */
        lis_vector_axpy(1,r,x);

        if( tol2 >= nrm2 )
        {
            /* switch tolerance reached: hand over to the quad phase */
            solver->iter  = iter;
            solver->iter2 = iter;
            solver->ptime = ptime;
            break;
        }

        /* rebuild the residual in v[0] from the rotated right-hand side */
        for(j=1;j<=i;j++)
        {
            jj             = i1-j+1;
            s->value[jj-1] = -hd[jj-1+sn]*s->value[jj];
            s->value[jj]   = hd[jj-1+cs]*s->value[jj];
        }

        for(j=0;j<=i1;j++)
        {
            t.hi[0] = s->value[j];
            if( j==0 ) t.hi[0] = t.hi[0]-1.0;
            lis_vector_axpy(t.hi[0],v[j],v[0]);
        }
    }

    /* ---------------- phase 2: hi/lo pair arithmetic ---------------- */

    /* Initial Residual */
    z->precision = LIS_PRECISION_QUAD;

    solver->options[LIS_OPTIONS_INITGUESS_ZEROS] = LIS_FALSE;
    lis_vector_copyex_mn(x,solver->xx);

    lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2);
    tol = solver->tol;

    iter2 = iter;
    while( iter2<maxiter )
    {
        /* first column of V */
        /* v = r / ||r||_2 */
        lis_vector_nrm2ex_mm(v[0],&rnorm);
        lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)rnorm.hi);
        lis_vector_scaleex_mm(tmp,v[0]);

        /* s = ||r||_2 e_1 */
        lis_vector_set_allex_nm(0.0,s);
        s->value[0]    = rnorm.hi[0];
        s->value_lo[0] = rnorm.lo[0];

        i = 0;
        do
        {
            iter2++;
            i++;
            ii  = i-1;
            i1  = i;
            iiv = i-1;
            i1v = i;
            iih = (i-1)*h_dim;

            /* z = M^-1 * v */
            time = lis_wtime();
            lis_psolve(solver,v[iiv],z);
            ptime += lis_wtime()-time;

            /* w = A * z */
            lis_matvec(A,z, v[i1v]);

            for(k=0;k<i;k++)
            {
                /* h[k,i]   = <w,v[k]>          */
                /* w        = w - h[k,i] * v[k] */
                lis_vector_dotex_mmm(v[i1v],v[k],&t);
                h[k+iih].hi = t.hi[0];
                h[k+iih].lo = t.lo[0];
                lis_quad_minus((LIS_QUAD *)t.hi);
                lis_vector_axpyex_mmm(t,v[k],v[i1v]);
            }

            /* h[i+1,i] = ||w||          */
            /* v[i+1]   = w / h[i+1,i]   */
            lis_vector_nrm2ex_mm(v[i1v],&t);
            h[i1+iih].hi = t.hi[0];
            h[i1+iih].lo = t.lo[0];
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)t.hi);
            lis_vector_scaleex_mm(tmp,v[i1v]);

            /* apply the previously computed rotations to the new column */
            for(k=1;k<=ii;k++)
            {
                jj      = k-1;
                t.hi[0] = h[jj+iih].hi;
                t.lo[0] = h[jj+iih].lo;
                lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)t.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
                lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)t.hi);
                lis_quad_minus((LIS_QUAD *)bb.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)bb.hi,(LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi);
                h[jj+iih].hi = aa.hi[0];
                h[jj+iih].lo = aa.lo[0];
                h[k+iih].hi  = bb.hi[0];
                h[k+iih].lo  = bb.lo[0];
            }

            /* build the new rotation from h[i,i] and h[i+1,i] */
            aa.hi[0] = h[ii+iih].hi;
            aa.lo[0] = h[ii+iih].lo;
            bb.hi[0] = h[i1+iih].hi;
            bb.lo[0] = h[i1+iih].lo;
            lis_quad_sqr((LIS_QUAD *)a2.hi,(LIS_QUAD *)aa.hi);
            lis_quad_sqr((LIS_QUAD *)b2.hi,(LIS_QUAD *)bb.hi);
            lis_quad_add((LIS_QUAD *)rr.hi,(LIS_QUAD *)a2.hi,(LIS_QUAD *)b2.hi);
            lis_quad_sqrt((LIS_QUAD *)rr.hi,(LIS_QUAD *)rr.hi);
            if( rr.hi[0]==0.0 )
            {
                /* same zero-divisor guard that phase 1 applies */
                rr.hi[0] = 1.0e-17;
                rr.lo[0] = 0.0;
            }
            lis_quad_div((LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)aa.hi,(LIS_QUAD *)rr.hi);
            lis_quad_div((LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)bb.hi,(LIS_QUAD *)rr.hi);

            /* rotate the right-hand side: s[i+1] = -sn*s[i], s[i] = cs*s[i] */
            tmp.hi[0] = s->value[ii];
            tmp.lo[0] = s->value_lo[ii];
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)tmp.hi);
            lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)tmp.hi);
            lis_quad_minus((LIS_QUAD *)aa.hi);
            s->value[i1]    = aa.hi[0];
            s->value_lo[i1] = aa.lo[0];
            s->value[ii]    = bb.hi[0];
            s->value_lo[ii] = bb.lo[0];

            /* h[i,i] = cs*h[i,i] + sn*h[i+1,i] */
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)&h[ii+iih]);
            lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)&h[i1+iih]);
            lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
            h[ii+iih].hi = aa.hi[0];
            h[ii+iih].lo = aa.lo[0];

            /* convergence check */
            nrm2 = fabs(s->value[i1])*bnrm2;

            if( output )
            {
                /* iter2 is the running counter here; iter is frozen at the phase-1 total */
                if( output & LIS_PRINT_MEM ) solver->rhistory[iter2] = nrm2;
                if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter2,nrm2);
            }

            if( tol >= nrm2 ) break;
        } while( i<m && iter2 <maxiter );

        /* Solve H * Y = S for upper Hessenberg matrix H (back substitution) */
        tmp.hi[0] = s->value[ii];
        tmp.lo[0] = s->value_lo[ii];
        lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+iih]);
        s->value[ii]    = tmp.hi[0];
        s->value_lo[ii] = tmp.lo[0];
        for(k=1;k<=ii;k++)
        {
            jj      = ii-k;
            t.hi[0] = s->value[jj];
            t.lo[0] = s->value_lo[jj];
            for(j=jj+1;j<=ii;j++)
            {
                tmp.hi[0] = s->value[j];
                tmp.lo[0] = s->value_lo[j];
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+j*h_dim]);
                lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)tmp.hi);
            }
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)&h[jj+jj*h_dim]);
            s->value[jj]    = tmp.hi[0];
            s->value_lo[jj] = tmp.lo[0];
        }

        /* z = sum_j y[j] * v[j]; y[0] is loop-invariant, so load it once */
        aa.hi[0] = s->value[0];
        aa.lo[0] = s->value_lo[0];
        for(k=0;k<n;k++)
        {
            bb.hi[0] = v[0]->value[k];
            bb.lo[0] = v[0]->value_lo[k];
            lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)bb.hi);
            z->value[k]    = tmp.hi[0];
            z->value_lo[k] = tmp.lo[0];
        }
        for(j=1;j<=ii;j++)
        {
            aa.hi[0] = s->value[j];
            aa.lo[0] = s->value_lo[j];
            lis_vector_axpyex_mmm(aa,v[j],z);
        }

        /* r = M^-1 * z */
        time = lis_wtime();
        lis_psolve(solver,z,r);
        ptime += lis_wtime()-time;

        /* x = x + r */
        lis_vector_axpyex_mmm(one,r,x);

        if( tol >= nrm2 )
        {
            solver->retcode = LIS_SUCCESS;
            solver->iter    = iter2;
            solver->iter2   = iter;
            solver->resid   = nrm2;
            solver->ptime   = ptime;
            lis_free(h);
            LIS_DEBUG_FUNC_OUT;
            return LIS_SUCCESS;
        }

        /* rebuild the residual in v[0] from the rotated right-hand side */
        for(j=1;j<=i;j++)
        {
            jj        = i1-j+1;
            tmp.hi[0] = s->value[jj];
            tmp.lo[0] = s->value_lo[jj];
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj-1+sn]);
            lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj-1+cs]);
            lis_quad_minus((LIS_QUAD *)aa.hi);
            s->value[jj-1]    = aa.hi[0];
            s->value_lo[jj-1] = aa.lo[0];
            s->value[jj]      = bb.hi[0];
            s->value_lo[jj]   = bb.lo[0];
        }

        for(j=0;j<=i1;j++)
        {
            t.hi[0] = s->value[j];
            t.lo[0] = s->value_lo[j];
            if( j==0 )
            {
                lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)one.hi);
            }
            lis_vector_axpyex_mmm(t,v[j],v[0]);
        }
    }

    /*
     * NOTE(review): nrm2 is only assigned inside the two Arnoldi loops, so it
     * is indeterminate here if both iteration limits are <= 0.
     */
    solver->retcode = LIS_MAXITER;
    solver->iter    = iter2+1;
    solver->iter2   = iter;
    solver->resid   = nrm2;
    /* keep the measured preconditioner time on this exit as well */
    solver->ptime   = ptime;
    lis_free(h);
    LIS_DEBUG_FUNC_OUT;
    return LIS_MAXITER;
}
/*
 * Restarted GMRES(m) with the Arnoldi/Givens bookkeeping carried out on
 * hi/lo pairs (LIS_QUAD).
 *
 * Work vectors taken from solver->work:
 *   work[0] s, work[1] r, work[2] z, work[3..] v (Krylov basis)
 *
 * Layout of h (column stride h_dim = m+1):
 *   h[k + (i-1)*h_dim]  entry (k, i-1) of the Hessenberg matrix
 *   h[j + cs]           cosine of the j-th Givens rotation
 *   h[j + sn]           sine of the j-th Givens rotation
 *
 * Matrix-vector products, residual history, progress printing and the
 * preconditioner timer go through the same entry points as the other
 * solvers in this file (lis_matvec, solver->rhistory, lis_print_rhistory,
 * solver->ptime).
 *
 * Returns LIS_SUCCESS when lis_solver_get_initial_residual() reports a
 * non-zero status or when the scaled residual estimate drops to
 * solver->tol, and LIS_MAXITER when the iteration budget is exhausted.
 */
LIS_INT lis_gmres_quad(LIS_SOLVER solver)
{
    LIS_MATRIX A;
    LIS_VECTOR x;
    LIS_VECTOR r,s,z,*v;
    LIS_QUAD *h;
    LIS_QUAD_PTR aa,bb,rr,a2,b2,t,one,tmp;
    LIS_QUAD_PTR rnorm;
    LIS_REAL bnrm2,nrm2,tol;
    LIS_INT iter,maxiter,n,output;
    double times,ptimes;
    LIS_INT i,j,k,m;
    LIS_INT ii,i1,iiv,i1v,iih,jj;
    LIS_INT h_dim;
    LIS_INT cs,sn;

    LIS_DEBUG_FUNC_IN;

    A       = solver->A;
    x       = solver->x;
    n       = A->n;
    maxiter = solver->options[LIS_OPTIONS_MAXITER];
    output  = solver->options[LIS_OPTIONS_OUTPUT];
    m       = solver->options[LIS_OPTIONS_RESTART];
    h_dim   = m+1;
    ptimes  = 0.0;

    s       = solver->work[0];
    r       = solver->work[1];
    z       = solver->work[2];
    v       = &solver->work[3];

    LIS_QUAD_SCALAR_MALLOC(aa,0,1);
    LIS_QUAD_SCALAR_MALLOC(bb,1,1);
    LIS_QUAD_SCALAR_MALLOC(rr,2,1);
    LIS_QUAD_SCALAR_MALLOC(a2,3,1);
    LIS_QUAD_SCALAR_MALLOC(b2,4,1);
    LIS_QUAD_SCALAR_MALLOC(t,5,1);
    LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
    LIS_QUAD_SCALAR_MALLOC(one,7,1);
    LIS_QUAD_SCALAR_MALLOC(rnorm,8,1);

    h       = (LIS_QUAD *)lis_malloc( sizeof(LIS_QUAD) * (h_dim+1) * (h_dim+2),"lis_gmres_quad::h" );
    cs      = (m+1)*h_dim;
    sn      = (m+2)*h_dim;

    one.hi[0] = 1.0;
    one.lo[0] = 0.0;

    /* Initial Residual */
    if( lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2) )
    {
        lis_free(h);
        LIS_DEBUG_FUNC_OUT;
        return LIS_SUCCESS;
    }
    tol = solver->tol;

    iter = 0;
    while( iter<maxiter )
    {
        /* first column of V */
        /* v = r / ||r||_2 */
        lis_vector_nrm2ex_mm(v[0],&rnorm);
        lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)rnorm.hi);
        lis_vector_scaleex_mm(tmp,v[0]);

        /* s = ||r||_2 e_1 */
        lis_vector_set_allex_nm(0.0,s);
        s->value[0]    = rnorm.hi[0];
        s->value_lo[0] = rnorm.lo[0];

        i = 0;
        do
        {
            iter++;
            i++;
            ii  = i-1;
            i1  = i;
            iiv = i-1;
            i1v = i;
            iih = (i-1)*h_dim;

            /* z = M^-1 v */
            times = lis_wtime();
            lis_psolve(solver, v[iiv], z);
            ptimes += lis_wtime()-times;

            /* v = Az */
            lis_matvec(A,z, v[i1v]);

            for(k=0;k<i;k++)
            {
                /* h[k,i]   = <w,v[k]>       */
                /* w        = w - h[k,i]v[k] */
                lis_vector_dotex_mmm(v[i1v],v[k],&t);
                h[k + iih].hi = t.hi[0];
                h[k + iih].lo = t.lo[0];
                lis_quad_minus((LIS_QUAD *)t.hi);
                lis_vector_axpyex_mmm(t,v[k],v[i1v]);
            }

            /* h[i+1,i] = ||w||          */
            /* v[i+1]   = w / h[i+1,i]   */
            lis_vector_nrm2ex_mm(v[i1v],&t);
            h[i1 + iih].hi = t.hi[0];
            h[i1 + iih].lo = t.lo[0];
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)t.hi);
            lis_vector_scaleex_mm(tmp,v[i1v]);

            /* apply the previously computed rotations to the new column */
            for(k=1;k<=ii;k++)
            {
                jj      = k-1;
                t.hi[0] = h[jj + iih].hi;
                t.lo[0] = h[jj + iih].lo;
                lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)t.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
                lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)t.hi);
                lis_quad_minus((LIS_QUAD *)bb.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)bb.hi,(LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi);
                h[jj + iih].hi = aa.hi[0];
                h[jj + iih].lo = aa.lo[0];
                h[k + iih].hi  = bb.hi[0];
                h[k + iih].lo  = bb.lo[0];
            }

            /* build the new rotation from h[i,i] and h[i+1,i] */
            aa.hi[0] = h[ii + iih].hi;
            aa.lo[0] = h[ii + iih].lo;
            bb.hi[0] = h[i1 + iih].hi;
            bb.lo[0] = h[i1 + iih].lo;
            lis_quad_sqr((LIS_QUAD *)a2.hi,(LIS_QUAD *)aa.hi);
            lis_quad_sqr((LIS_QUAD *)b2.hi,(LIS_QUAD *)bb.hi);
            lis_quad_add((LIS_QUAD *)rr.hi,(LIS_QUAD *)a2.hi,(LIS_QUAD *)b2.hi);
            lis_quad_sqrt((LIS_QUAD *)rr.hi,(LIS_QUAD *)rr.hi);
            if( rr.hi[0]==0.0 )
            {
                /* avoid dividing by an exact zero below */
                rr.hi[0] = 1.0e-17;
                rr.lo[0] = 0.0;
            }
            lis_quad_div((LIS_QUAD *)&h[ii + cs],(LIS_QUAD *)aa.hi,(LIS_QUAD *)rr.hi);
            lis_quad_div((LIS_QUAD *)&h[ii + sn],(LIS_QUAD *)bb.hi,(LIS_QUAD *)rr.hi);

            /* rotate the right-hand side: s[i+1] = -sn*s[i], s[i] = cs*s[i] */
            tmp.hi[0] = s->value[ii];
            tmp.lo[0] = s->value_lo[ii];
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii + sn],(LIS_QUAD *)tmp.hi);
            lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[ii + cs],(LIS_QUAD *)tmp.hi);
            lis_quad_minus((LIS_QUAD *)aa.hi);
            s->value[i1]    = aa.hi[0];
            s->value_lo[i1] = aa.lo[0];
            s->value[ii]    = bb.hi[0];
            s->value_lo[ii] = bb.lo[0];

            /* h[i,i] = cs*h[i,i] + sn*h[i+1,i] */
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)&h[ii+iih]);
            lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)&h[i1+iih]);
            lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
            h[ii + iih].hi = aa.hi[0];
            h[ii + iih].lo = aa.lo[0];

            /* convergence check */
            nrm2 = fabs(s->value[i1]) * bnrm2;

            if( output )
            {
                if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
                if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter,nrm2);
            }

            if( tol >= nrm2 ) break;
        } while( i<m && iter <maxiter );

        /* Solve H*Y = S for the (now upper triangular) H by back substitution */
        tmp.hi[0] = s->value[ii];
        tmp.lo[0] = s->value_lo[ii];
        lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii + iih]);
        s->value[ii]    = tmp.hi[0];
        s->value_lo[ii] = tmp.lo[0];
        for(k=1;k<=ii;k++)
        {
            jj      = ii-k;
            t.hi[0] = s->value[jj];
            t.lo[0] = s->value_lo[jj];
            for(j=jj+1;j<=ii;j++)
            {
                tmp.hi[0] = s->value[j];
                tmp.lo[0] = s->value_lo[j];
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj + j*h_dim]);
                lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)tmp.hi);
            }
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)&h[jj + jj*h_dim]);
            s->value[jj]    = tmp.hi[0];
            s->value_lo[jj] = tmp.lo[0];
        }

        /* z = sum_j y[j] * v[j]; y[0] is loop-invariant, so load it once */
        aa.hi[0] = s->value[0];
        aa.lo[0] = s->value_lo[0];
        for(k=0;k<n;k++)
        {
            bb.hi[0] = v[0]->value[k];
            bb.lo[0] = v[0]->value_lo[k];
            lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)bb.hi);
            z->value[k]    = tmp.hi[0];
            z->value_lo[k] = tmp.lo[0];
        }
        for(j=1;j<=ii;j++)
        {
            aa.hi[0] = s->value[j];
            aa.lo[0] = s->value_lo[j];
            lis_vector_axpyex_mmm(aa,v[j],z);
        }

        /* r = M^-1 z */
        times = lis_wtime();
        lis_psolve(solver, z, r);
        ptimes += lis_wtime()-times;

        /* x = x + r */
        lis_vector_axpyex_mmm(one,r,x);

        if( tol >= nrm2 )
        {
            solver->retcode = LIS_SUCCESS;
            solver->iter    = iter;
            solver->iter2   = 0;
            solver->resid   = nrm2;
            solver->ptime   = ptimes;
            lis_free(h);
            LIS_DEBUG_FUNC_OUT;
            return LIS_SUCCESS;
        }

        /* rebuild the residual in v[0] from the rotated right-hand side */
        for(j=1;j<=i;j++)
        {
            jj        = i1-j+1;
            tmp.hi[0] = s->value[jj];
            tmp.lo[0] = s->value_lo[jj];
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj-1 + sn]);
            lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj-1 + cs]);
            lis_quad_minus((LIS_QUAD *)aa.hi);
            s->value[jj-1]    = aa.hi[0];
            s->value_lo[jj-1] = aa.lo[0];
            s->value[jj]      = bb.hi[0];
            s->value_lo[jj]   = bb.lo[0];
        }

        for(j=0;j<=i1;j++)
        {
            t.hi[0] = s->value[j];
            t.lo[0] = s->value_lo[j];
            if( j==0 )
            {
                lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)one.hi);
            }
            lis_vector_axpyex_mmm(t,v[j],v[0]);
        }
    }

    /*
     * NOTE(review): nrm2 is only assigned inside the Arnoldi loop, so it is
     * indeterminate here if maxiter <= 0.
     */
    solver->retcode = LIS_MAXITER;
    solver->iter    = iter+1;
    solver->iter2   = 0;
    solver->resid   = nrm2;
    /* keep the measured preconditioner time on this exit as well */
    solver->ptime   = ptimes;
    lis_free(h);
    LIS_DEBUG_FUNC_OUT;
    return LIS_MAXITER;
}