LIS_INT lis_finalize(void) { LIS_DEBUG_FUNC_IN; lis_precon_register_free(); /* if( cmd_args ) lis_args_free(cmd_args); */ if( cmd_args ) { lis_args_free(cmd_args); cmd_args = NULL; } #ifdef _OPENMP lis_free(lis_vec_tmp); #endif #ifdef USE_QUAD_PRECISION lis_free(lis_quad_scalar_tmp); lis_quad_x87_fpu_finalize(lis_x87_fpu_cw); #endif lis_free_all(); #ifdef USE_MPI if (!lis_mpi_initialized) MPI_Finalize(); #endif LIS_DEBUG_FUNC_OUT; return LIS_SUCCESS; }
/*
 * lis_vector_destroy - free a vector and the arrays hanging off it.
 *
 * Nothing is done unless lis_is_malloc(vec) reports true.  The value array
 * is released only when the vector's is_destroy flag is set; the work and
 * ranges arrays are released whenever they are non-NULL.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_vector_destroy(LIS_VECTOR vec)
{
	LIS_DEBUG_FUNC_IN;

	if( !lis_is_malloc(vec) )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	if( vec->is_destroy && vec->value )
	{
		lis_free( vec->value );
	}
	if( vec->work )
	{
		lis_free( vec->work );
	}
	if( vec->ranges )
	{
		lis_free( vec->ranges );
	}
	/* vec has just been dereferenced, so it is known to be non-NULL here */
	lis_free(vec);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_solver_destroy - free a solver object.
 *
 * For a non-NULL solver this destroys the work vectors, the vector d, the
 * matrix At (if set), the residual array (if set) and then the solver
 * structure itself.  A NULL solver is accepted and ignored.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_solver_destroy(LIS_SOLVER solver)
{
	LIS_DEBUG_FUNC_IN;

	if( solver==NULL )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	lis_solver_work_destroy(solver);
	lis_vector_destroy(solver->d);

	if( solver->At )
	{
		lis_matrix_destroy(solver->At);
	}
	if( solver->residual )
	{
		lis_free(solver->residual);
	}
	lis_free(solver);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_esolver_destroy - free an eigensolver object.
 *
 * Releases the work vectors, the rhistory/evalue/resid/iter/iter2 arrays
 * and the evector array.  The individual eigenvectors (SUBSPACE option + 2
 * of them) are destroyed only when the selected eigensolver is LI, AI or SI.
 * A NULL esolver is accepted and ignored.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_esolver_destroy(LIS_ESOLVER esolver)
{
	LIS_INT idx,nvec,kind;

	LIS_DEBUG_FUNC_IN;

	if( esolver==NULL )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	lis_esolver_work_destroy(esolver);

	if( esolver->rhistory ) lis_free(esolver->rhistory);
	if( esolver->evalue )   lis_free(esolver->evalue);
	if( esolver->resid )    lis_free(esolver->resid);
	if( esolver->iter )     lis_free(esolver->iter);
	if( esolver->iter2 )    lis_free(esolver->iter2);

	if( esolver->evector )
	{
		kind = esolver->options[LIS_EOPTIONS_ESOLVER];
		if( kind==LIS_ESOLVER_LI || kind==LIS_ESOLVER_AI || kind==LIS_ESOLVER_SI )
		{
			nvec = esolver->options[LIS_EOPTIONS_SUBSPACE] + 2;
			for(idx=0;idx<nvec;idx++)
			{
				lis_vector_destroy(esolver->evector[idx]);
			}
		}
		lis_free(esolver->evector);
	}

	lis_free(esolver);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
LIS_INT lis_matrix_setDLU_csr(LIS_INT nnzl, LIS_INT nnzu, LIS_SCALAR *diag, LIS_INT *lptr, LIS_INT *lindex, LIS_SCALAR *lvalue, LIS_INT *uptr, LIS_INT *uindex, LIS_SCALAR *uvalue, LIS_MATRIX A) { LIS_INT err; LIS_MATRIX_DIAG D; LIS_DEBUG_FUNC_IN; #if 0 err = lis_matrix_check(A,LIS_MATRIX_CHECK_SET); if( err ) return err; #else if(lis_matrix_is_assembled(A)) return LIS_SUCCESS; else { err = lis_matrix_check(A,LIS_MATRIX_CHECK_SET); if( err ) return err; } #endif A->L = (LIS_MATRIX_CORE)lis_calloc(sizeof(struct LIS_MATRIX_CORE_STRUCT), "lis_matrix_setDLU_csr::A->L"); if( A->L==NULL ) { LIS_SETERR_MEM(sizeof(struct LIS_MATRIX_CORE_STRUCT)); return LIS_OUT_OF_MEMORY; } A->U = (LIS_MATRIX_CORE)lis_calloc(sizeof(struct LIS_MATRIX_CORE_STRUCT), "lis_matrix_setDLU_csr::A->U"); if( A->U==NULL ) { LIS_SETERR_MEM(sizeof(struct LIS_MATRIX_CORE_STRUCT)); lis_matrix_DLU_destroy(A); return LIS_OUT_OF_MEMORY; } err = lis_matrix_diag_create(A->n,0,A->comm,&D); if( err ) { lis_matrix_DLU_destroy(A); return err; } lis_free(D->value); D->value = diag; A->D = D; A->L->nnz = nnzl; A->L->ptr = lptr; A->L->index = lindex; A->L->value = lvalue; A->U->nnz = nnzu; A->U->ptr = uptr; A->U->index = uindex; A->U->value = uvalue; A->is_copy = LIS_FALSE; A->status = -LIS_MATRIX_CSR; A->is_splited = LIS_TRUE; LIS_DEBUG_FUNC_OUT; return LIS_SUCCESS; }
/*
 * lis_set_argv_end_f - discard the temporary argument vector.
 *
 * Frees each of the f_argc_tmp strings in f_argv_tmp, frees the array
 * itself, resets both globals and stores LIS_SUCCESS in *ierr.
 */
void lis_set_argv_end_f(LIS_INT *ierr)
{
	LIS_INT k;

	LIS_DEBUG_FUNC_IN;

	k = 0;
	while( k<f_argc_tmp )
	{
		lis_free(f_argv_tmp[k]);
		k++;
	}
	lis_free(f_argv_tmp);

	f_argv_tmp = NULL;
	f_argc_tmp = 0;

	*ierr = LIS_SUCCESS;

	LIS_DEBUG_FUNC_OUT;
	return;
}
/*
 * lis_quad_free - release the storage behind a quad-precision pointer pair.
 *
 * Only a->hi is passed to lis_free(); a->lo is not freed here (presumably
 * hi and lo live in one allocation -- confirm against the allocator).
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_quad_free(LIS_QUAD_PTR *a)
{
	LIS_DEBUG_FUNC_IN;

	lis_free((*a).hi);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_input_hb - read a matrix (and vectors) via lis_input_hb_csr() and
 * convert the result to the storage type that A requested.
 *
 * A    : matrix to fill; its matrix_type on entry selects the final format
 * b, x : vectors handed through to lis_input_hb_csr()
 * file : open input stream
 *
 * The requested type is sampled before reading (presumably because the
 * reader overwrites A->matrix_type -- confirm).  When it is neither CSR nor
 * CSC, the data is converted into a duplicate B, A's old storage is torn
 * down and B's structure is copied into A.  For JAD an n-element work array
 * is allocated as well.
 *
 * Returns LIS_SUCCESS, the first error reported by a helper, or
 * LIS_OUT_OF_MEMORY.
 */
LIS_INT lis_input_hb(LIS_MATRIX A, LIS_VECTOR b, LIS_VECTOR x, FILE *file)
{
	LIS_INT err;
	LIS_INT matrix_type;
	LIS_MATRIX B;

	LIS_DEBUG_FUNC_IN;

	matrix_type = A->matrix_type;

	err = lis_input_hb_csr(A,b,x,file);
	if( err ) return err;

	if( matrix_type!=LIS_MATRIX_CSR && matrix_type!=LIS_MATRIX_CSC )
	{
		err = lis_matrix_duplicate(A,&B);
		if( err ) return err;
		lis_matrix_set_type(B,matrix_type);
		err = lis_matrix_convert(A,B);
		if( err )
		{
			/* A is still intact here; B is ours and must not leak */
			lis_matrix_destroy(B);
			return err;
		}

		/* discard A's CSR storage and auxiliary data before adopting B */
		lis_matrix_storage_destroy(A);
		lis_matrix_DLU_destroy(A);
		lis_matrix_diag_destroy(A->WD);
		if( A->l2g_map ) lis_free( A->l2g_map );
		if( A->commtable ) lis_commtable_destroy( A->commtable );
		if( A->ranges ) lis_free( A->ranges );

		/* NOTE(review): if this fails, B is not released; ownership of its
		 * arrays after a partial copy is not visible from here. */
		err = lis_matrix_copy_struct(B,A);
		if( err ) return err;
		/* only the shell of B is freed: its contents now belong to A */
		lis_free(B);

		if( A->matrix_type==LIS_MATRIX_JAD )
		{
			A->work = (LIS_SCALAR *)lis_malloc(A->n*sizeof(LIS_SCALAR),"lis_input_hb::A->work");
			if( A->work==NULL )
			{
				LIS_SETERR_MEM(A->n*sizeof(LIS_SCALAR));
				return LIS_OUT_OF_MEMORY;
			}
		}
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_precon_create_adds - set up the preconditioner wrapper that owns two
 * work vectors and delegates to the preconditioner chosen in the solver
 * options.
 *
 * Two work vectors shaped like solver->A are created (quad precision unless
 * solver->precision is LIS_PRECISION_DEFAULT) and stored in precon, then
 * lis_precon_create_xxx[precon_type] is invoked.  If that call fails,
 * precon is destroyed before the error is returned.
 *
 * Returns the status of the last helper called, or LIS_OUT_OF_MEMORY.
 */
LIS_INT lis_precon_create_adds(LIS_SOLVER solver, LIS_PRECON precon)
{
	LIS_INT made,k;
	LIS_INT precon_type,worklen;
	LIS_INT err;
	LIS_VECTOR *work;

	LIS_DEBUG_FUNC_IN;

	precon_type = solver->options[LIS_OPTIONS_PRECON];
	worklen     = 2;

	work = (LIS_VECTOR *)lis_malloc( worklen*sizeof(LIS_VECTOR),"lis_precon_create_adds::work" );
	if( work==NULL )
	{
		LIS_SETERR_MEM(worklen*sizeof(LIS_VECTOR));
		return LIS_OUT_OF_MEMORY;
	}

	made = 0;
	if( solver->precision==LIS_PRECISION_DEFAULT )
	{
		while( made<worklen )
		{
			err = lis_vector_duplicate(solver->A,&work[made]);
			if( err ) break;
			made++;
		}
	}
	else
	{
		while( made<worklen )
		{
			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,solver->A,&work[made]);
			if( err ) break;
			made++;
		}
	}

	/* stopped early: undo the vectors that were created */
	if( made<worklen )
	{
		for(k=0;k<made;k++)
		{
			lis_vector_destroy(work[k]);
		}
		lis_free(work);
		return err;
	}

	precon->worklen = worklen;
	precon->work    = work;

	err = lis_precon_create_xxx[precon_type](solver,precon);
	if( err )
	{
		lis_precon_destroy(precon);
		return err;
	}

	precon->A       = solver->A;
	precon->is_copy = LIS_FALSE;

	LIS_DEBUG_FUNC_OUT;
	return err;
}
/*
 * lis_precon_destroy - free a preconditioner and everything it owns.
 *
 * The matrix A is destroyed only when is_copy is set.  The vectors Pb, D,
 * temp, the ILU factors L and U and the diagonal WD are always handed to
 * their destroy routines.  A nested solver, if present, is torn down
 * together with its x vector and its own preconditioner (recursive call).
 * With USE_SAAMG the commtable is destroyed and, for SAAMG preconditioners,
 * f_clear_matrix_ptr is invoked.  Finally the work vectors and the
 * structure itself are freed.  A NULL precon is accepted and ignored.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_precon_destroy(LIS_PRECON precon)
{
	LIS_INT k;

	LIS_DEBUG_FUNC_IN;

	if( precon==NULL )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	if( precon->is_copy )
	{
		lis_matrix_destroy(precon->A);
	}

	lis_vector_destroy(precon->Pb);
	lis_vector_destroy(precon->D);
	lis_vector_destroy(precon->temp);
	lis_matrix_ilu_destroy(precon->L);
	lis_matrix_ilu_destroy(precon->U);
	lis_matrix_diag_destroy(precon->WD);

	if( precon->solver )
	{
		lis_vector_destroy(precon->solver->x);
		lis_precon_destroy(precon->solver->precon);
		lis_solver_destroy(precon->solver);
	}

#if defined(USE_SAAMG)
	lis_commtable_destroy(precon->commtable);
	if( precon->precon_type==LIS_PRECON_TYPE_SAAMG )
	{
		(*(void (*)())f_clear_matrix_ptr)(&precon->level_num);
	}
#endif

	if( precon->work )
	{
		k = 0;
		while( k<precon->worklen )
		{
			lis_vector_destroy(precon->work[k]);
			k++;
		}
		lis_free(precon->work);
	}

	lis_free(precon);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_gmres_malloc_work - allocate the work vectors used by GMRES.
 *
 * worklen = NWORK + (restart+1).  work[1..worklen-1] are duplicates shaped
 * like solver->A (quad precision, with the low part zeroed, unless
 * solver->precision is LIS_PRECISION_DEFAULT).  work[0] is a separate
 * vector of local size restart+1.
 *
 * On any failure every vector created so far and the array itself are
 * released and the error is returned; this now includes failures while
 * creating or sizing work[0], which were previously ignored.
 *
 * Returns LIS_SUCCESS, LIS_ERR_OUT_OF_MEMORY, or the helper's error code.
 */
LIS_INT lis_gmres_malloc_work(LIS_SOLVER solver)
{
	LIS_VECTOR *work;
	LIS_INT i,j,restart,worklen,err;

	LIS_DEBUG_FUNC_IN;

	err     = LIS_SUCCESS;
	restart = solver->options[LIS_OPTIONS_RESTART];
	worklen = NWORK + (restart+1);
	work    = (LIS_VECTOR *)lis_malloc( worklen*sizeof(LIS_VECTOR),"lis_gmres_malloc_work::work" );
	if( work==NULL )
	{
		LIS_SETERR_MEM(worklen*sizeof(LIS_VECTOR));
		return LIS_ERR_OUT_OF_MEMORY;
	}

	/* slot 0 is handled separately below, so start at 1 */
	if( solver->precision==LIS_PRECISION_DEFAULT )
	{
		for(i=1;i<worklen;i++)
		{
			err = lis_vector_duplicate(solver->A,&work[i]);
			if( err ) break;
		}
	}
	else
	{
		for(i=1;i<worklen;i++)
		{
			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,solver->A,&work[i]);
			if( err ) break;
			memset(work[i]->value_lo,0,solver->A->np*sizeof(LIS_SCALAR));
		}
	}
	if( i<worklen )
	{
		for(j=1;j<i;j++) lis_vector_destroy(work[j]);
		lis_free(work);
		return err;
	}

	if( solver->precision==LIS_PRECISION_DEFAULT )
	{
		err = lis_vector_create(solver->A->comm,&work[0]);
	}
	else
	{
		err = lis_vector_createex(LIS_PRECISION_QUAD,solver->A->comm,&work[0]);
	}
	if( err )
	{
		/* work[0] was never created: release slots 1..worklen-1 only */
		for(j=1;j<worklen;j++) lis_vector_destroy(work[j]);
		lis_free(work);
		return err;
	}

	err = lis_vector_set_size(work[0],restart+1,0);
	if( err )
	{
		for(j=0;j<worklen;j++) lis_vector_destroy(work[j]);
		lis_free(work);
		return err;
	}

	solver->worklen = worklen;
	solver->work    = work;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_idrs_malloc_work - allocate the work vectors used by IDR(s).
 *
 * worklen = NWORK + 3*s with s taken from LIS_OPTIONS_IDRS_RESTART.  Each
 * vector duplicates the shape of solver->A; in quad precision the low part
 * is zeroed after creation.  If a duplication fails, the vectors created so
 * far and the array are released and the error is returned.
 *
 * Returns LIS_SUCCESS, LIS_ERR_OUT_OF_MEMORY, or the duplicate routine's
 * error code.
 */
LIS_INT lis_idrs_malloc_work(LIS_SOLVER solver)
{
	LIS_VECTOR *work;
	LIS_INT made,k,s,worklen,err;

	LIS_DEBUG_FUNC_IN;

	s       = solver->options[LIS_OPTIONS_IDRS_RESTART];
	worklen = NWORK + 3*s;

	work = (LIS_VECTOR *)lis_malloc( worklen*sizeof(LIS_VECTOR),"lis_idrs_malloc_work::work" );
	if( work==NULL )
	{
		LIS_SETERR_MEM(worklen*sizeof(LIS_VECTOR));
		return LIS_ERR_OUT_OF_MEMORY;
	}

	made = 0;
	if( solver->precision==LIS_PRECISION_DEFAULT )
	{
		while( made<worklen )
		{
			err = lis_vector_duplicate(solver->A,&work[made]);
			if( err ) break;
			made++;
		}
	}
	else
	{
		while( made<worklen )
		{
			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,solver->A,&work[made]);
			if( err ) break;
			memset(work[made]->value_lo,0,solver->A->np*sizeof(LIS_SCALAR));
			made++;
		}
	}

	if( made<worklen )
	{
		for(k=0;k<made;k++)
		{
			lis_vector_destroy(work[k]);
		}
		lis_free(work);
		return err;
	}

	solver->worklen = worklen;
	solver->work    = work;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_args_free - free a circular, doubly linked argument list.
 *
 * args is the list head; every other node is unlinked and freed together
 * with its arg1/arg2 members, then the head itself is freed.
 *
 * A NULL list is accepted and ignored.  The original code tested args for
 * NULL only after it had already read args->next.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_args_free(LIS_ARGS args)
{
	LIS_ARGS arg,t;

	LIS_DEBUG_FUNC_IN;

	if( args==NULL )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	arg = args->next;
	while( arg!=args )
	{
		t   = arg;
		arg = arg->next;
		lis_free2(2,t->arg1,t->arg2);
		/* unlink before freeing so the ring stays consistent */
		t->next->prev = t->prev;
		t->prev->next = t->next;
		lis_free(t);
	}
	lis_free(args);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_precon_register_free - release the preconditioner registry.
 *
 * Frees precon_register_top when it is set and resets it to NULL.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_precon_register_free(void)
{
	LIS_DEBUG_FUNC_IN;

	if( precon_register_top!=NULL )
	{
		lis_free(precon_register_top);
		precon_register_top = NULL;
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_vector_unset - detach the value array from a vector.
 *
 * After lis_vector_check(vec, LIS_VECTOR_CHECK_NULL) succeeds, the value
 * array is freed only when the is_copy flag is set; in every case the
 * pointer is cleared and the status becomes LIS_VECTOR_NULL.
 *
 * Returns LIS_SUCCESS or the error from lis_vector_check.
 */
LIS_INT lis_vector_unset(LIS_VECTOR vec)
{
	LIS_INT rc;

	LIS_DEBUG_FUNC_IN;

	rc = lis_vector_check(vec,LIS_VECTOR_CHECK_NULL);
	if( rc )
	{
		return rc;
	}

	if( vec->is_copy )
	{
		lis_free(vec->value);
	}
	vec->value  = NULL;
	vec->status = LIS_VECTOR_NULL;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_esolver_work_destroy - free the eigensolver's work vectors.
 *
 * Destroys each of the worklen vectors, frees the array and resets
 * work/worklen.  Does nothing when esolver or esolver->work is NULL.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_esolver_work_destroy(LIS_ESOLVER esolver)
{
	LIS_INT k;

	LIS_DEBUG_FUNC_IN;

	if( esolver==NULL || esolver->work==NULL )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	k = 0;
	while( k<esolver->worklen )
	{
		lis_vector_destroy(esolver->work[k]);
		k++;
	}
	lis_free(esolver->work);

	esolver->work    = NULL;
	esolver->worklen = 0;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_solver_work_destroy - free the solver's work vectors.
 *
 * Destroys each of the worklen vectors, frees the array and resets
 * work/worklen.  Does nothing when solver or solver->work is NULL.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_solver_work_destroy(LIS_SOLVER solver)
{
	LIS_INT k;

	LIS_DEBUG_FUNC_IN;

	if( solver==NULL || solver->work==NULL )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	k = 0;
	while( k<solver->worklen )
	{
		lis_vector_destroy(solver->work[k]);
		k++;
	}
	lis_free(solver->work);

	solver->work    = NULL;
	solver->worklen = 0;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_vector_set - make a vector use a caller-supplied value array.
 *
 * After lis_vector_check(vec, LIS_VECTOR_CHECK_NULL) succeeds, the current
 * value array is freed when is_destroy is set, then `value` is stored by
 * pointer, is_copy is cleared and the status becomes LIS_VECTOR_ASSEMBLING.
 *
 * Returns LIS_SUCCESS or the error from lis_vector_check.
 */
LIS_INT lis_vector_set(LIS_VECTOR vec, LIS_SCALAR *value)
{
	LIS_INT rc;

	LIS_DEBUG_FUNC_IN;

	rc = lis_vector_check(vec,LIS_VECTOR_CHECK_NULL);
	if( rc )
	{
		return rc;
	}

	if( vec->is_destroy )
	{
		lis_free(vec->value);
	}

	vec->value   = value;
	vec->is_copy = LIS_FALSE;
	vec->status  = LIS_VECTOR_ASSEMBLING;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_esi_malloc_work - allocate the work vectors for the subspace
 * eigensolver.
 *
 * worklen = NWORK + ss with ss taken from LIS_EOPTIONS_SUBSPACE.  Each
 * vector duplicates the shape of esolver->A, in quad precision unless
 * esolver->eprecision is LIS_PRECISION_DEFAULT.  If a duplication fails,
 * the vectors created so far and the array are released.
 *
 * Returns LIS_SUCCESS, LIS_ERR_OUT_OF_MEMORY, or the duplicate routine's
 * error code.
 */
LIS_INT lis_esi_malloc_work(LIS_ESOLVER esolver)
{
	LIS_VECTOR *work;
	LIS_INT made,k,worklen,err,ss;

	LIS_DEBUG_FUNC_IN;

	ss      = esolver->options[LIS_EOPTIONS_SUBSPACE];
	worklen = NWORK + ss;

	work = (LIS_VECTOR *)lis_malloc( worklen*sizeof(LIS_VECTOR),"lis_esi_malloc_work::work" );
	if( work==NULL )
	{
		LIS_SETERR_MEM(worklen*sizeof(LIS_VECTOR));
		return LIS_ERR_OUT_OF_MEMORY;
	}

	made = 0;
	if( esolver->eprecision==LIS_PRECISION_DEFAULT )
	{
		while( made<worklen )
		{
			err = lis_vector_duplicate(esolver->A,&work[made]);
			if( err ) break;
			made++;
		}
	}
	else
	{
		while( made<worklen )
		{
			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,esolver->A,&work[made]);
			if( err ) break;
			made++;
		}
	}

	if( made<worklen )
	{
		for(k=0;k<made;k++)
		{
			lis_vector_destroy(work[k]);
		}
		lis_free(work);
		return err;
	}

	esolver->worklen = worklen;
	esolver->work    = work;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_cgs_malloc_work - allocate the NWORK work vectors used by CGS.
 *
 * Each vector duplicates the shape of solver->A; in quad precision the low
 * part is zeroed after creation.  If a duplication fails, the vectors
 * created so far and the array are released and the error is returned.
 *
 * Returns LIS_SUCCESS, LIS_ERR_OUT_OF_MEMORY, or the duplicate routine's
 * error code.
 */
LIS_INT lis_cgs_malloc_work(LIS_SOLVER solver)
{
	LIS_VECTOR *work;
	LIS_INT made,k,worklen,err;

	LIS_DEBUG_FUNC_IN;

	worklen = NWORK;

	work = (LIS_VECTOR *)lis_malloc( worklen*sizeof(LIS_VECTOR),"lis_cgs_malloc_work::work" );
	if( work==NULL )
	{
		LIS_SETERR_MEM(worklen*sizeof(LIS_VECTOR));
		return LIS_ERR_OUT_OF_MEMORY;
	}

	made = 0;
	if( solver->precision==LIS_PRECISION_DEFAULT )
	{
		while( made<worklen )
		{
			err = lis_vector_duplicate(solver->A,&work[made]);
			if( err ) break;
			made++;
		}
	}
	else
	{
		while( made<worklen )
		{
			err = lis_vector_duplicateex(LIS_PRECISION_QUAD,solver->A,&work[made]);
			if( err ) break;
			memset(work[made]->value_lo,0,solver->A->np*sizeof(LIS_SCALAR));
			made++;
		}
	}

	if( made<worklen )
	{
		for(k=0;k<made;k++)
		{
			lis_vector_destroy(work[k]);
		}
		lis_free(work);
		return err;
	}

	solver->worklen = worklen;
	solver->work    = work;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matrix_ilu_destroy - free an ILU factor.
 *
 * Nothing is done unless lis_is_malloc(A) reports true.  Two layouts are
 * handled, selected by A->bsz:
 *   - bsz set:   per row, index[i] is freed, then every values[i][j] block,
 *                then values[i] itself when the row is non-empty;
 *   - bsz unset: per non-empty row, index[i] and value[i] are freed.
 * The per-row arrays are released with free(), the top-level arrays with
 * lis_free2(), and the structure with lis_free().
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_matrix_ilu_destroy(LIS_MATRIX_ILU A)
{
	LIS_INT row,blk;

	LIS_DEBUG_FUNC_IN;

	if( !lis_is_malloc(A) )
	{
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}

	if( A->bsz )
	{
		for(row=0;row<A->n;row++)
		{
			free(A->index[row]);
			for(blk=0;blk<A->nnz[row];blk++)
			{
				free(A->values[row][blk]);
			}
			if( A->nnz[row]>0 )
			{
				free(A->values[row]);
			}
		}
		lis_free2(5,A->bsz,A->nnz,A->index,A->values,A->nnz_ma);
	}
	else
	{
		for(row=0;row<A->n;row++)
		{
			if( A->nnz[row]<=0 )
			{
				continue;
			}
			free(A->index[row]);
			free(A->value[row]);
		}
		lis_free2(4,A->nnz,A->index,A->value,A->nnz_ma);
	}

	lis_free(A);

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matrix_split2_csr - split a CSR matrix into two CSR parts by column
 * index.
 *
 * Every stored entry whose index is < n (= A->n) is copied into A->L, every
 * other entry into A->U; row order and the order inside a row are kept.
 * On success A->is_splited is set to LIS_TRUE.
 *
 * With OpenMP, per-row counts are accumulated into prefix sums (liw/uiw) so
 * that each row can be filled independently.  These two scratch arrays are
 * now released on every error path; previously they leaked whenever
 * lis_matrix_LU_create() or lis_matrix_malloc_csr() failed.
 *
 * NOTE(review): when lis_matrix_malloc_csr() fails, the L/U shells created
 * by lis_matrix_LU_create() stay attached to A, as before.
 *
 * Returns LIS_SUCCESS, LIS_OUT_OF_MEMORY, or the helper's error code.
 */
LIS_INT lis_matrix_split2_csr(LIS_MATRIX A)
{
	LIS_INT i,j,n;
	LIS_INT nnzl,nnzu;
	LIS_INT err;
	LIS_INT *lptr,*lindex,*uptr,*uindex;
	LIS_SCALAR *lvalue,*uvalue;
	#ifdef _OPENMP
		LIS_INT kl,ku;
		LIS_INT *liw,*uiw;
	#endif

	LIS_DEBUG_FUNC_IN;

	n      = A->n;
	nnzl   = 0;
	nnzu   = 0;
	lptr   = NULL;
	lindex = NULL;
	lvalue = NULL;
	uptr   = NULL;
	uindex = NULL;
	uvalue = NULL;

	/* pass 1: count the entries that go to each part */
	#ifdef _OPENMP
		liw = (LIS_INT *)lis_malloc((n+1)*sizeof(LIS_INT),"lis_matrix_split2_csr::liw");
		if( liw==NULL )
		{
			LIS_SETERR_MEM((n+1)*sizeof(LIS_INT));
			return LIS_OUT_OF_MEMORY;
		}
		uiw = (LIS_INT *)lis_malloc((n+1)*sizeof(LIS_INT),"lis_matrix_split2_csr::uiw");
		if( uiw==NULL )
		{
			LIS_SETERR_MEM((n+1)*sizeof(LIS_INT));
			lis_free(liw);
			return LIS_OUT_OF_MEMORY;
		}
		#pragma omp parallel for private(i)
		for(i=0;i<n+1;i++)
		{
			liw[i] = 0;
			uiw[i] = 0;
		}
		#pragma omp parallel for private(i,j)
		for(i=0;i<n;i++)
		{
			for(j=A->ptr[i];j<A->ptr[i+1];j++)
			{
				if( A->index[j]<n )
				{
					liw[i+1]++;
				}
				else
				{
					uiw[i+1]++;
				}
			}
		}
		/* turn the per-row counts into row pointers */
		for(i=0;i<n;i++)
		{
			liw[i+1] += liw[i];
			uiw[i+1] += uiw[i];
		}
		nnzl = liw[n];
		nnzu = uiw[n];
	#else
		for(i=0;i<n;i++)
		{
			for(j=A->ptr[i];j<A->ptr[i+1];j++)
			{
				if( A->index[j]<n )
				{
					nnzl++;
				}
				else
				{
					nnzu++;
				}
			}
		}
	#endif

	err = lis_matrix_LU_create(A);
	if( err )
	{
		#ifdef _OPENMP
			lis_free2(2,liw,uiw);
		#endif
		return err;
	}
	err = lis_matrix_malloc_csr(n,nnzl,&lptr,&lindex,&lvalue);
	if( err )
	{
		#ifdef _OPENMP
			lis_free2(2,liw,uiw);
		#endif
		return err;
	}
	err = lis_matrix_malloc_csr(n,nnzu,&uptr,&uindex,&uvalue);
	if( err )
	{
		lis_free2(6,lptr,lindex,lvalue,uptr,uindex,uvalue);
		#ifdef _OPENMP
			lis_free2(2,liw,uiw);
		#endif
		return err;
	}

	/* pass 2: copy the entries into their part */
	#ifdef _OPENMP
		#pragma omp parallel for private(i)
		for(i=0;i<n+1;i++)
		{
			lptr[i] = liw[i];
			uptr[i] = uiw[i];
		}
		#pragma omp parallel for private(i,j,kl,ku)
		for(i=0;i<n;i++)
		{
			kl = lptr[i];
			ku = uptr[i];
			for(j=A->ptr[i];j<A->ptr[i+1];j++)
			{
				if( A->index[j]<n )
				{
					lindex[kl] = A->index[j];
					lvalue[kl] = A->value[j];
					kl++;
				}
				else
				{
					uindex[ku] = A->index[j];
					uvalue[ku] = A->value[j];
					ku++;
				}
			}
		}
		lis_free2(2,liw,uiw);
	#else
		/* the counters are reused as running fill positions */
		nnzl    = 0;
		nnzu    = 0;
		lptr[0] = 0;
		uptr[0] = 0;
		for(i=0;i<n;i++)
		{
			for(j=A->ptr[i];j<A->ptr[i+1];j++)
			{
				if( A->index[j]<n )
				{
					lindex[nnzl] = A->index[j];
					lvalue[nnzl] = A->value[j];
					nnzl++;
				}
				else
				{
					uindex[nnzu] = A->index[j];
					uvalue[nnzu] = A->value[j];
					nnzu++;
				}
			}
			lptr[i+1] = nnzl;
			uptr[i+1] = nnzu;
		}
	#endif

	A->L->nnz     = nnzl;
	A->L->ptr     = lptr;
	A->L->index   = lindex;
	A->L->value   = lvalue;
	A->U->nnz     = nnzu;
	A->U->ptr     = uptr;
	A->U->index   = uindex;
	A->U->value   = uvalue;
	A->is_splited = LIS_TRUE;

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matrix_copyDLU_csr - copy the D, L and U parts of a split CSR matrix
 * into three new objects.
 *
 * Ain : source matrix; Ain->D, Ain->L and Ain->U are read unconditionally,
 *       so it must already be split (NOTE(review): this is not checked).
 * D   : receives a new diagonal matrix holding a copy of Ain->D->value[0..n)
 * L,U : receive new assembled CSR matrices holding copies of Ain->L / Ain->U
 *
 * Changes from the previous version:
 *   - a failed diag allocation now reports LIS_OUT_OF_MEMORY; it used to
 *     return the stale `err`, which was 0 (success) at that point;
 *   - (*D)->value is cleared after its initial storage is freed and only set
 *     to the new array once both set_csr calls succeeded, so it never points
 *     at freed memory on an error return;
 *   - lvalue/uvalue are initialised before they can reach lis_free2().
 *
 * NOTE(review): on error returns after the duplicates were created, *L, *U
 * and *D are left for the caller, as before.  The second set_csr failure
 * path still frees the L arrays that were already handed to *L; ownership
 * rules of lis_matrix_set_csr are not visible from here.
 *
 * Returns LIS_SUCCESS, LIS_OUT_OF_MEMORY, or the helper's error code.
 */
LIS_INT lis_matrix_copyDLU_csr(LIS_MATRIX Ain, LIS_MATRIX_DIAG *D, LIS_MATRIX *L, LIS_MATRIX *U)
{
	LIS_INT err;
	LIS_INT i,n,np,lnnz,unnz;
	LIS_INT *lptr,*lindex;
	LIS_INT *uptr,*uindex;
	LIS_SCALAR *lvalue,*uvalue,*diag;

	LIS_DEBUG_FUNC_IN;

	*D = NULL;
	*L = NULL;
	*U = NULL;

	err = lis_matrix_check(Ain,LIS_MATRIX_CHECK_ALL);
	if( err ) return err;

	n  = Ain->n;
	np = Ain->np;

	err = lis_matrix_duplicate(Ain,L);
	if( err )
	{
		return err;
	}
	err = lis_matrix_duplicate(Ain,U);
	if( err )
	{
		lis_matrix_destroy(*L);
		return err;
	}
	err = lis_matrix_diag_duplicateM(Ain,D);
	if( err )
	{
		lis_matrix_destroy(*L);
		lis_matrix_destroy(*U);
		return err;
	}
	/* the duplicate's own storage is replaced by our copy further down */
	lis_free((*D)->value);
	(*D)->value = NULL;

	lnnz   = Ain->L->nnz;
	unnz   = Ain->U->nnz;
	lptr   = NULL;
	lindex = NULL;
	lvalue = NULL;
	uptr   = NULL;
	uindex = NULL;
	uvalue = NULL;
	diag   = NULL;

	err = lis_matrix_malloc_csr(n,lnnz,&lptr,&lindex,&lvalue);
	if( err )
	{
		return err;
	}
	err = lis_matrix_malloc_csr(n,unnz,&uptr,&uindex,&uvalue);
	if( err )
	{
		lis_free2(7,diag,uptr,lptr,uindex,lindex,uvalue,lvalue);
		return err;
	}
	diag = (LIS_SCALAR *)lis_malloc(np*sizeof(LIS_SCALAR),"lis_matrix_copyDLU_csr::diag");
	if( diag==NULL )
	{
		LIS_SETERR_MEM(np*sizeof(LIS_SCALAR));
		lis_free2(7,diag,uptr,lptr,uindex,lindex,uvalue,lvalue);
		return LIS_OUT_OF_MEMORY;
	}

	#ifdef _OPENMP
	#pragma omp parallel for private(i)
	#endif
	for(i=0;i<n;i++)
	{
		diag[i] = Ain->D->value[i];
	}
	lis_matrix_elements_copy_csr(n,Ain->L->ptr,Ain->L->index,Ain->L->value,lptr,lindex,lvalue);
	lis_matrix_elements_copy_csr(n,Ain->U->ptr,Ain->U->index,Ain->U->value,uptr,uindex,uvalue);

	err = lis_matrix_set_csr(lnnz,lptr,lindex,lvalue,*L);
	if( err )
	{
		lis_free2(7,diag,uptr,lptr,uindex,lindex,uvalue,lvalue);
		return err;
	}
	err = lis_matrix_set_csr(unnz,uptr,uindex,uvalue,*U);
	if( err )
	{
		lis_free2(7,diag,uptr,lptr,uindex,lindex,uvalue,lvalue);
		return err;
	}

	/* from here on diag is never freed locally, so D can take it */
	(*D)->value = diag;

	err = lis_matrix_assemble(*L);
	if( err )
	{
		return err;
	}
	err = lis_matrix_assemble(*U);
	if( err )
	{
		return err;
	}

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_matvect_ell - scatter-type product for a matrix stored as maxnzr
 * slabs of n entries each (entry (j,i) lives at offset j*n + i).
 *
 * For every stored entry the code performs
 *     y[index[j*n+i]] += value[j*n+i] * x[i]
 * i.e. x is addressed by the slot i and y by the stored index (the product
 * with the transpose if index holds column numbers -- presumably, going by
 * the function name; confirm).
 *
 * A : matrix; the split form (D, L, U) is used when A->is_splited is set
 * x : input array, read at [0..n)
 * y : output array.  In the split and serial branches only y[0..n) is
 *     initialised before accumulation; the OpenMP branch writes y[0..np).
 *
 * NOTE(review): the OpenMP branch does not check the result of lis_malloc,
 * and this function has no way to report a failure (void return).
 */
void lis_matvect_ell(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
{
	LIS_INT i,j,jj;
	LIS_INT n,np,maxnzr;
	#ifdef _OPENMP
		LIS_INT k,is,ie,nprocs;
		LIS_SCALAR t;
		LIS_SCALAR *w;
	#endif

	n = A->n;
	np = A->np;
	if( A->is_splited )
	{
		/* diagonal contribution initialises y */
		#ifdef USE_VEC_COMP
		#pragma cdir nodep
		#endif
		for(i=0; i<n; i++)
		{
			y[i] = A->D->value[i]*x[i];
		}
		/* lower part */
		for(j=0;j<A->L->maxnzr;j++)
		{
			jj = j*n;
			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			for(i=0;i<n;i++)
			{
				y[A->L->index[jj + i]] += A->L->value[jj + i] * x[i];
			}
		}
		/* upper part */
		for(j=0;j<A->U->maxnzr;j++)
		{
			jj = j*n;
			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			for(i=0;i<n;i++)
			{
				y[A->U->index[jj + i]] += A->U->value[jj + i] * x[i];
			}
		}
	}
	else
	{
		#ifdef _OPENMP
			maxnzr = A->maxnzr;
			nprocs = omp_get_max_threads();
			/* one private accumulator of np entries per thread */
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_ell::w" );
			#pragma omp parallel private(i,j,t,jj,k,is,ie)
			{
				k = omp_get_thread_num();
				/* [is,ie) is this thread's range of i, as set by the macro */
				LIS_GET_ISIE(k,nprocs,n,is,ie);
				#pragma omp for
				for(j=0;j<nprocs;j++)
				{
					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
				}
				/* each thread scatters into its own slice w[k*np ...] */
				for(j=0;j<maxnzr;j++)
				{
					jj = j*n;
					#ifdef USE_VEC_COMP
					#pragma cdir nodep
					#endif
					for(i=is;i<ie;i++)
					{
						w[k*np + A->index[jj + i]] += A->value[jj + i] * x[i];
					}
				}
				/* all slices must be complete before they are summed */
				#pragma omp barrier
				#pragma omp for
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(i=0;i<np;i++)
				{
					t = 0.0;
					for(j=0;j<nprocs;j++)
					{
						t += w[j*np+i];
					}
					y[i] = t;
				}
			}
			lis_free(w);
		#else
			maxnzr = A->maxnzr;
			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			for(i=0; i<n; i++)
			{
				y[i] = 0.0;
			}
			for(j=0;j<maxnzr;j++)
			{
				jj = j*n;
				#ifdef USE_VEC_COMP
				#pragma cdir nodep
				#endif
				for(i=0;i<n;i++)
				{
					y[A->index[jj + i]] += A->value[jj + i] * x[i];
				}
			}
		#endif
	}
}
/*
 * lis_matvec_ilu - scatter-type product with a row-wise stored ILU factor.
 *
 * For every stored entry (row i, position j) the code performs
 *     Y[ LU->index[i][j] ] += LU->value[i][j] * X[i]
 * after Y[0..np) has been zeroed, with n = LU->n rows and np = A->np.
 *
 * A  : supplies np and, under USE_MPI, the communication table
 * LU : factor to multiply with (nnz[i], index[i][], value[i][])
 * X  : input vector; its precision selects the double or the quad path
 * Y  : output vector (value and, in quad precision, value_lo are written)
 *
 * With OpenMP every thread accumulates into its own slice of np entries
 * (offset k*np) and the slices are summed at the end.
 *
 * Changes from the previous version:
 *   - in the OpenMP + USE_FMA2_SSE2 path the remainder loop now adds the
 *     k*np thread offset; it used to write into slice 0 from every thread,
 *     which is a data race;
 *   - the scratch allocations are checked and LIS_OUT_OF_MEMORY is returned
 *     on failure instead of writing through a NULL pointer.
 *
 * Returns LIS_SUCCESS or LIS_OUT_OF_MEMORY.
 */
LIS_INT lis_matvec_ilu(LIS_MATRIX A, LIS_MATRIX_ILU LU, LIS_VECTOR X, LIS_VECTOR Y)
{
	LIS_INT i,j,jj,n,np;
	LIS_SCALAR *x,*y;
	#ifdef _OPENMP
		LIS_INT nprocs,k;
		LIS_SCALAR t,*w;
	#endif
	#ifdef USE_QUAD_PRECISION
		LIS_INT j0,j1;
		#ifdef _OPENMP
			LIS_SCALAR *ww,*wwl;
		#endif
	#endif
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	np = A->np;
	n  = LU->n;
	/* x and y are kept: the communication macro below may refer to them */
	x  = X->value;
	y  = Y->value;

	#ifdef USE_QUAD_PRECISION
		if( X->precision==LIS_PRECISION_DEFAULT )
	#endif
	{
		#ifdef USE_MPI
			LIS_MATVEC_SENDRECV;
		#endif
		#ifdef _OPENMP
			nprocs = omp_get_max_threads();
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs::w" );
			if( w==NULL )
			{
				LIS_SETERR_MEM(nprocs*np*sizeof(LIS_SCALAR));
				return LIS_OUT_OF_MEMORY;
			}
			#pragma omp parallel private(i,j,k,jj,t)
			{
				k = omp_get_thread_num();
				#pragma omp for
				for(j=0;j<nprocs;j++)
				{
					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
				}
				#pragma omp for
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj     = k*np + LU->index[i][j];
						w[jj] += LU->value[i][j] * X->value[i];
					}
				}
				/* reduce the per-thread slices */
				#pragma omp for
				for(i=0;i<np;i++)
				{
					t = 0.0;
					for(j=0;j<nprocs;j++)
					{
						t += w[j*np+i];
					}
					Y->value[i] = t;
				}
			}
			lis_free(w);
		#else
			for(i=0;i<np;i++)
			{
				Y->value[i] = 0.0;
			}
			for(i=0;i<n;i++)
			{
				for(j=0;j<LU->nnz[i];j++)
				{
					jj = LU->index[i][j];
					Y->value[jj] += LU->value[i][j] * X->value[i];
				}
			}
		#endif
	}
	#ifdef USE_QUAD_PRECISION
	else
	{
		#ifdef USE_MPI
			lis_send_recv_mp(A->commtable,X);
		#endif
		#ifdef _OPENMP
			#ifndef USE_FMA2_SSE2
				nprocs = omp_get_max_threads();
				/* high parts in ww[0..nprocs*np), low parts right behind them */
				ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvect_crs_mp::ww" );
				if( ww==NULL )
				{
					LIS_SETERR_MEM(2*nprocs*np*sizeof(LIS_SCALAR));
					return LIS_OUT_OF_MEMORY;
				}
				wwl = &ww[nprocs*np];
				#ifndef USE_SSE2
					#pragma omp parallel private(i,j,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
				#else
					#pragma omp parallel private(i,j,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				#endif
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0;i<n;i++)
					{
						for(j=0;j<LU->nnz[i];j++)
						{
							jj = k*np + LU->index[i][j];
							#ifndef USE_SSE2
								LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#else
								LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifndef USE_SSE2
								LIS_QUAD_ADD(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#else
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#else
				nprocs = omp_get_max_threads();
				ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvect_crs_mp2::ww" );
				if( ww==NULL )
				{
					LIS_SETERR_MEM(2*nprocs*np*sizeof(LIS_SCALAR));
					return LIS_OUT_OF_MEMORY;
				}
				wwl = &ww[nprocs*np];
				#pragma omp parallel private(i,j,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
				{
					k = omp_get_thread_num();
					#pragma omp for
					for(j=0;j<nprocs;j++)
					{
						memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
						memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
					}
					#pragma omp for
					for(i=0; i<n; i++)
					{
						/* entries are processed two at a time ... */
						for(j=0;j<LU->nnz[i]-1;j+=2)
						{
							j0 = k*np + LU->index[i][j];
							j1 = k*np + LU->index[i][j+1];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
						/* ... and a possible last odd entry goes into the same
						 * per-thread slice (k*np offset was missing here) */
						for(;j<LU->nnz[i];j++)
						{
							j0 = k*np + LU->index[i][j];
							#ifdef USE_SSE2
								LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
							#endif
						}
					}
					#pragma omp for
					for(i=0;i<np;i++)
					{
						Y->value[i] = Y->value_lo[i] = 0.0;
						for(j=0;j<nprocs;j++)
						{
							#ifdef USE_SSE2
								LIS_QUAD_ADD_SSE2(Y->value[i],Y->value_lo[i],Y->value[i],Y->value_lo[i],ww[j*np+i],wwl[j*np+i]);
							#endif
						}
					}
				}
				lis_free(ww);
			#endif
		#else
			#ifndef USE_FMA2_SSE2
				for(i=0;i<np;i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0;i<n;i++)
				{
					for(j=0;j<LU->nnz[i];j++)
					{
						jj = LU->index[i][j];
						#ifndef USE_SSE2
							LIS_QUAD_FMAD(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#else
							LIS_QUAD_FMAD_SSE2(Y->value[jj],Y->value_lo[jj],Y->value[jj],Y->value_lo[jj],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#else
				for(i=0; i<np; i++)
				{
					Y->value[i]    = 0.0;
					Y->value_lo[i] = 0.0;
				}
				for(i=0; i<n; i++)
				{
					for(j=0;j<LU->nnz[i]-1;j+=2)
					{
						j0 = LU->index[i][j];
						j1 = LU->index[i][j+1];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD2_SSE2_STSD(Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],Y->value[j0],Y->value_lo[j0],Y->value[j1],Y->value_lo[j1],X->value[i],X->value_lo[i],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
					for(;j<LU->nnz[i];j++)
					{
						j0 = LU->index[i][j];
						#ifdef USE_SSE2
							LIS_QUAD_FMAD_SSE2(Y->value[j0],Y->value_lo[j0],Y->value[j0],Y->value_lo[j0],X->value[i],X->value_lo[i],LU->value[i][j]);
						#endif
					}
				}
			#endif
		#endif
	}
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
/*
 * lis_fgmres_quad - restarted GMRES-type iteration carried out with
 * quad-precision (hi/lo pair) arithmetic, keeping one preconditioned vector
 * z[j] per inner step and building the solution update from those vectors.
 *
 * solver : supplies A, b, x, the options MAXITER / OUTPUT / RESTART (= m),
 *          the tolerance and the work vectors, used as
 *            work[0]        -> s (right-hand side of the small system)
 *            work[1]        -> r (assigned but not otherwise used here)
 *            work[2 ...]    -> z[]
 *            work[m+2 ...]  -> v[]
 *
 * h holds the small Hessenberg system column by column (column c starts at
 * c*h_dim); the rotation coefficients are stored in the same array at
 * offsets cs and sn.
 *
 * Returns LIS_SUCCESS when lis_solver_get_initial_residual() reports a
 * non-zero status (presumably "already converged" -- confirm) or when
 * tol >= nrm2; otherwise LIS_MAXITER after maxiter iterations.
 *
 * NOTE(review): the result of lis_malloc for h is not checked; solver->ptime
 * is stored only on the converged path; nrm2 is read uninitialised at the
 * end if maxiter is 0.
 */
LIS_INT lis_fgmres_quad(LIS_SOLVER solver)
{
	LIS_MATRIX A;
	LIS_VECTOR b,x;
	LIS_VECTOR r,s,*z,*v;
	LIS_QUAD *h;
	LIS_QUAD_PTR aa,bb,rr,a2,b2,t,one,tmp;
	LIS_REAL bnrm2,nrm2,tol;
	LIS_INT iter,maxiter,n,output;
	double time,ptime;
	LIS_REAL rnorm;
	LIS_INT i,j,k,m;
	LIS_INT ii,i1,iiv,i1v,iih,jj;
	LIS_INT h_dim;
	LIS_INT cs,sn;

	LIS_DEBUG_FUNC_IN;

	A = solver->A;
	b = solver->b;
	x = solver->x;
	n = A->n;
	maxiter = solver->options[LIS_OPTIONS_MAXITER];
	output = solver->options[LIS_OPTIONS_OUTPUT];
	m = solver->options[LIS_OPTIONS_RESTART];
	h_dim = m+1;
	ptime = 0.0;

	s = solver->work[0];
	r = solver->work[1];
	z = &solver->work[2];
	v = &solver->work[m+2];

	h = (LIS_QUAD *)lis_malloc( sizeof(LIS_QUAD)*(h_dim+1)*(h_dim+2),"lis_fgmres_quad::h" );
	/* offsets inside h where the rotation cosines / sines are kept */
	cs = (m+1)*h_dim;
	sn = (m+2)*h_dim;

	LIS_QUAD_SCALAR_MALLOC(aa,0,1);
	LIS_QUAD_SCALAR_MALLOC(bb,1,1);
	LIS_QUAD_SCALAR_MALLOC(rr,2,1);
	LIS_QUAD_SCALAR_MALLOC(a2,3,1);
	LIS_QUAD_SCALAR_MALLOC(b2,4,1);
	LIS_QUAD_SCALAR_MALLOC(t,5,1);
	LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
	LIS_QUAD_SCALAR_MALLOC(one,7,1);

	one.hi[0] = 1.0;
	one.lo[0] = 0.0;

	/* Initial Residual */
	if( lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2) )
	{
		lis_free(h);
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}
	tol = solver->tol;
	rnorm = 1.0/bnrm2;

	iter=0;
	/* outer (restart) loop */
	while( iter<maxiter )
	{
		/* first column of V */
		/* v = r / ||r||_2 */
		lis_vector_scaleex_nm(bnrm2,v[0]);

		/* s = ||r||_2 e_1 */
		lis_vector_set_allex_nm(0.0,s);
		s->value[0] = rnorm;
		s->value_lo[0] = 0.0;

		i = 0;
		/* inner loop: at most m steps per restart cycle */
		do
		{
			iter++;
			i++;
			ii = i-1;
			i1 = i;
			iiv = i-1;
			i1v = i;
			iih = (i-1)*h_dim;

			/* z = M^-1 * v */
			time = lis_wtime();
			lis_psolve(solver,v[iiv],z[iiv]);
			ptime += lis_wtime()-time;

			/* w = A * z */
			lis_matvec(A,z[iiv], v[i1v]);

			/* orthogonalise w against v[0..i-1], one vector at a time */
			for(k=0;k<i;k++)
			{
				/* h[k,i] = <w,v[k]> */
				/* w = w - h[k,i] * v[k] */
				lis_vector_dotex_mmm(v[i1v],v[k],&t);
				h[k+iih].hi = t.hi[0];
				h[k+iih].lo = t.lo[0];
				lis_quad_minus((LIS_QUAD *)t.hi);
				lis_vector_axpyex_mmm(t,v[k],v[i1v]);
			}
			/* h[i+1,i] = ||w|| */
			/* v[i+1] = w / h[i+1,i] */
			lis_vector_nrm2ex_mm(v[i1v],&t);
			h[i1+iih].hi = t.hi[0];
			h[i1+iih].lo = t.lo[0];
			lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)t.hi);
			lis_vector_scaleex_mm(tmp,v[i1v]);

			/* apply the rotations of the previous steps to the new column */
			for(k=1;k<=ii;k++)
			{
				jj = k-1;
				t.hi[0] = h[jj+iih].hi;
				t.lo[0] = h[jj+iih].lo;
				lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)t.hi);
				lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)&h[k+iih]);
				lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
				lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)t.hi);
				lis_quad_minus((LIS_QUAD *)bb.hi);
				lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)&h[k+iih]);
				lis_quad_add((LIS_QUAD *)bb.hi,(LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi);
				h[jj+iih].hi = aa.hi[0];
				h[jj+iih].lo = aa.lo[0];
				h[k+iih].hi = bb.hi[0];
				h[k+iih].lo = bb.lo[0];
			}
			/* new rotation: cs = a/sqrt(a^2+b^2), sn = b/sqrt(a^2+b^2) */
			aa.hi[0] = h[ii+iih].hi;
			aa.lo[0] = h[ii+iih].lo;
			bb.hi[0] = h[i1+iih].hi;
			bb.lo[0] = h[i1+iih].lo;
			lis_quad_sqr((LIS_QUAD *)a2.hi,(LIS_QUAD *)aa.hi);
			lis_quad_sqr((LIS_QUAD *)b2.hi,(LIS_QUAD *)bb.hi);
			lis_quad_add((LIS_QUAD *)rr.hi,(LIS_QUAD *)a2.hi,(LIS_QUAD *)b2.hi);
			lis_quad_sqrt((LIS_QUAD *)rr.hi,(LIS_QUAD *)rr.hi);
			/* avoid dividing by zero below */
			if( rr.hi[0]==0.0 )
			{
				rr.hi[0]=1.0e-17;
				rr.lo[0]=0.0;
			}
			lis_quad_div((LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)aa.hi,(LIS_QUAD *)rr.hi);
			lis_quad_div((LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)bb.hi,(LIS_QUAD *)rr.hi);
			/* rotate the right-hand side s */
			tmp.hi[0] = s->value[ii];
			tmp.lo[0] = s->value_lo[ii];
			lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)tmp.hi);
			lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)tmp.hi);
			lis_quad_minus((LIS_QUAD *)aa.hi);
			s->value[i1] = aa.hi[0];
			s->value_lo[i1] = aa.lo[0];
			s->value[ii] = bb.hi[0];
			s->value_lo[ii] = bb.lo[0];

			/* rotated diagonal entry of the new column */
			lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)&h[ii+iih]);
			lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)&h[i1+iih]);
			lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
			h[ii+iih].hi = aa.hi[0];
			h[ii+iih].lo = aa.lo[0];

			/* convergence check */
			nrm2 = fabs(s->value[i1]);

			if( output )
			{
				if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
				if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter,nrm2);
			}

			if( tol >= nrm2 ) break;
		} while( i<m && iter <maxiter );

		/* Solve H * Y = S for upper Hessenberg matrix H */
		/* (back substitution; the result overwrites s) */
		tmp.hi[0] = s->value[ii];
		tmp.lo[0] = s->value_lo[ii];
		lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+iih]);
		s->value[ii] = tmp.hi[0];
		s->value_lo[ii] = tmp.lo[0];
		for(k=1;k<=ii;k++)
		{
			jj = ii-k;
			t.hi[0] = s->value[jj];
			t.lo[0] = s->value_lo[jj];
			for(j=jj+1;j<=ii;j++)
			{
				tmp.hi[0] = s->value[j];
				tmp.lo[0] = s->value_lo[j];
				lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+j*h_dim]);
				lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)tmp.hi);
			}
			lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)&h[jj+jj*h_dim]);
			s->value[jj] = tmp.hi[0];
			s->value_lo[jj] = tmp.lo[0];
		}
		/* x = x + y * z */
		for(j=0;j<=ii;j++)
		{
			aa.hi[0] = s->value[j];
			aa.lo[0] = s->value_lo[j];
			lis_vector_axpyex_mmm(aa,z[j],x);
		}

		if( tol >= nrm2 )
		{
			solver->retcode = LIS_SUCCESS;
			solver->iter = iter;
			solver->resid = nrm2;
			solver->ptime = ptime;
			lis_free(h);
			LIS_DEBUG_FUNC_OUT;
			return LIS_SUCCESS;
		}

		/* restart: recompute the residual b - A*x into v[0] */
		lis_matvec(A,x,v[0]);
		lis_vector_xpay(b,-1.0,v[0]);
		memset(v[0]->value_lo,0,n*sizeof(LIS_SCALAR));
		lis_vector_nrm2(v[0],&rnorm);
		bnrm2 = 1.0/rnorm;
	}

	solver->retcode = LIS_MAXITER;
	solver->iter = iter+1;
	solver->resid = nrm2;
	lis_free(h);
	LIS_DEBUG_FUNC_OUT;
	return LIS_MAXITER;
}
/*
 * lis_gmres - restarted GMRES-type iteration in working precision.
 *
 * solver : supplies A, b, x, the options MAXITER / OUTPUT / RESTART (= m),
 *          the tolerance and the work vectors, used as
 *            work[0]     -> s (right-hand side of the small system)
 *            work[1]     -> r
 *            work[2]     -> z
 *            work[3 ...] -> v[]
 *
 * h holds the small Hessenberg system column by column (column c starts at
 * c*h_dim); the rotation coefficients are stored in the same array at
 * offsets cs and sn.
 *
 * Returns LIS_SUCCESS when lis_solver_get_initial_residual() reports a
 * non-zero status (presumably "already converged" -- confirm) or when
 * tol >= nrm2; otherwise LIS_MAXITER after maxiter iterations.
 *
 * NOTE(review): the result of lis_malloc for h is not checked; solver->ptime
 * is stored only on the converged path; nrm2 is read uninitialised at the
 * end if maxiter is 0.
 */
LIS_INT lis_gmres(LIS_SOLVER solver)
{
	LIS_MATRIX A;
	LIS_VECTOR b,x;
	LIS_VECTOR r,s,z,*v;
	LIS_SCALAR *h;
	LIS_SCALAR aa,bb,rr,a2,b2,t;
	LIS_REAL tnrm2;
	LIS_REAL bnrm2,nrm2,tol;
	LIS_INT iter,maxiter,n,output;
	double time,ptime;
	LIS_REAL rnorm;
	LIS_INT i,j,k,m;
	LIS_INT ii,i1,iiv,i1v,iih,jj;
	LIS_INT h_dim;
	LIS_INT cs,sn;

	LIS_DEBUG_FUNC_IN;

	A = solver->A;
	b = solver->b;
	x = solver->x;
	n = A->n;
	maxiter = solver->options[LIS_OPTIONS_MAXITER];
	output = solver->options[LIS_OPTIONS_OUTPUT];
	m = solver->options[LIS_OPTIONS_RESTART];
	h_dim = m+1;
	ptime = 0.0;

	s = solver->work[0];
	r = solver->work[1];
	z = solver->work[2];
	v = &solver->work[3];

	h = (LIS_SCALAR *)lis_malloc( sizeof(LIS_SCALAR)*(h_dim+1)*(h_dim+2),"lis_gmres::h" );
	/* offsets inside h where the rotation cosines / sines are kept */
	cs = (m+1)*h_dim;
	sn = (m+2)*h_dim;

	/* r = M^-1 * (b - A * x) */
	lis_matvec(A,x,z);
	lis_vector_xpay(b,-1.0,z);
	lis_psolve(solver,z,v[0]);

	/* Initial Residual */
	if( lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2) )
	{
		lis_free(h);
		LIS_DEBUG_FUNC_OUT;
		return LIS_SUCCESS;
	}
	tol = solver->tol;

	iter=0;
	/* outer (restart) loop */
	while( iter<maxiter )
	{
		/* first column of V */
		/* v = r / ||r||_2 */
		lis_vector_nrm2(v[0],&rnorm);
		lis_vector_scale(1.0/rnorm,v[0]);

		/* s = ||r||_2 e_1 */
		lis_vector_set_all(0,s);
		s->value[0] = rnorm;

		i = 0;
		/* inner loop: at most m steps per restart cycle */
		do
		{
			iter++;
			i++;
			ii = i-1;
			i1 = i;
			iiv = i-1;
			i1v = i;
			iih = (i-1)*h_dim;

			/* z = M^-1 * v */
			time = lis_wtime();
			lis_psolve(solver,v[iiv],z);
			ptime += lis_wtime()-time;

			/* w = A * z */
			lis_matvec(A,z,v[i1v]);

			/* orthogonalise w against v[0..i-1], one vector at a time */
			for(k=0;k<i;k++)
			{
				/* h[k,i] = <w,v[k]> */
				/* w = w - h[k,i] * v[k] */
				lis_vector_dot(v[i1v],v[k],&t);
				h[k+iih] = t;
				lis_vector_axpy(-t,v[k],v[i1v]);
			}
			/* h[i+1,i] = ||w|| */
			/* v[i+1] = w / h[i+1,i] */
			lis_vector_nrm2(v[i1v],&tnrm2);
			h[i1+iih] = tnrm2;
			lis_vector_scale(1.0/tnrm2,v[i1v]);

			/* apply the rotations of the previous steps to the new column */
			for(k=1;k<=ii;k++)
			{
				jj = k-1;
				t = h[jj+iih];
				aa = h[jj+cs]*t;
				aa += h[jj+sn]*h[k+iih];
				bb = -h[jj+sn]*t;
				bb += h[jj+cs]*h[k+iih];
				h[jj+iih] = aa;
				h[k+iih] = bb;
			}
			/* new rotation: cs = a/sqrt(a^2+b^2), sn = b/sqrt(a^2+b^2) */
			aa = h[ii+iih];
			bb = h[i1+iih];
			a2 = aa*aa;
			b2 = bb*bb;
			rr = sqrt(a2+b2);
			/* avoid dividing by zero below */
			if( rr==0.0 ) rr=1.0e-17;
			h[ii+cs] = aa/rr;
			h[ii+sn] = bb/rr;
			/* rotate the right-hand side s */
			s->value[i1] = -h[ii+sn]*s->value[ii];
			s->value[ii] = h[ii+cs]*s->value[ii];

			/* rotated diagonal entry of the new column */
			aa = h[ii+cs]*h[ii+iih];
			aa += h[ii+sn]*h[i1+iih];
			h[ii+iih] = aa;

			/* convergence check */
			nrm2 = sabs(s->value[i1])*bnrm2;

			if( output )
			{
				if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
				if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter,nrm2);
			}

			if( tol >= nrm2 ) break;
		} while( i<m && iter <maxiter );

		/* Solve H * Y = S for upper Hessenberg matrix H */
		/* (back substitution; the result overwrites s) */
		s->value[ii] = s->value[ii]/h[ii+iih];
		for(k=1;k<=ii;k++)
		{
			jj = ii-k;
			t = s->value[jj];
			for(j=jj+1;j<=ii;j++)
			{
				t -= h[jj+j*h_dim]*s->value[j];
			}
			s->value[jj] = t/h[jj+jj*h_dim];
		}
		/* z = sum_j y[j] * v[j]  (z is overwritten, starting with j = 0) */
		#ifdef _OPENMP
		#pragma omp parallel for private(k)
		#endif
		for(k=0;k<n;k++)
		{
			z->value[k] = s->value[0]*v[0]->value[k];
		}
		for(j=1;j<=ii;j++)
		{
			lis_vector_axpy(s->value[j],v[j],z);
		}
		/* r = M^-1 * z */
		time = lis_wtime();
		lis_psolve(solver,z,r);
		ptime += lis_wtime()-time;

		/* x = x + r */
		lis_vector_axpy(1,r,x);

		if( tol >= nrm2 )
		{
			solver->retcode = LIS_SUCCESS;
			solver->iter = iter;
			solver->resid = nrm2;
			solver->ptime = ptime;
			lis_free(h);
			LIS_DEBUG_FUNC_OUT;
			return LIS_SUCCESS;
		}

		/* restart: apply the stored rotations in reverse order to s, then
		 * rebuild v[0] as a combination of the v[j] */
		for(j=1;j<=i;j++)
		{
			jj = i1-j+1;
			s->value[jj-1] = -h[jj-1+sn]*s->value[jj];
			s->value[jj] = h[jj-1+cs]*s->value[jj];
		}

		for(j=0;j<=i1;j++)
		{
			t = s->value[j];
			/* v[0] is both source and destination, hence the -1 for j==0 */
			if( j==0 ) t = t-1.0;
			lis_vector_axpy(t,v[j],v[0]);
		}
	}

	solver->retcode = LIS_MAXITER;
	solver->iter = iter+1;
	solver->resid = nrm2;
	lis_free(h);
	LIS_DEBUG_FUNC_OUT;
	return LIS_MAXITER;
}
/*
 * lis_gmres_switch - restarted GMRES that starts in default precision and
 * then continues with the quad-precision (lis_quad_* / *ex_*) routines.
 *
 * Phase 1: z->precision is LIS_PRECISION_DEFAULT; only the .hi parts of the
 *          quad scalars and the LIS_SCALAR view `hd` of the scratch array are
 *          used.  Runs until solver->tol_switch is reached or
 *          LIS_OPTIONS_SWITCH_MAXITER iterations were done.
 * Phase 2: z->precision is LIS_PRECISION_QUAD; the iteration counter iter2
 *          continues from the phase-1 count and runs until solver->tol is
 *          reached or LIS_OPTIONS_MAXITER is hit.
 *
 * Outputs written to `solver`: x, retcode, iter (total count), iter2
 * (phase-1 count), resid, ptime and, depending on the output flags,
 * rhistory[].
 *
 * Layout of the scratch array (leading dimension h_dim = m+1, same indices
 * for `h` and `hd`): [row + col*h_dim] Hessenberg entry, [cs + k] cosine and
 * [sn + k] sine of the k-th Givens rotation.
 *
 * Returns LIS_SUCCESS when lis_solver_get_initial_residual() reports
 * non-zero before phase 1 or when tol >= nrm2 in phase 2; LIS_MAXITER
 * otherwise.
 *
 * NOTE(review): the result of lis_malloc() for h is not checked.
 * NOTE(review): nrm2 is only assigned inside the iteration loops; if neither
 * loop body runs, the LIS_MAXITER exit stores an unassigned value.
 */
LIS_INT lis_gmres_switch(LIS_SOLVER solver)
{
    LIS_MATRIX A;
    LIS_VECTOR b,x;
    LIS_VECTOR r,s,z,*v;
    LIS_QUAD *h;
    LIS_SCALAR *hd;
    LIS_QUAD_PTR aa,bb,rr,a2,b2,t,one,tmp;
    LIS_QUAD_PTR rnorm;
    LIS_REAL bnrm2,nrm2,tol,tol2;
    LIS_INT iter,maxiter,n,output;
    LIS_INT iter2,maxiter2;
    double time,ptime;
    LIS_INT i,j,k,m;
    LIS_INT ii,i1,iiv,i1v,iih,jj;
    LIS_INT h_dim;
    LIS_INT cs,sn;

    LIS_DEBUG_FUNC_IN;

    A        = solver->A;
    b        = solver->b;
    x        = solver->x;
    n        = A->n;
    maxiter  = solver->options[LIS_OPTIONS_MAXITER];
    maxiter2 = solver->options[LIS_OPTIONS_SWITCH_MAXITER];
    output   = solver->options[LIS_OPTIONS_OUTPUT];
    m        = solver->options[LIS_OPTIONS_RESTART];
    h_dim    = m+1;
    ptime    = 0.0;

    s = solver->work[0];
    r = solver->work[1];
    z = solver->work[2];
    v = &solver->work[3];

    LIS_QUAD_SCALAR_MALLOC(aa,0,1);
    LIS_QUAD_SCALAR_MALLOC(bb,1,1);
    LIS_QUAD_SCALAR_MALLOC(rr,2,1);
    LIS_QUAD_SCALAR_MALLOC(a2,3,1);
    LIS_QUAD_SCALAR_MALLOC(b2,4,1);
    LIS_QUAD_SCALAR_MALLOC(t,5,1);
    LIS_QUAD_SCALAR_MALLOC(tmp,6,1);
    LIS_QUAD_SCALAR_MALLOC(one,7,1);
    LIS_QUAD_SCALAR_MALLOC(rnorm,8,1);

    h  = (LIS_QUAD *)lis_malloc( sizeof(LIS_QUAD)*(h_dim+1)*(h_dim+2),"lis_gmres_switch::h" );
    hd = (LIS_SCALAR *)h;   /* phase 1 reuses the same storage as plain scalars */
    cs = (m+1)*h_dim;
    sn = (m+2)*h_dim;
    one.hi[0] = 1.0;
    one.lo[0] = 0.0;

    z->precision = LIS_PRECISION_DEFAULT;

    /* r = M^-1 * (b - A * x) */
    lis_matvec(A,x,z);
    lis_vector_xpay(b,-1.0,z);
    lis_psolve(solver,z,v[0]);

    /* Initial Residual */
    if( lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2) )
    {
        lis_free(h);
        LIS_DEBUG_FUNC_OUT;
        return LIS_SUCCESS;
    }
    tol2 = solver->tol_switch;

    /* ---------------- phase 1: default precision ---------------- */
    iter=0;
    while( iter<maxiter2 )
    {
        /* first column of V */
        /* v = r / ||r||_2 */
        lis_vector_nrm2(v[0],&rnorm.hi[0]);
        lis_vector_scale(1.0/rnorm.hi[0],v[0]);

        /* s = ||r||_2 e_1 */
        lis_vector_set_all(0,s);
        s->value[0] = rnorm.hi[0];

        i = 0;
        do
        {
            iter++;
            i++;
            ii  = i-1;
            i1  = i;
            iiv = i-1;
            i1v = i;
            iih = (i-1)*h_dim;

            /* z = M^-1 * v */
            time = lis_wtime();
            lis_psolve(solver,v[iiv],z);
            ptime += lis_wtime()-time;

            /* w = A * z */
            lis_matvec(A,z, v[i1v]);

            for(k=0;k<i;k++)
            {
                /* h[k,i] = <w,v[k]> */
                /* w = w - h[k,i] * v[k] */
                lis_vector_dot(v[i1v],v[k],&t.hi[0]);
                hd[k+iih] = t.hi[0];
                lis_vector_axpy(-t.hi[0],v[k],v[i1v]);
            }
            /* h[i+1,i] = ||w|| */
            /* v[i+1] = w / h[i+1,i] */
            lis_vector_nrm2(v[i1v],&t.hi[0]);
            hd[i1+iih] = t.hi[0];
            lis_vector_scale(1.0/t.hi[0],v[i1v]);

            /* apply the previously stored rotations to the new column */
            for(k=1;k<=ii;k++)
            {
                jj        = k-1;
                t.hi[0]   = hd[jj+iih];
                aa.hi[0]  = hd[jj+cs]*t.hi[0];
                aa.hi[0] += hd[jj+sn]*hd[k+iih];
                bb.hi[0]  = -hd[jj+sn]*t.hi[0];
                bb.hi[0] += hd[jj+cs]*hd[k+iih];
                hd[jj+iih] = aa.hi[0];
                hd[k+iih]  = bb.hi[0];
            }

            /* build the new rotation */
            aa.hi[0] = hd[ii+iih];
            bb.hi[0] = hd[i1+iih];
            a2.hi[0] = aa.hi[0]*aa.hi[0];
            b2.hi[0] = bb.hi[0]*bb.hi[0];
            rr.hi[0] = sqrt(a2.hi[0]+b2.hi[0]);
            if( rr.hi[0]==0.0 ) rr.hi[0]=1.0e-17;   /* avoid dividing by zero */
            hd[ii+cs] = aa.hi[0]/rr.hi[0];
            hd[ii+sn] = bb.hi[0]/rr.hi[0];
            s->value[i1] = -hd[ii+sn]*s->value[ii];
            s->value[ii] =  hd[ii+cs]*s->value[ii];

            aa.hi[0]  = hd[ii+cs]*hd[ii+iih];
            aa.hi[0] += hd[ii+sn]*hd[i1+iih];
            hd[ii+iih] = aa.hi[0];

            /* convergence check */
            nrm2 = fabs(s->value[i1])*bnrm2;

            if( output )
            {
                if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
                if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter,nrm2);
            }

            if( tol2 >= nrm2 ) break;
        } while( i<m && iter <maxiter2 );

        /* Solve H * Y = S for upper Hessenberg matrix H */
        s->value[ii] = s->value[ii]/hd[ii+iih];
        for(k=1;k<=ii;k++)
        {
            jj      = ii-k;
            t.hi[0] = s->value[jj];
            for(j=jj+1;j<=ii;j++)
            {
                t.hi[0] -= hd[jj+j*h_dim]*s->value[j];
            }
            s->value[jj] = t.hi[0]/hd[jj+jj*h_dim];
        }

        /* z = V * y : z is overwritten with s[0]*v[0], then the rest is added */
        for(k=0;k<n;k++)
        {
            z->value[k] = s->value[0]*v[0]->value[k];
        }
        for(j=1;j<=ii;j++)
        {
            lis_vector_axpy(s->value[j],v[j],z);
        }

        /* r = M^-1 * z */
        time = lis_wtime();
        lis_psolve(solver,z,r);
        ptime += lis_wtime()-time;

        /* x = x + r */
        lis_vector_axpy(1,r,x);

        if( tol2 >= nrm2 )
        {
            /* switch tolerance reached: leave phase 1 */
            solver->iter  = iter;
            solver->iter2 = iter;
            solver->ptime = ptime;
            break;
        }

        /* restart: undo the rotations on s and rebuild v[0] from the basis */
        for(j=1;j<=i;j++)
        {
            jj = i1-j+1;
            s->value[jj-1] = -hd[jj-1+sn]*s->value[jj];
            s->value[jj]   =  hd[jj-1+cs]*s->value[jj];
        }

        for(j=0;j<=i1;j++)
        {
            t.hi[0] = s->value[j];
            if( j==0 ) t.hi[0] = t.hi[0]-1.0;
            lis_vector_axpy(t.hi[0],v[j],v[0]);
        }
    }

    /* ---------------- phase 2: quad precision ---------------- */
    /* Initial Residual */
    z->precision = LIS_PRECISION_QUAD;

    solver->options[LIS_OPTIONS_INITGUESS_ZEROS] = LIS_FALSE;
    lis_vector_copyex_mn(x,solver->xx);

    lis_solver_get_initial_residual(solver,NULL,NULL,v[0],&bnrm2);
    tol = solver->tol;

    iter2=iter;
    while( iter2<maxiter )
    {
        /* first column of V */
        /* v = r / ||r||_2 */
        lis_vector_nrm2ex_mm(v[0],&rnorm);
        lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)rnorm.hi);
        lis_vector_scaleex_mm(tmp,v[0]);

        /* s = ||r||_2 e_1 */
        lis_vector_set_allex_nm(0.0,s);
        s->value[0]    = rnorm.hi[0];
        s->value_lo[0] = rnorm.lo[0];

        i = 0;
        do
        {
            iter2++;
            i++;
            ii  = i-1;
            i1  = i;
            iiv = i-1;
            i1v = i;
            iih = (i-1)*h_dim;

            /* z = M^-1 * v */
            time = lis_wtime();
            lis_psolve(solver,v[iiv],z);
            ptime += lis_wtime()-time;

            /* w = A * z */
            lis_matvec(A,z, v[i1v]);

            for(k=0;k<i;k++)
            {
                /* h[k,i] = <w,v[k]> */
                /* w = w - h[k,i] * v[k] */
                lis_vector_dotex_mmm(v[i1v],v[k],&t);
                h[k+iih].hi = t.hi[0];
                h[k+iih].lo = t.lo[0];
                lis_quad_minus((LIS_QUAD *)t.hi);
                lis_vector_axpyex_mmm(t,v[k],v[i1v]);
            }
            /* h[i+1,i] = ||w|| */
            /* v[i+1] = w / h[i+1,i] */
            lis_vector_nrm2ex_mm(v[i1v],&t);
            h[i1+iih].hi = t.hi[0];
            h[i1+iih].lo = t.lo[0];
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)one.hi,(LIS_QUAD *)t.hi);
            lis_vector_scaleex_mm(tmp,v[i1v]);

            /* apply the previously stored rotations to the new column */
            for(k=1;k<=ii;k++)
            {
                jj      = k-1;
                t.hi[0] = h[jj+iih].hi;
                t.lo[0] = h[jj+iih].lo;
                lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)t.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
                lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[jj+sn],(LIS_QUAD *)t.hi);
                lis_quad_minus((LIS_QUAD *)bb.hi);
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+cs],(LIS_QUAD *)&h[k+iih]);
                lis_quad_add((LIS_QUAD *)bb.hi,(LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi);
                h[jj+iih].hi = aa.hi[0];
                h[jj+iih].lo = aa.lo[0];
                h[k+iih].hi  = bb.hi[0];
                h[k+iih].lo  = bb.lo[0];
            }

            /* build the new rotation */
            aa.hi[0] = h[ii+iih].hi;
            aa.lo[0] = h[ii+iih].lo;
            bb.hi[0] = h[i1+iih].hi;
            bb.lo[0] = h[i1+iih].lo;
            lis_quad_sqr((LIS_QUAD *)a2.hi,(LIS_QUAD *)aa.hi);
            lis_quad_sqr((LIS_QUAD *)b2.hi,(LIS_QUAD *)bb.hi);
            lis_quad_add((LIS_QUAD *)rr.hi,(LIS_QUAD *)a2.hi,(LIS_QUAD *)b2.hi);
            lis_quad_sqrt((LIS_QUAD *)rr.hi,(LIS_QUAD *)rr.hi);
            lis_quad_div((LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)aa.hi,(LIS_QUAD *)rr.hi);
            lis_quad_div((LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)bb.hi,(LIS_QUAD *)rr.hi);
            tmp.hi[0] = s->value[ii];
            tmp.lo[0] = s->value_lo[ii];
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)tmp.hi);
            lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)tmp.hi);
            lis_quad_minus((LIS_QUAD *)aa.hi);
            s->value[i1]    = aa.hi[0];
            s->value_lo[i1] = aa.lo[0];
            s->value[ii]    = bb.hi[0];
            s->value_lo[ii] = bb.lo[0];

            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)&h[ii+cs],(LIS_QUAD *)&h[ii+iih]);
            lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+sn],(LIS_QUAD *)&h[i1+iih]);
            lis_quad_add((LIS_QUAD *)aa.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi);
            h[ii+iih].hi = aa.hi[0];
            h[ii+iih].lo = aa.lo[0];

            /* convergence check */
            nrm2 = fabs(s->value[i1])*bnrm2;

            if( output )
            {
                if( output & LIS_PRINT_MEM ) solver->rhistory[iter2] = nrm2;
                /* print the running counter iter2, matching the rhistory index
                   above; `iter` is frozen at the phase-1 count here */
                if( output & LIS_PRINT_OUT && A->my_rank==0 ) lis_print_rhistory(iter2,nrm2);
            }

            if( tol >= nrm2 ) break;
        } while( i<m && iter2 <maxiter );

        /* Solve H * Y = S for upper Hessenberg matrix H */
        tmp.hi[0] = s->value[ii];
        tmp.lo[0] = s->value_lo[ii];
        lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[ii+iih]);
        s->value[ii]    = tmp.hi[0];
        s->value_lo[ii] = tmp.lo[0];
        for(k=1;k<=ii;k++)
        {
            jj      = ii-k;
            t.hi[0] = s->value[jj];
            t.lo[0] = s->value_lo[jj];
            for(j=jj+1;j<=ii;j++)
            {
                tmp.hi[0] = s->value[j];
                tmp.lo[0] = s->value_lo[j];
                lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj+j*h_dim]);
                lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)tmp.hi);
            }
            lis_quad_div((LIS_QUAD *)tmp.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)&h[jj+jj*h_dim]);
            s->value[jj]    = tmp.hi[0];
            s->value_lo[jj] = tmp.lo[0];
        }

        /* z = V * y : z is overwritten with s[0]*v[0], then the rest is added */
        for(k=0;k<n;k++)
        {
            aa.hi[0] = s->value[0];
            aa.lo[0] = s->value_lo[0];
            bb.hi[0] = v[0]->value[k];
            bb.lo[0] = v[0]->value_lo[k];
            lis_quad_mul((LIS_QUAD *)tmp.hi,(LIS_QUAD *)aa.hi,(LIS_QUAD *)bb.hi);
            z->value[k]    = tmp.hi[0];
            z->value_lo[k] = tmp.lo[0];
        }
        for(j=1;j<=ii;j++)
        {
            aa.hi[0] = s->value[j];
            aa.lo[0] = s->value_lo[j];
            lis_vector_axpyex_mmm(aa,v[j],z);
        }

        /* r = M^-1 * z */
        time = lis_wtime();
        lis_psolve(solver,z,r);
        ptime += lis_wtime()-time;

        /* x = x + r */
        lis_vector_axpyex_mmm(one,r,x);

        if( tol >= nrm2 )
        {
            solver->retcode = LIS_SUCCESS;
            solver->iter    = iter2;
            solver->iter2   = iter;
            solver->resid   = nrm2;
            solver->ptime   = ptime;
            lis_free(h);
            LIS_DEBUG_FUNC_OUT;
            return LIS_SUCCESS;
        }

        /* restart: undo the rotations on s and rebuild v[0] from the basis */
        for(j=1;j<=i;j++)
        {
            jj        = i1-j+1;
            tmp.hi[0] = s->value[jj];
            tmp.lo[0] = s->value_lo[jj];
            lis_quad_mul((LIS_QUAD *)aa.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj-1+sn]);
            lis_quad_mul((LIS_QUAD *)bb.hi,(LIS_QUAD *)tmp.hi,(LIS_QUAD *)&h[jj-1+cs]);
            lis_quad_minus((LIS_QUAD *)aa.hi);
            s->value[jj-1]    = aa.hi[0];
            s->value_lo[jj-1] = aa.lo[0];
            s->value[jj]      = bb.hi[0];
            s->value_lo[jj]   = bb.lo[0];
        }

        for(j=0;j<=i1;j++)
        {
            t.hi[0] = s->value[j];
            t.lo[0] = s->value_lo[j];
            if( j==0 )
            {
                lis_quad_sub((LIS_QUAD *)t.hi,(LIS_QUAD *)t.hi,(LIS_QUAD *)one.hi);
            }
            lis_vector_axpyex_mmm(t,v[j],v[0]);
        }
    }

    solver->retcode = LIS_MAXITER;
    solver->iter    = iter2+1;
    solver->iter2   = iter;
    solver->resid   = nrm2;
    /* store the preconditioner time on the non-converged exit as well */
    solver->ptime   = ptime;
    lis_free(h);
    LIS_DEBUG_FUNC_OUT;
    return LIS_MAXITER;
}
/*
 * lis_orthomin_quad - Orthomin(m) iteration using the quad-precision
 * (lis_quad_* / *ex_*) routines.
 *
 * Inputs read from `solver`: A, precon, x, work[], tol and the options
 * LIS_OPTIONS_MAXITER, LIS_OPTIONS_OUTPUT, LIS_OPTIONS_RESTART (m) and
 * LIS_OPTIONS_CONV_COND.
 * Outputs written to `solver`: x (updated in place), retcode, iter, resid,
 * ptime and, depending on the output flags, rhistory[].
 *
 * Work vectors: work[0] = r, work[1] = rtld, then three groups of m+1
 * vectors used as circular buffers indexed by ip = (iter-1) % (m+1):
 * p[], ap[] and aptld[].
 *
 * dotsave[l] holds 1 / <aptld,aptld> of the direction computed l+1
 * iterations ago (dotsave[0] is overwritten with the newest value each
 * iteration, after the older entries are shifted up).
 *
 * Returns LIS_SUCCESS when lis_solver_get_initial_residual() reports
 * non-zero or once tol > nrm2, LIS_BREAKDOWN when <aptld[ip],aptld[ip]> is
 * exactly zero, and LIS_MAXITER otherwise.
 *
 * NOTE(review): the result of lis_malloc() for dotsave is not checked.
 * NOTE(review): nrm2 is first assigned by the convergence check; a breakdown
 * in the first iteration (or maxiter <= 0) stores an unassigned value in
 * solver->resid.
 */
LIS_INT lis_orthomin_quad(LIS_SOLVER solver)
{
    LIS_Comm comm;
    LIS_MATRIX A;
    LIS_PRECON M;
    LIS_VECTOR x;
    LIS_VECTOR r, rtld, *p, *ap, *aptld;
    LIS_QUAD *dotsave;
    LIS_QUAD_PTR alpha, beta, tmp, one;
    LIS_REAL bnrm2, nrm2, tol;
    LIS_INT iter,maxiter,output,conv;
    double time,ptime;
    LIS_INT m,l,lmax,ip,ip0;

    LIS_DEBUG_FUNC_IN;

    comm = LIS_COMM_WORLD;

    A       = solver->A;
    M       = solver->precon;
    x       = solver->x;
    maxiter = solver->options[LIS_OPTIONS_MAXITER];
    output  = solver->options[LIS_OPTIONS_OUTPUT];
    m       = solver->options[LIS_OPTIONS_RESTART];
    conv    = solver->options[LIS_OPTIONS_CONV_COND];
    ptime   = 0.0;

    LIS_QUAD_SCALAR_MALLOC(alpha,0,1);
    LIS_QUAD_SCALAR_MALLOC(beta,1,1);
    LIS_QUAD_SCALAR_MALLOC(tmp,3,1);
    LIS_QUAD_SCALAR_MALLOC(one,4,1);

    r     = solver->work[0];
    rtld  = solver->work[1];
    p     = &solver->work[2];
    ap    = &solver->work[  (m+1)+2];
    aptld = &solver->work[2*(m+1)+2];

    one.hi[0] = 1.0;
    one.lo[0] = 0.0;

    dotsave = (LIS_QUAD *)lis_malloc( sizeof(LIS_QUAD) * (m+1),"lis_orthomin_quad::dotsave" );

    /* Initial Residual */
    if( lis_solver_get_initial_residual(solver,M,r,rtld,&bnrm2) )
    {
        /* release the scratch buffer on this early exit too */
        lis_free(dotsave);
        LIS_DEBUG_FUNC_OUT;
        return LIS_SUCCESS;
    }
    tol = solver->tol;

    iter=1;
    while( iter<=maxiter )
    {
        ip = (iter-1) % (m+1);

        /* p[ip] = rtld */
        lis_vector_copyex_mm(rtld,p[ip]);

        /* ap[ip]    = A*p[ip] */
        /* aptld[ip] = M^-1 ap[ip] */
        lis_matvec(A,p[ip],ap[ip]);
        time = lis_wtime();
        lis_psolve(solver, ap[ip], aptld[ip]);
        ptime += lis_wtime()-time;

        /* orthogonalise against the (at most m) previous directions */
        lmax = _min(m,iter-1);
        for(l=1;l<=lmax;l++)
        {
            ip0 = (ip+m+1-l) % (m+1);
            /* beta = -<Ar[ip],Ap[ip0]> / <Ap[ip0],Ap[ip0]> */
            lis_vector_dotex_mmm(aptld[ip],aptld[ip0],&beta);
            lis_quad_mul((LIS_QUAD *)beta.hi,(LIS_QUAD *)beta.hi,&dotsave[l-1]);
            lis_quad_minus((LIS_QUAD *)beta.hi);

            lis_vector_axpyex_mmm(beta,p[ip0]    ,p[ip]);
            lis_vector_axpyex_mmm(beta,ap[ip0]   ,ap[ip]);
            lis_vector_axpyex_mmm(beta,aptld[ip0],aptld[ip]);
        }

        /* age the stored reciprocals by one slot */
        for(l=m-1;l>0;l--)
        {
            dotsave[l] = dotsave[l-1];
        }

        lis_vector_dotex_mmm(aptld[ip],aptld[ip],&tmp);
        dotsave[0].hi = tmp.hi[0];
        dotsave[0].lo = tmp.lo[0];
        /* test breakdown */
        if( tmp.hi[0]==0.0 && tmp.lo[0]==0.0 )
        {
            solver->retcode = LIS_BREAKDOWN;
            solver->iter    = iter;
            solver->resid   = nrm2;
            solver->ptime   = ptime;
            lis_free(dotsave);
            LIS_DEBUG_FUNC_OUT;
            return LIS_BREAKDOWN;
        }
        /* dotsave[0] = 1 / <aptld[ip],aptld[ip]> */
        lis_quad_div(&dotsave[0],(LIS_QUAD *)one.hi,&dotsave[0]);

        /* alpha = <rtld,Aptld[ip]> */
        lis_vector_dotex_mmm(rtld,aptld[ip],&alpha);
        lis_quad_mul((LIS_QUAD *)alpha.hi,(LIS_QUAD *)alpha.hi,&dotsave[0]);

        /* x += alpha*p, r -= alpha*ap, rtld -= alpha*aptld */
        lis_vector_axpyex_mmm( alpha,p[ip],x);
        lis_quad_minus((LIS_QUAD *)alpha.hi);
        lis_vector_axpyex_mmm(alpha,ap[ip],r);
        lis_vector_axpyex_mmm(alpha,aptld[ip],rtld);
        lis_quad_minus((LIS_QUAD *)alpha.hi);

        /* convergence check */
        lis_solver_get_residual[conv](r,solver,&nrm2);
        if( output )
        {
            if( output & LIS_PRINT_MEM ) solver->rhistory[iter] = nrm2;
            if( output & LIS_PRINT_OUT ) lis_print_rhistory(comm,iter,nrm2);
        }

        if( tol > nrm2 )
        {
            solver->retcode = LIS_SUCCESS;
            solver->iter    = iter;
            solver->resid   = nrm2;
            solver->ptime   = ptime;
            lis_free(dotsave);
            LIS_DEBUG_FUNC_OUT;
            return LIS_SUCCESS;
        }

        iter++;
    }

    solver->retcode = LIS_MAXITER;
    solver->iter    = iter;
    solver->resid   = nrm2;
    /* store the preconditioner time on the non-converged exit as well */
    solver->ptime   = ptime;
    lis_free(dotsave);
    LIS_DEBUG_FUNC_OUT;
    return LIS_MAXITER;
}
/*
 * lis_matvect_msr - transposed matrix-vector product y = A^T * x for a
 * matrix whose first n entries of value[] are the diagonal and whose
 * index[0..n] entries delimit, per row, the off-diagonal entries stored in
 * index[]/value[] (as read by the loops below).
 *
 * A : matrix; A->n rows are traversed, A->np bounds the target indices.
 * x : input, read at x[0..n).
 * y : output, written at y[0..np).
 *
 * When A->is_splited is set the product is formed from A->D, A->L and A->U.
 * Otherwise, with OpenMP, each thread scatters into its own np-sized slice
 * of a scratch buffer and the slices are summed into y; without OpenMP the
 * scatter goes straight into y.
 *
 * The OpenMP path defines every y[i], i < np, starting from zero.  The
 * serial paths initialise y[0..n) from the diagonal and clear y[n..np)
 * before scattering so that all paths produce the same result for targets
 * at or beyond n.
 *
 * NOTE(review): the result of lis_malloc() for the OpenMP scratch buffer is
 * not checked.
 */
void lis_matvect_msr(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
{
    LIS_INT i,j,js,je,jj;
    LIS_INT n,np;
    LIS_SCALAR t;
    #ifdef _OPENMP
        LIS_INT k,nprocs;
        LIS_SCALAR *w;
    #endif

    n  = A->n;
    np = A->np;

    if( A->is_splited )
    {
        /* diagonal part */
        for(i=0; i<n; i++)
        {
            y[i] = A->D->value[i] * x[i];
        }
        /* entries beyond n are only ever accumulated into: start from zero */
        for(i=n; i<np; i++)
        {
            y[i] = 0.0;
        }
        /* scatter row i of L and U into y (transpose product) */
        for(i=0; i<n; i++)
        {
            t  = x[i];
            js = A->L->index[i];
            je = A->L->index[i+1];
            for(j=js; j<je; j++)
            {
                jj     = A->L->index[j];
                y[jj] += A->L->value[j] * t;
            }
            js = A->U->index[i];
            je = A->U->index[i+1];
            for(j=js; j<je; j++)
            {
                jj     = A->U->index[j];
                y[jj] += A->U->value[j] * t;
            }
        }
    }
    else
    {
        #ifdef _OPENMP
            nprocs = omp_get_max_threads();
            /* sizeof first, so the product is evaluated in size_t */
            w = (LIS_SCALAR *)lis_malloc( sizeof(LIS_SCALAR)*nprocs*np,"lis_matvect_msr::w" );
            #pragma omp parallel private(i,j,js,je,t,jj,k)
            {
                k = omp_get_thread_num();
                /* clear every per-thread slice */
                #pragma omp for
                for(j=0; j<nprocs; j++)
                {
                    memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
                }
                /* each thread scatters its rows into its own slice */
                #pragma omp for
                for(i=0; i<n; i++)
                {
                    js = A->index[i];
                    je = A->index[i+1];
                    t  = x[i];
                    for(j=js; j<je; j++)
                    {
                        jj     = k*np+A->index[j];
                        w[jj] += A->value[j] * t;
                    }
                    w[k*np+i] += A->value[i] * x[i];
                }
                /* reduce the slices into y */
                #pragma omp for
                for(i=0; i<np; i++)
                {
                    t = 0.0;
                    for(j=0; j<nprocs; j++)
                    {
                        t += w[j*np+i];
                    }
                    y[i] = t;
                }
            }
            lis_free(w);
        #else
            /* diagonal part */
            for(i=0; i<n; i++)
            {
                y[i] = A->value[i] * x[i];
            }
            /* entries beyond n are only ever accumulated into: start from zero */
            for(i=n; i<np; i++)
            {
                y[i] = 0.0;
            }
            /* scatter the off-diagonal entries of row i into y */
            for(i=0; i<n; i++)
            {
                t  = x[i];
                js = A->index[i];
                je = A->index[i+1];
                for(j=js; j<je; j++)
                {
                    jj     = A->index[j];
                    y[jj] += A->value[j] * t;
                }
            }
        #endif
    }
}