/*
 * Msap: block-wise correction sweep over the blocks described in block_list.
 *
 * P      in/out  spinor that the block results are added to
 * Q      in      source spinor used when forming the global residue
 * Ncy    in      number of outer cycles
 * Niter  in      iteration count handed to mrblk for every block
 *
 * In every cycle, and for each parity eo = 0, 1, the global residue is
 * recomputed from scratch (D_psi followed by diff against Q).  Then, for every
 * block whose evenodd flag equals the current parity, the block part of the
 * residue is extracted, passed through mrblk and the result is added to P.
 *
 * Six work fields of size VOLUME are allocated on entry and released on exit;
 * only the first four are used here.
 */
void Msap(spinor * const P, spinor * const Q, const int Ncy, const int Niter) {
  const int nr_sf = 6;
  spinor ** solver_field = NULL;
  spinor * residue, * block_sol, * block_src;
  double res_sq;
  int cycle, parity, ib, block_vol;

  /* it would probably be better to receive the work fields from the caller */
  init_solver_field(&solver_field, VOLUME, nr_sf);
  residue   = solver_field[0];
  block_sol = solver_field[1];
  block_src = solver_field[2];

  for(cycle = 0; cycle < Ncy; cycle++) {
    for(parity = 0; parity < 2; parity++) {
      /* naive (not the most efficient) recomputation of the global residue */
      D_psi(residue, P);
      diff(residue, Q, residue, VOLUME);
      res_sq = square_norm(residue, VOLUME, 1);
      if(parity == 1 && g_debug_level > 2 && g_proc_id == 0) {
        printf("Msap: %d %1.3e\n", cycle, res_sq);
        fflush(stdout);
      }

      /* visit only the blocks of the current parity */
      for(ib = 0; ib < nb_blocks; ib++) {
        if(block_list[ib].evenodd != parity) {
          continue;
        }
        block_vol = block_list[ib].volume;

        /* block part of the residue -> block_src */
        copy_global_to_block(block_src, residue, ib);
        /* NOTE(review): solver_field[3] is passed on as mrblk's third argument; */
        /* the original author questioned whether this is adequate               */
        mrblk(block_sol, block_src, solver_field[3], Niter, 1.e-31, 1, block_vol, &dummy_Di, ib);

        /* accumulate the block result into the full spinor P */
        add_block_to_global(P, block_sol, ib);
      }
    }
  }

  finalize_solver(solver_field, nr_sf);
}
/*
 * cloverdetratio_rwacc: energy difference of monomial `id` for the
 * reweighting accept step.
 *
 * id  index into monomial_list
 * hf  Hamiltonian field; only hf->gaugefield is read here
 *
 * Sequence:
 *   1. with (kappa2, mu2): build and invert the clover term, apply Qp to pf,
 *      result in w_fields[1]
 *   2. with (kappa, mu): rebuild/invert the clover term, solve with
 *      solve_degenerate (sloppy precision switched off for the solve),
 *      then apply Qm in place on w_fields[0]
 *   3. energy1 = square_norm(w_fields[0])
 *
 * g_mu, g_mu3 and the boundary are reset to g_mu1 / 0 / g_kappa before
 * returning.  Returns energy1 - energy0.
 */
double cloverdetratio_rwacc(const int id, hamiltonian_field_t * const hf) {
  monomial * const mono = &monomial_list[id];
  const int sloppy_backup = g_sloppy_precision_flag;
  const double t_start = gettime();
  double t_stop;

  /* step 1: operator with the second parameter set */
  g_mu = mono->mu2;
  boundary(mono->kappa2);

  init_sw_fields();
  sw_term((const su3**) hf->gaugefield, mono->kappa2, mono->c_sw);
  sw_invert(EE, mono->mu2);
  g_mu3 = 0.;
  mono->Qp(mono->w_fields[1], mono->pf);

  /* step 2: operator with the first parameter set */
  g_mu3 = 0.;
  g_mu = mono->mu;
  boundary(mono->kappa);
  sw_term((const su3**) hf->gaugefield, mono->kappa, mono->c_sw);
  sw_invert(EE, mono->mu);

  chrono_guess(mono->w_fields[0], mono->w_fields[1], mono->csg_field,
               mono->csg_index_array, mono->csg_N, mono->csg_n,
               VOLUME/2, &Qtm_plus_psi);

  /* the solve itself always runs with the sloppy flag cleared */
  g_sloppy_precision_flag = 0;
  mono->iter0 += solve_degenerate(mono->w_fields[0], mono->w_fields[1],
                                  mono->solver_params, mono->maxiter, mono->accprec,
                                  g_relative_precision_flag, VOLUME/2,
                                  mono->Qsq, mono->solver);
  mono->Qm(mono->w_fields[0], mono->w_fields[0]);
  g_sloppy_precision_flag = sloppy_backup;

  /* step 3: energy contribution of the resulting field */
  mono->energy1 = square_norm(mono->w_fields[0], VOLUME/2, 1);

  /* restore the global parameters */
  g_mu = g_mu1;
  g_mu3 = 0.;
  boundary(g_kappa);

  t_stop = gettime();
  if(g_proc_id == 0) {
    if(g_debug_level > 1) {
      printf("# Time for %s monomial rwacc step: %e s\n", mono->name, t_stop-t_start);
    }
    if(g_debug_level > 3) {
      printf("called cloverdetratio_rwacc for id %d dH = %1.10e\n",
             id, mono->energy1 - mono->energy0);
    }
  }
  return(mono->energy1 - mono->energy0);
}
/*
 * cloverdet_heatbath: heatbath step of monomial `id`.
 *
 * id  index into monomial_list
 * hf  Hamiltonian field; only hf->gaugefield is read here
 *
 * Sets the global parameters from the monomial, resets the chronological
 * solver counters and iteration counts, builds the clover term, draws a
 * Gaussian random field into w_fields[0] (even/odd or lexicographic layout
 * depending on even_odd_flag; in the even/odd case the clover term is also
 * inverted), stores its squared norm in energy0 and applies Qp to obtain pf.
 * pf is registered with chrono_add_solution.  Global parameters are reset to
 * g_mu1 / 0 / g_kappa before returning.
 */
void cloverdet_heatbath(const int id, hamiltonian_field_t * const hf) {
  monomial * const mono = &monomial_list[id];
  const double t_start = gettime();
  /* number of sites handled: half the lattice for even/odd, all of it otherwise */
  const int nsites = mono->even_odd_flag ? VOLUME/2 : VOLUME;
  double t_stop;

  g_mu   = mono->mu;
  g_mu3  = mono->rho;
  g_c_sw = mono->c_sw;
  boundary(mono->kappa);

  mono->csg_n  = 0;
  mono->csg_n2 = 0;
  mono->iter0  = 0;
  mono->iter1  = 0;

  init_sw_fields();
  sw_term((const su3**) hf->gaugefield, mono->kappa, mono->c_sw);

  if(mono->even_odd_flag) {
    sw_invert(EE, mono->mu);
    random_spinor_field_eo(mono->w_fields[0], mono->rngrepro, RN_GAUSS);
  }
  else {
    random_spinor_field_lexic(mono->w_fields[0], mono->rngrepro, RN_GAUSS);
  }

  mono->energy0 = square_norm(mono->w_fields[0], nsites, 1);

  mono->Qp(mono->pf, mono->w_fields[0]);
  chrono_add_solution(mono->pf, mono->csg_field, mono->csg_index_array,
                      mono->csg_N, &mono->csg_n, nsites);

  /* restore the global parameters */
  g_mu = g_mu1;
  g_mu3 = 0.;
  boundary(g_kappa);

  t_stop = gettime();
  if(g_proc_id == 0) {
    if(g_debug_level > 1) {
      printf("# Time for %s monomial heatbath: %e s\n", mono->name, t_stop-t_start);
    }
    if(g_debug_level > 3) {
      printf("called cloverdet_heatbath for id %d energy %f\n", id, mono->energy0);
    }
  }
}
double Scene::compute_radius() const { double square_radius = 0.0; const GAABB3 bbox = compute_bbox(); if (bbox.is_valid()) { for (size_t i = 0; i < 8; ++i) { const double square_distance = static_cast<double>(square_norm(bbox.compute_corner(i))); square_radius = max(square_radius, square_distance); } } return sqrt(square_radius); }
/*
 * cloverdet_acc: energy difference of monomial `id` for the accept step.
 *
 * id  index into monomial_list
 * hf  Hamiltonian field; only hf->gaugefield is read here
 *
 * Sets the global parameters from the monomial, rebuilds and inverts the
 * clover term, obtains a starting guess from chrono_guess, solves with cg_her
 * against pf (sloppy precision flag cleared for the solve), applies Qm in
 * place and stores the squared norm of the result in energy1.  Global
 * parameters are reset to g_mu1 / 0 / g_kappa before returning.
 *
 * Returns energy1 - energy0.
 */
double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
  monomial * const mono = &monomial_list[id];
  const int sloppy_backup = g_sloppy_precision_flag;
  const double t_start = gettime();
  double t_stop;

  g_mu   = mono->mu;
  g_mu3  = mono->rho;
  g_c_sw = mono->c_sw;
  boundary(mono->kappa);

  sw_term((const su3**) hf->gaugefield, mono->kappa, mono->c_sw);
  sw_invert(EE, mono->mu);

  chrono_guess(mono->w_fields[0], mono->pf, mono->csg_field, mono->csg_index_array,
               mono->csg_N, mono->csg_n, VOLUME/2, mono->Qsq);

  /* the solve itself always runs with the sloppy flag cleared */
  g_sloppy_precision_flag = 0;
  mono->iter0 = cg_her(mono->w_fields[0], mono->pf, mono->maxiter, mono->accprec,
                       g_relative_precision_flag, VOLUME/2, mono->Qsq);
  mono->Qm(mono->w_fields[0], mono->w_fields[0]);
  g_sloppy_precision_flag = sloppy_backup;

  /* energy contribution of the resulting field */
  mono->energy1 = square_norm(mono->w_fields[0], VOLUME/2, 1);

  /* restore the global parameters */
  g_mu = g_mu1;
  g_mu3 = 0.;
  boundary(g_kappa);

  t_stop = gettime();
  if(g_proc_id == 0) {
    if(g_debug_level > 1) {
      printf("# Time for %s monomial acc step: %e s\n", mono->name, t_stop-t_start);
    }
    if(g_debug_level > 3) {
      printf("called cloverdet_acc for id %d dH = %1.10e\n",
             id, mono->energy1 - mono->energy0);
    }
  }
  return(mono->energy1 - mono->energy0);
}
/*
 * cloverdetratio_heatbath: heatbath step of the ratio monomial `id`.
 *
 * id  index into monomial_list
 * hf  Hamiltonian field; only hf->gaugefield is read here
 *
 * Draws a random field into g_spinor_field[4] and stores its squared norm in
 * energy0.  Qp is applied with g_mu3 = rho (result in g_spinor_field[3]);
 * then, with g_mu3 = rho2, cg_her solves against that result starting from a
 * zeroed pf.  The solution is registered with chrono_add_solution and
 * finally Qm is applied to pf in place.  g_spinor_field[3] and [4] are
 * overwritten as scratch space.  Global parameters are reset before
 * returning.
 */
void cloverdetratio_heatbath(const int id, hamiltonian_field_t * const hf) {
  monomial * const mono = &monomial_list[id];
  const int half_vol = VOLUME/2;

  g_mu   = mono->mu;
  g_c_sw = mono->c_sw;
  boundary(mono->kappa);

  mono->csg_n  = 0;
  mono->csg_n2 = 0;
  mono->iter0  = 0;
  mono->iter1  = 0;

  init_sw_fields();
  sw_term((const su3**) hf->gaugefield, mono->kappa, mono->c_sw);
  sw_invert(EE, mono->mu);

  /* random field and its energy */
  random_spinor_field(g_spinor_field[4], half_vol, mono->rngrepro);
  mono->energy0 = square_norm(g_spinor_field[4], half_vol, 1);

  /* apply Qp with the first shift ... */
  g_mu3 = mono->rho;
  mono->Qp(g_spinor_field[3], g_spinor_field[4]);

  /* ... and invert with the second one, starting from pf = 0 */
  g_mu3 = mono->rho2;
  zero_spinor_field(mono->pf, half_vol);
  mono->iter0 = cg_her(mono->pf, g_spinor_field[3], mono->maxiter, mono->accprec,
                       g_relative_precision_flag, half_vol, mono->Qsq);

  chrono_add_solution(mono->pf, mono->csg_field, mono->csg_index_array,
                      mono->csg_N, &mono->csg_n, half_vol);
  mono->Qm(mono->pf, mono->pf);

  if(g_debug_level > 3 && g_proc_id == 0) {
    printf("called cloverdetratio_heatbath for id %d \n", id);
  }

  /* restore the global parameters */
  g_mu3 = 0.;
  g_mu = g_mu1;
  boundary(g_kappa);
}
double Scene::compute_radius() const { double square_radius = 0.0; for (const_each<AssemblyInstanceContainer> i = impl->m_assembly_instances; i; ++i) { const AssemblyInstance& inst = *i; const GAABB3 inst_bbox = inst.compute_parent_bbox(); GVector3 corners[8]; inst_bbox.compute_corners(corners); for (size_t j = 0; j < 8; ++j) { const double square_distance = square_norm(corners[j]); if (square_radius < square_distance) square_radius = square_distance; } } return sqrt(square_radius); }
/*
 * cg_mms_tm: multi-shift CG.
 *
 * P         out  solution for the base mass (the start vector is forced to 0)
 * Q         in   source
 * max_iter  in   maximal number of CG iterations
 * eps_sq    in   target for the squared residue; compared directly when
 *                rel_prec == 0 and as eps_sq*|Q|^2 when rel_prec == 1
 * rel_prec  in   0: absolute precision, 1: relative precision
 * N         in   number of sites the linear algebra routines act on
 * f         in   operator, applied as f(out, in)
 *
 * One shifted system per entry of g_extra_masses is solved alongside the base
 * system, with shift sigma = g_extra_masses[im]^2 - g_mu^2.
 *
 * On convergence the base solution and all shifted solutions are multiplied
 * by (2*g_kappa)^2 (if g_kappa != 0) and written to files whose names are
 * built from SourceInfo; the function then returns iteration+1.  If
 * max_iter is exhausted the current base iterate is copied to P and -1 is
 * returned; nothing is written in that case.
 *
 * Changes with respect to the previous revision:
 *  - file names are built with snprintf so that a long SourceInfo.basename
 *    can no longer overflow the fixed-size buffer (a warning is printed if the
 *    name had to be truncated);
 *  - the static recurrence coefficients alpha_cg / beta_cg are reset on every
 *    call instead of carrying over the values of the previous solve;
 *  - write_spinor receives cg_mms_default_precision instead of a literal 32.
 */
int cg_mms_tm(spinor * const P, spinor * const Q, const int max_iter,
              double eps_sq, const int rel_prec, const int N, matrix_mult f) {

  static double normsq, pro, err, alpha_cg = 1., beta_cg = 0., squarenorm;
  int iteration, im, append = 0;
  int namelen;
  char filename[100];
  static double gamma, alpham1;
  int const cg_mms_default_precision = 32;
  double tmp_mu = g_mu;
  WRITER * writer = NULL;
  paramsInverterInfo *inverterInfo = NULL;
  paramsPropagatorFormat *propagatorFormat = NULL;
  spinor * temp_save; /* field currently being written (base or shifted solution) */
  spinor ** solver_field = NULL;
  const int nr_sf = 5;

  init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
  init_mms_tm(g_no_extra_masses);

  /* the statics would otherwise keep the values of a previous call */
  alpha_cg = 1.;
  beta_cg = 0.;

  /* currently only implemented for P=0 */
  zero_spinor_field(P, N);
  /* Value of the bare MMS-masses (\mu^2 - \mu_0^2) */
  for(im = 0; im < g_no_extra_masses; im++) {
    sigma[im] = g_extra_masses[im]*g_extra_masses[im] - g_mu*g_mu;
    assign(xs_mms_solver[im], P, N);
    assign(ps_mms_solver[im], Q, N);
    zitam1[im] = 1.0;
    zita[im] = 1.0;
    alphas[im] = 1.0;
    betas[im] = 0.0;
  }

  squarenorm = square_norm(Q, N, 1);
  assign(solver_field[0], P, N);
  /* normsp = square_norm(P, N, 1); */

  /* initialize residue r and search vector p */
  /* if(normsp == 0){ */
  /* currently only implemented for P=0 */
  if(1) {
    /* if a starting solution vector equal to zero is chosen */
    assign(solver_field[1], Q, N);
    assign(solver_field[2], Q, N);
    normsq = square_norm(Q, N, 1);
  }
  else{
    /* if a starting solution vector different from zero is chosen */
    f(solver_field[3], solver_field[0]);
    diff(solver_field[1], Q, solver_field[3], N);
    assign(solver_field[2], solver_field[1], N);
    normsq = square_norm(solver_field[2], N, 1);
  }

  /* main loop */
  for(iteration = 0; iteration < max_iter; iteration++) {

    /* Q^2*p and then (p,Q^2*p) */
    f(solver_field[4], solver_field[2]);
    pro = scalar_prod_r(solver_field[2], solver_field[4], N, 1);

    /* For the update of the coeff. of the shifted pol. we need alpha_cg(i-1)
       and alpha_cg(i). This is the reason why we need this double definition
       of alpha */
    alpham1 = alpha_cg;

    /* Compute alpha_cg(i+1) */
    alpha_cg = normsq/pro;
    for(im = 0; im < g_no_extra_masses; im++) {

      /* Now gamma is a temp variable that corresponds to zita(i+1) */
      gamma = zita[im]*alpham1/(alpha_cg*beta_cg*(1.-zita[im]/zitam1[im])
                                + alpham1*(1.+sigma[im]*alpha_cg));

      /* Now zita(i-1) is put equal to the old zita(i) */
      zitam1[im] = zita[im];
      /* Now zita(i+1) is updated */
      zita[im] = gamma;
      /* Update of alphas(i) = alpha_cg(i)*zita(i+1)/zita(i) */
      alphas[im] = alpha_cg*zita[im]/zitam1[im];
      /* Compute xs(i+1) = xs(i) + alphas(i)*ps(i) */
      assign_add_mul_r(xs_mms_solver[im], ps_mms_solver[im], alphas[im], N);
    }

    /* Compute x_(i+1) = x_i + alpha_cg(i+1) p_i */
    assign_add_mul_r(solver_field[0], solver_field[2], alpha_cg, N);
    /* Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i */
    assign_add_mul_r(solver_field[1], solver_field[4], -alpha_cg, N);

    /* Check whether the precision eps_sq is reached */
    err = square_norm(solver_field[1], N, 1);
    if(g_debug_level > 2 && g_proc_id == g_stdio_proc) {
      printf("CGMMS iteration: %d residue: %g\n", iteration, err);
      fflush( stdout );
    }

    if( ((err <= eps_sq) && (rel_prec == 0)) ||
        ((err <= eps_sq*squarenorm) && (rel_prec == 1)) ) {

      /* recompute the residue from the actual solution for reporting */
      assign(P, solver_field[0], N);
      f(solver_field[2], P);
      diff(solver_field[3], solver_field[2], Q, N);
      err = square_norm(solver_field[3], N, 1);
      if(g_debug_level > 0 && g_proc_id == g_stdio_proc) {
        printf("# CG MMS true residue at final iteration (%d) was %g.\n", iteration, err);
        fflush( stdout);
      }
      g_sloppy_precision = 0;
      g_mu = tmp_mu;

      /* save all the results of (Q^dagger Q)^(-1) \gamma_5 \phi */
      /* here ... */
      /* when im == -1 save the base mass*/
      for(im = -1; im < g_no_extra_masses; im++) {
        if(im==-1) {
          temp_save=solver_field[0];
        }
        else {
          temp_save=xs_mms_solver[im];
        }

        /* bounded formatting: basename is external input of unknown length */
        if(SourceInfo.type != 1) {
          if (PropInfo.splitted) {
            namelen = snprintf(filename, sizeof(filename),
                               "%s.%.4d.%.2d.%.2d.cgmms.%.2d.inverted",
                               SourceInfo.basename, SourceInfo.nstore,
                               SourceInfo.t, SourceInfo.ix, im+1);
          }
          else {
            namelen = snprintf(filename, sizeof(filename),
                               "%s.%.4d.%.2d.cgmms.%.2d.inverted",
                               SourceInfo.basename, SourceInfo.nstore,
                               SourceInfo.t, im+1);
          }
        }
        else {
          namelen = snprintf(filename, sizeof(filename),
                             "%s.%.4d.%.5d.cgmms.%.2d.0",
                             SourceInfo.basename, SourceInfo.nstore,
                             SourceInfo.sample, im+1);
        }
        if((namelen < 0 || (size_t) namelen >= sizeof(filename)) &&
           g_proc_id == g_stdio_proc) {
          fprintf(stderr,
                  "# cg_mms_tm: output file name does not fit into %d characters and was truncated\n",
                  (int) sizeof(filename) - 1);
        }

        if(g_kappa != 0) {
          mul_r(temp_save, (2*g_kappa)*(2*g_kappa), temp_save, N);
        }

        append = !PropInfo.splitted;

        construct_writer(&writer, filename, append);

        if (PropInfo.splitted || SourceInfo.ix == index_start) {
          /* Create the inverter info
             NOTE: always set to TWILSON=12 and 1 flavour (to be adjusted) */
          inverterInfo = construct_paramsInverterInfo(err, iteration+1, 12, 1);
          if (im == -1) {
            inverterInfo->cgmms_mass = inverterInfo->mu;
          }
          else {
            inverterInfo->cgmms_mass = g_extra_masses[im]/(2 * inverterInfo->kappa);
          }
          write_spinor_info(writer, PropInfo.format, inverterInfo, append);
          /* Create the propagatorFormat
             NOTE: always set to 1 flavour (to be adjusted) */
          propagatorFormat = construct_paramsPropagatorFormat(cg_mms_default_precision, 1);
          write_propagator_format(writer, propagatorFormat);
          free(inverterInfo);
          free(propagatorFormat);
        }
        /* solver_field[1] and [2] are free at this point and serve as the
           even/odd halves for the output */
        convert_lexic_to_eo(solver_field[2], solver_field[1], temp_save);
        write_spinor(writer, &solver_field[2], &solver_field[1], 1, cg_mms_default_precision);
        destruct_writer(writer);
      }
      finalize_solver(solver_field, nr_sf);
      return(iteration+1);
    }

    /* Compute beta_cg(i+1) = (r(i+1),r(i+1))/(r(i),r(i))
       Compute p(i+1) = r(i+1) + beta(i+1)*p(i) */
    beta_cg = err/normsq;
    assign_mul_add_r(solver_field[2], beta_cg, solver_field[1], N);
    normsq = err;

    /* Compute betas(i+1) = beta_cg(i)*(zita(i+1)*alphas(i))/(zita(i)*alpha_cg(i))
       Compute ps(i+1) = zita(i+1)*r(i+1) + betas(i+1)*ps(i) */
    for(im = 0; im < g_no_extra_masses; im++) {
      betas[im] = beta_cg*zita[im]*alphas[im]/(zitam1[im]*alpha_cg);
      assign_mul_add_mul_r(ps_mms_solver[im], solver_field[1], betas[im], zita[im], N);
    }
  }

  /* not converged within max_iter iterations */
  assign(P, solver_field[0], N);
  g_sloppy_precision = 0;
  finalize_solver(solver_field, nr_sf);
  return(-1);
}
/*
 * cr: iterative solver for f(P) = Q built on the (r, A r) recurrence below.
 *
 * P             in/out  start vector on entry, updated solution on exit
 * Q             in      source
 * m             in      maximal number of iterations
 * max_restarts  in      unused in this implementation
 * eps_sq        in      target for the squared residue; compared directly
 *                       when rel_prec == 0, as eps_sq*|Q|^2 when rel_prec == 1
 *                       (|Q|^2 is replaced by 1 if it is below 1e-32)
 * rel_prec      in      0: absolute precision, 1: relative precision
 * N             in      number of sites; selects the work-field size
 * precon        in      unused in this implementation
 * f             in      operator, applied as f(out, in)
 *
 * Returns the number of iterations performed if the residue criterion is met
 * (0 if the start vector already satisfies it), -1 otherwise.
 *
 * Previously the function returned -1 even when the loop was left because the
 * precision had been reached; the convergence state is now tracked and
 * reported to the caller.
 */
int cr(spinor * const P, spinor * const Q,
       const int m, const int max_restarts,
       const double eps_sq, const int rel_prec,
       const int N, const int precon, matrix_mult f) {

  int k, iter = 0;
  int converged = 0;
  double norm_sq, err;
  spinor * xi, * Axi, * chi, * Achi, *tmp;
  _Complex double alpha, beta;
  static _Complex double one = 1.0;
  double norm, rAr, newrAr;
  double atime, etime;
  spinor ** solver_field = NULL;
  const int nr_sf = 5;
  int save_sloppy = g_sloppy_precision;

  if(N == VOLUME) {
    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
  }
  else {
    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
  }

  atime = gettime();

  xi   = solver_field[0];
  Axi  = solver_field[1];
  chi  = solver_field[2];
  Achi = solver_field[3];
  tmp  = solver_field[4];

  norm_sq = square_norm(Q, N, 1);
  if(norm_sq < 1.e-32) {
    norm_sq = 1.;
  }

  /* initial residue chi = Q - f(P), search direction xi = chi */
  dfl_sloppy_prec = 0;
  f(tmp, P);
  diff(chi, Q, tmp, N);
  assign(xi, chi, N);
  f(Axi, xi);
  f(Achi, chi);
  rAr = scalar_prod(chi, Achi, N, 1);
  err = square_norm(chi, N, 1);
  if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
    finalize_solver(solver_field, nr_sf);
    return(iter);
  }

  for(k = 0; k < m; k++) {

    dfl_sloppy_prec = 1;

    norm = square_norm(Axi, N, 1);
    alpha = rAr/norm;
    assign_add_mul(P, xi, alpha, N);
    /* get the new residual */
    assign_diff_mul(chi, Axi, alpha, N);

    err = square_norm(chi, N, 1);
    iter ++;
    etime = gettime();
    if(g_proc_id == g_stdio_proc && g_debug_level > 3){
      printf("# CR: %d\t%g iterated residue, time spent %f s\n", iter, err, (etime - atime));
      fflush(stdout);
    }

    /* Precision reached, or iteration budget used up? */
    converged = ((err <= eps_sq) && (rel_prec == 0)) ||
                ((err <= eps_sq*norm_sq) && (rel_prec == 1));
    if(converged || (k == m-1)) {
      break;
    }

#ifdef _USE_HALFSPINOR
    if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*norm_sq) && (rel_prec == 1))) {
      if (g_sloppy_precision_flag == 1) {
        g_sloppy_precision = 1;
        if(g_debug_level > 2 && g_proc_id == g_stdio_proc) {
          printf("sloppy precision on\n");
          fflush( stdout);
        }
      }
    }
#endif

    f(Achi, chi);
    newrAr = scalar_prod(chi, Achi, N, 1);
    beta = newrAr/rAr;
    assign_mul_add_mul(xi, beta, chi, one, N);
    assign_mul_add_mul(Axi,beta, Achi, one, N);
    rAr = newrAr;
  }

  g_sloppy_precision = save_sloppy;
  finalize_solver(solver_field, nr_sf);
  /* report success with the iteration count, failure with -1 */
  return(converged ? iter : -1);
}
/*lambda: largest eigenvalue, k eigenvector */ int evamax(double *rz, int k, double q_off, double eps_sq) { static double ritz,norm0,normg,normg0,beta_cg; static double costh,sinth,cosd,sind,aaa,normp,xxx; static double xs1,xs2,xs3; int iteration; /* Initialize k to be gaussian */ random_spinor_field(g_spinor_field[k], VOLUME/2); norm0=square_norm(g_spinor_field[k], VOLUME/2, 1); /*normalize k */ assign_mul_bra_add_mul_r( g_spinor_field[k], 1./sqrt(norm0),0., g_spinor_field[k], VOLUME/2); Q_psi(DUM_SOLVER,k,q_off); Q_psi(DUM_SOLVER,DUM_SOLVER,q_off); /*compute the ritz functional */ /*put g on DUM_SOLVER+2 and p on DUM_SOLVER+1*/ ritz=scalar_prod_r(g_spinor_field[DUM_SOLVER], g_spinor_field[k], VOLUME/2, 1); zero_spinor_field(g_spinor_field[DUM_SOLVER+2],VOLUME/2); assign_add_mul_r_add_mul(g_spinor_field[DUM_SOLVER+2], g_spinor_field[DUM_SOLVER], g_spinor_field[k], 1., -ritz, VOLUME/2); assign(g_spinor_field[DUM_SOLVER+1], g_spinor_field[DUM_SOLVER+2], VOLUME/2); normg0=square_norm(g_spinor_field[DUM_SOLVER+2], VOLUME/2, 1); /* main loop */ for(iteration=1;iteration<=ITER_MAX_BCG;iteration++) { if(normg0 <= eps_sq) break; Q_psi(DUM_SOLVER+2,DUM_SOLVER+1,q_off); Q_psi(DUM_SOLVER+2,DUM_SOLVER+2,q_off); /* compute costh and sinth */ normp=square_norm(g_spinor_field[DUM_SOLVER+1], VOLUME/2, 1); xxx=scalar_prod_r(g_spinor_field[DUM_SOLVER+2], g_spinor_field[DUM_SOLVER+1], VOLUME/2, 1); xs1=0.5*(ritz+xxx/normp); xs2=0.5*(ritz-xxx/normp); normp=sqrt(normp); xs3=normg0/normp; aaa=sqrt(xs2*xs2+xs3*xs3); cosd=xs2/aaa; sind=xs3/aaa; if(cosd>=0.) 
{ costh=sqrt(0.5*(1.+cosd)); sinth=0.5*sind/costh; } else { sinth=sqrt(0.5*(1.-cosd)); costh=0.5*sind/sinth; } ritz=xs1+aaa; assign_add_mul_r_add_mul(g_spinor_field[k], g_spinor_field[k], g_spinor_field[DUM_SOLVER+1], costh-1., sinth/normp, VOLUME/2); assign_add_mul_r_add_mul(g_spinor_field[DUM_SOLVER], g_spinor_field[DUM_SOLVER], g_spinor_field[DUM_SOLVER+2], costh-1., sinth/normp, VOLUME/2); /* compute g */ zero_spinor_field(g_spinor_field[DUM_SOLVER+2],VOLUME/2); assign_add_mul_r_add_mul(g_spinor_field[DUM_SOLVER+2], g_spinor_field[DUM_SOLVER], g_spinor_field[k], 1., -ritz, VOLUME/2); /* calculate the norm of g' and beta_cg=costh g'^2/g^2 */ normg=square_norm(g_spinor_field[DUM_SOLVER+2], VOLUME/2, 1); beta_cg=costh*normg/normg0; if(beta_cg*costh*normp>20.*sqrt(normg)) beta_cg=0.; normg0=normg; /* compute the new value of p */ assign_add_mul_r(g_spinor_field[DUM_SOLVER+1], g_spinor_field[k], -scalar_prod_r(g_spinor_field[k], g_spinor_field[DUM_SOLVER+1], VOLUME/2), VOLUME/2, 1); assign_mul_add_r(g_spinor_field[DUM_SOLVER+1],beta_cg, g_spinor_field[DUM_SOLVER+2], VOLUME/2); /* restore the state of the iteration */ if(iteration%20==0) { /* readjust x */ xxx=sqrt(square_norm(g_spinor_field[k], VOLUME/2), 1); assign_mul_bra_add_mul_r( g_spinor_field[k], 1./xxx,0., g_spinor_field[k], VOLUME/2); Q_psi(DUM_SOLVER,k,q_off); Q_psi(DUM_SOLVER,DUM_SOLVER,q_off); /*compute the ritz functional */ ritz=scalar_prod_r(g_spinor_field[DUM_SOLVER], g_spinor_field[k], VOLUME/2, 1); /*put g on DUM_SOLVER+2 and p on DUM_SOLVER+1*/ zero_spinor_field(g_spinor_field[DUM_SOLVER+2],VOLUME/2); assign_add_mul_r_add_mul(g_spinor_field[DUM_SOLVER+2], g_spinor_field[DUM_SOLVER], g_spinor_field[k], 1., -ritz, VOLUME/2); normg0=square_norm(g_spinor_field[DUM_SOLVER+2], VOLUME/2, 1); /*subtract a linear combination of x and g from p to insure (x,p)=0 and (p,g)=(g,g) */ cosd=scalar_prod_r(g_spinor_field[k], g_spinor_field[DUM_SOLVER+1], VOLUME/2, 1); assign_add_mul_r(g_spinor_field[DUM_SOLVER+1], 
g_spinor_field[k], -cosd, VOLUME/2); cosd=scalar_prod_r(g_spinor_field[DUM_SOLVER+1], g_spinor_field[DUM_SOLVER+2], VOLUME/2, 1)-normg0; assign_add_mul_r(g_spinor_field[DUM_SOLVER+1], g_spinor_field[DUM_SOLVER+2], -cosd/sqrt(normg0), VOLUME/2); } } *rz=ritz; return iteration; }
void op_invert(const int op_id, const int index_start, const int write_prop) { operator * optr = &operator_list[op_id]; double atime = 0., etime = 0., nrm1 = 0., nrm2 = 0.; int i; optr->iterations = 0; optr->reached_prec = -1.; g_kappa = optr->kappa; boundary(g_kappa); atime = gettime(); if(optr->type == TMWILSON || optr->type == WILSON || optr->type == CLOVER) { g_mu = optr->mu; g_c_sw = optr->c_sw; if(optr->type == CLOVER) { if (g_cart_id == 0 && g_debug_level > 1) { printf("#\n# csw = %e, computing clover leafs\n", g_c_sw); } init_sw_fields(VOLUME); sw_term( (const su3**) g_gauge_field, optr->kappa, optr->c_sw); /* this must be EE here! */ /* to match clover_inv in Qsw_psi */ sw_invert(EE, optr->mu); } for(i = 0; i < 2; i++) { if (g_cart_id == 0) { printf("#\n# 2 kappa mu = %e, kappa = %e, c_sw = %e\n", g_mu, g_kappa, g_c_sw); } if(optr->type != CLOVER) { if(use_preconditioning){ g_precWS=(void*)optr->precWS; } else { g_precWS=NULL; } optr->iterations = invert_eo( optr->prop0, optr->prop1, optr->sr0, optr->sr1, optr->eps_sq, optr->maxiter, optr->solver, optr->rel_prec, 0, optr->even_odd_flag,optr->no_extra_masses, optr->extra_masses, optr->id ); /* check result */ M_full(g_spinor_field[4], g_spinor_field[5], optr->prop0, optr->prop1); } else { optr->iterations = invert_clover_eo(optr->prop0, optr->prop1, optr->sr0, optr->sr1, optr->eps_sq, optr->maxiter, optr->solver, optr->rel_prec, &g_gauge_field, &Qsw_pm_psi, &Qsw_minus_psi); /* check result */ Msw_full(g_spinor_field[4], g_spinor_field[5], optr->prop0, optr->prop1); } diff(g_spinor_field[4], g_spinor_field[4], optr->sr0, VOLUME / 2); diff(g_spinor_field[5], g_spinor_field[5], optr->sr1, VOLUME / 2); nrm1 = square_norm(g_spinor_field[4], VOLUME / 2, 1); nrm2 = square_norm(g_spinor_field[5], VOLUME / 2, 1); optr->reached_prec = nrm1 + nrm2; /* convert to standard normalisation */ /* we have to mult. by 2*kappa */ if (optr->kappa != 0.) 
{ mul_r(optr->prop0, (2*optr->kappa), optr->prop0, VOLUME / 2); mul_r(optr->prop1, (2*optr->kappa), optr->prop1, VOLUME / 2); } if (optr->solver != CGMMS && write_prop) /* CGMMS handles its own I/O */ optr->write_prop(op_id, index_start, i); if(optr->DownProp) { optr->mu = -optr->mu; } else break; } } else if(optr->type == DBTMWILSON || optr->type == DBCLOVER) { g_mubar = optr->mubar; g_epsbar = optr->epsbar; g_c_sw = 0.; if(optr->type == DBCLOVER) { g_c_sw = optr->c_sw; if (g_cart_id == 0 && g_debug_level > 1) { printf("#\n# csw = %e, computing clover leafs\n", g_c_sw); } init_sw_fields(VOLUME); sw_term( (const su3**) g_gauge_field, optr->kappa, optr->c_sw); sw_invert_nd(optr->mubar*optr->mubar-optr->epsbar*optr->epsbar); } for(i = 0; i < SourceInfo.no_flavours; i++) { if(optr->type != DBCLOVER) { optr->iterations = invert_doublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3, optr->sr0, optr->sr1, optr->sr2, optr->sr3, optr->eps_sq, optr->maxiter, optr->solver, optr->rel_prec); } else { optr->iterations = invert_cloverdoublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3, optr->sr0, optr->sr1, optr->sr2, optr->sr3, optr->eps_sq, optr->maxiter, optr->solver, optr->rel_prec); } g_mu = optr->mubar; if(optr->type != DBCLOVER) { M_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1); } else { Msw_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1); } assign_add_mul_r(g_spinor_field[DUM_DERI+1], optr->prop2, -optr->epsbar, VOLUME/2); assign_add_mul_r(g_spinor_field[DUM_DERI+2], optr->prop3, -optr->epsbar, VOLUME/2); g_mu = -g_mu; if(optr->type != DBCLOVER) { M_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3); } else { Msw_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3); } assign_add_mul_r(g_spinor_field[DUM_DERI+3], optr->prop0, -optr->epsbar, VOLUME/2); assign_add_mul_r(g_spinor_field[DUM_DERI+4], 
optr->prop1, -optr->epsbar, VOLUME/2); diff(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], optr->sr0, VOLUME/2); diff(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+2], optr->sr1, VOLUME/2); diff(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+3], optr->sr2, VOLUME/2); diff(g_spinor_field[DUM_DERI+4], g_spinor_field[DUM_DERI+4], optr->sr3, VOLUME/2); nrm1 = square_norm(g_spinor_field[DUM_DERI+1], VOLUME/2, 1); nrm1 += square_norm(g_spinor_field[DUM_DERI+2], VOLUME/2, 1); nrm1 += square_norm(g_spinor_field[DUM_DERI+3], VOLUME/2, 1); nrm1 += square_norm(g_spinor_field[DUM_DERI+4], VOLUME/2, 1); optr->reached_prec = nrm1; g_mu = g_mu1; /* For standard normalisation */ /* we have to mult. by 2*kappa */ mul_r(g_spinor_field[DUM_DERI], (2*optr->kappa), optr->prop0, VOLUME/2); mul_r(g_spinor_field[DUM_DERI+1], (2*optr->kappa), optr->prop1, VOLUME/2); mul_r(g_spinor_field[DUM_DERI+2], (2*optr->kappa), optr->prop2, VOLUME/2); mul_r(g_spinor_field[DUM_DERI+3], (2*optr->kappa), optr->prop3, VOLUME/2); /* the final result should be stored in the convention used in */ /* hep-lat/0606011 */ /* this requires multiplication of source with */ /* (1+itau_2)/sqrt(2) and the result with (1-itau_2)/sqrt(2) */ mul_one_pm_itau2(optr->prop0, optr->prop2, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+2], -1., VOLUME/2); mul_one_pm_itau2(optr->prop1, optr->prop3, g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+3], -1., VOLUME/2); /* write propagator */ if(write_prop) optr->write_prop(op_id, index_start, i); mul_r(optr->prop0, 1./(2*optr->kappa), g_spinor_field[DUM_DERI], VOLUME/2); mul_r(optr->prop1, 1./(2*optr->kappa), g_spinor_field[DUM_DERI+1], VOLUME/2); mul_r(optr->prop2, 1./(2*optr->kappa), g_spinor_field[DUM_DERI+2], VOLUME/2); mul_r(optr->prop3, 1./(2*optr->kappa), g_spinor_field[DUM_DERI+3], VOLUME/2); /* mirror source, but not for volume sources */ if(i == 0 && SourceInfo.no_flavours == 2 && SourceInfo.type != 1) { if (g_cart_id == 0) { fprintf(stdout, "# 
Inversion done in %d iterations, squared residue = %e!\n", optr->iterations, optr->reached_prec); } mul_one_pm_itau2(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+2], optr->sr0, optr->sr2, -1., VOLUME/2); mul_one_pm_itau2(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+3], optr->sr1, optr->sr3, -1., VOLUME/2); mul_one_pm_itau2(optr->sr0, optr->sr2, g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI], +1., VOLUME/2); mul_one_pm_itau2(optr->sr1, optr->sr3, g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+1], +1., VOLUME/2); } /* volume sources need only one inversion */ else if(SourceInfo.type == 1) i++; } } else if(optr->type == OVERLAP) { g_mu = 0.; m_ov=optr->m; eigenvalues(&optr->no_ev, 5000, optr->ev_prec, 0, optr->ev_readwrite, nstore, optr->even_odd_flag); /* ov_check_locality(); */ /* index_jd(&optr->no_ev_index, 5000, 1.e-12, optr->conf_input, nstore, 4); */ ov_n_cheby=optr->deg_poly; if(use_preconditioning==1) g_precWS=(void*)optr->precWS; else g_precWS=NULL; if(g_debug_level > 3) ov_check_ginsparg_wilson_relation_strong(); invert_overlap(op_id, index_start); if(write_prop) optr->write_prop(op_id, index_start, 0); } etime = gettime(); if (g_cart_id == 0 && g_debug_level > 0) { fprintf(stdout, "# Inversion done in %d iterations, squared residue = %e!\n", optr->iterations, optr->reached_prec); fprintf(stdout, "# Inversion done in %1.2e sec. \n", etime - atime); } return; }
/* P output = solution , Q input = source */ int mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f, matrix_mult32 f32) { int i = 0, iter = 0, j = 0; float sqnrm = 0., sqnrm2, squarenorm; float pro, err, alpha_cg, beta_cg; double sourcesquarenorm, sqnrm_d, squarenorm_d; spinor *delta, *y, *xhigh; spinor32 *x, *stmp; spinor ** solver_field = NULL; spinor32 ** solver_field32 = NULL; const int nr_sf = 3; const int nr_sf32 = 4; int max_inner_it = mixcg_maxinnersolverit; int N_outer = max_iter/max_inner_it; //to be on the save side we allow at least 10 outer iterations if(N_outer < 10) N_outer = 10; int save_sloppy = g_sloppy_precision_flag; double atime, etime, flops; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32); } squarenorm_d = square_norm(Q, N, 1); sourcesquarenorm = squarenorm_d; sqnrm_d = squarenorm_d; delta = solver_field[0]; y = solver_field[1]; xhigh = solver_field[2]; x = solver_field32[3]; assign(delta, Q, N); //set solution to zero zero_spinor_field(P, N); atime = gettime(); for(i = 0; i < N_outer; i++) { /* main CG loop in lower precision */ zero_spinor_field_32(x, N); zero_spinor_field_32(solver_field32[0], N); assign_to_32(solver_field32[1], delta, N); assign_to_32(solver_field32[2], delta, N); sqnrm = (float) sqnrm_d; sqnrm2 = sqnrm; /*inner CG loop */ for(j = 0; j <= max_inner_it; j++) { f32(solver_field32[0], solver_field32[2]); pro = scalar_prod_r_32(solver_field32[2], solver_field32[0], N, 1); alpha_cg = sqnrm2 / pro; assign_add_mul_r_32(x, solver_field32[2], alpha_cg, N); assign_mul_add_r_32(solver_field32[0], -alpha_cg, solver_field32[1], N); err = square_norm_32(solver_field32[0], N, 1); if(g_proc_id == g_stdio_proc 
&& g_debug_level > 2) { printf("inner CG: %d res^2 %g\n", iter+j, err); fflush(stdout); } //if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))){ if((err <= mixcg_innereps*sqnrm)|| (j==max_inner_it) || ((1.3*err <= eps_sq) && (rel_prec == 0)) || ((1.3*err <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) { break; } beta_cg = err / sqnrm2; assign_mul_add_r_32(solver_field32[2], beta_cg, solver_field32[0], N); stmp = solver_field32[0]; solver_field32[0] = solver_field32[1]; solver_field32[1] = stmp; sqnrm2 = err; } /* end inner CG loop */ iter += j; /* we want to apply a true double matrix with f(y,P) -> set sloppy off here*/ g_sloppy_precision_flag = 0; /* calculate defect in double precision */ assign_to_64(xhigh, x, N); add(P, P, xhigh, N); f(y, P); diff(delta, Q, y, N); sqnrm_d = square_norm(delta, N, 1); if(g_debug_level > 2 && g_proc_id == 0) { printf("mixed CG: last inner residue: %g\t\n", err); printf("mixed CG: true residue %d %g\t\n",iter, sqnrm_d); fflush(stdout); } /* here we can reset it to its initial value*/ g_sloppy_precision_flag = save_sloppy; if(((sqnrm_d <= eps_sq) && (rel_prec == 0)) || ((sqnrm_d <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) { etime = gettime(); if(g_debug_level > 0 && g_proc_id == 0) { if(N != VOLUME){ /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ /* 2*1608.0 because the linalg is over VOLUME/2 */ flops = (2*(2*1608.0+2*3*4) + 2*3*4 + iter*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f; printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); printf("# mixed CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime)); } else{ /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ flops = (2*(1608.0+2*3*4) + 2*3*4 + iter*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6f; printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); printf("# mixed CG: flopcount (for 
non-e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime)); } } finalize_solver(solver_field, nr_sf); finalize_solver_32(solver_field32, nr_sf32); return(iter+i); } iter++; } finalize_solver(solver_field, nr_sf); finalize_solver_32(solver_field32, nr_sf32); return(-1); }
/*
 * gcr - restarted GCR(m) solver for f(P) = Q.
 *
 * P            in: initial guess, out: updated approximate solution
 * Q            right-hand side
 * m            number of search directions built per restart cycle
 * max_restarts maximum number of restart cycles
 * eps_sq       squared tolerance on the residual
 * rel_prec     0: compare the squared residual against eps_sq,
 *              1: compare it against eps_sq * |Q|^2
 * N            number of sites handled by the linear algebra routines
 * precon       0: search direction is the residual itself,
 *              otherwise: search direction is Msap_eo applied to the residual
 * f            operator, called as f(out, in)
 *
 * Returns the number of inner iterations performed so far when the true
 * residual, recomputed at the start of a restart cycle, satisfies the
 * tolerance; returns -1 otherwise.
 *
 * Note: the inner loop only monitors the iterated residual.  A cycle that
 * converges there is recognised as converged by the true-residual check of
 * the NEXT cycle, so convergence reached in the very last cycle still ends
 * in a return value of -1.
 *
 * The work arrays xi, chi, a, b and c are not declared in this function;
 * presumably init_gcr() sizes them for m vectors (TODO confirm).
 */
int gcr(spinor * const P, spinor * const Q,
        const int m, const int max_restarts,
        const double eps_sq, const int rel_prec,
        const int N, const int precon, matrix_mult f) {

  int k, l, restart, i, iter = 0;
  double norm_sq, err;
  spinor * rho, * tmp;
  complex ctmp;
  spinor ** solver_field = NULL;
  const int nr_sf = 2;

  if(N == VOLUME) {
    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
  }
  else {
    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
  }

  rho = solver_field[0];
  tmp = solver_field[1];

  init_gcr(m, N+RAND);

  /* a (numerically) vanishing source would make the relative criterion
     meaningless, so fall back to a unit reference norm */
  norm_sq = square_norm(Q, N, 1);
  if(norm_sq < 1.e-32) {
    norm_sq = 1.;
  }

  for(restart = 0; restart < max_restarts; restart++) {
    /* true residual rho = Q - f(P) */
    dfl_sloppy_prec = 0;
    f(tmp, P);
    diff(rho, Q, tmp, N);
    err = square_norm(rho, N, 1);
    if(g_proc_id == g_stdio_proc && g_debug_level > 2){
      printf("GCR: iteration number: %d, true residue: %g\n", iter, err);
      fflush(stdout);
    }
    if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
      finalize_solver(solver_field, nr_sf);
      return(iter);
    }

    /* Without at least one search direction the loop below never runs, but
       the restart code would still read c[0] and b[0] and update P with
       them.  Leave P untouched and report failure instead. */
    if(m < 1) {
      break;
    }

    for(k = 0; k < m; k++) {
      if(precon == 0) {
        assign(xi[k], rho, N);
      }
      else {
        zero_spinor_field(xi[k], N);
        Msap_eo(xi[k], rho, 6);
      }

      dfl_sloppy_prec = 1;
      dfl_little_D_prec = 1.e-12;
      f(tmp, xi[k]);

      /* tmp will become chi[k]: remove its components along chi[0..k-1] */
      for(l = 0; l < k; l++) {
        a[l][k] = scalar_prod(chi[l], tmp, N, 1);
        assign_diff_mul(tmp, chi[l], a[l][k], N);
      }
      b[k] = sqrt(square_norm(tmp, N, 1));
      mul_r(chi[k], 1./b[k], tmp, N);
      c[k] = scalar_prod(chi[k], rho, N, 1);
      assign_diff_mul(rho, chi[k], c[k], N);
      err = square_norm(rho, N, 1);
      iter ++;
      if(g_proc_id == g_stdio_proc && g_debug_level > 0){
        if(rel_prec == 1) printf("# GCR: %d\t%g >= %g iterated residue\n", iter, err, eps_sq*norm_sq);
        else printf("# GCR: %d\t%g >= %g iterated residue\n", iter, err, eps_sq);
        fflush(stdout);
      }
      /* Precision reached?  Break explicitly (also on the last direction)
         so that k still indexes the last direction built. */
      if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) {
        break;
      }
    }

    /* prepare for restart: back-substitute for the coefficients of the
       xi[l] and accumulate the correction into P */
    _mult_real(c[k], c[k], 1./b[k]);
    assign_add_mul(P, xi[k], c[k], N);
    for(l = k-1; l >= 0; l--) {
      for(i = l+1; i <= k; i++) {
        _mult_assign_complex(ctmp, a[l][i], c[i]);
        /* c[l] -= ctmp */
        _diff_complex(c[l], ctmp);
      }
      _mult_real(c[l], c[l], 1./b[l]);
      assign_add_mul(P, xi[l], c[l], N);
    }
  }

  finalize_solver(solver_field, nr_sf);
  return(-1);
}
/*
 * incr_eigcg - solve f(x) = b with eigCG, deflating with eigenvectors that
 * are accumulated over a sequence of right-hand sides.
 *
 * The function keeps its state (work fields, accumulated vectors evecs, the
 * projected matrix H, and the counters ncurRHS / ncurEvals) in static
 * variables.  Memory is allocated on the first call of a sequence and
 * released after the call that handles system number nrhs.
 *
 * N               number of sites passed to the linear algebra routines
 * nrhs            number of systems in the sequence
 * nrhs1           systems 1..nrhs1 use eps_sq1, the remaining ones eps_sq
 * x               in: initial guess, out: solution
 * b               right-hand side
 * ldh             leading dimension of H and HU; also the maximum number of
 *                 vectors stored in evecs
 * f               operator, called as f(out, in)
 * eps_sq1, eps_sq squared tolerances (see nrhs1)
 * restart_eps_sq  restart tolerance handed to eigcg; forced to 0 when
 *                 nev == 0 or when convergence is expected before the next
 *                 restart
 * rand_guess_opt  not used inside this function
 * rel_prec        0: absolute, otherwise relative stopping criterion
 * maxit           maximum number of iterations for this system
 * nev             number of new vectors requested from eigcg for this
 *                 system; clamped to the remaining capacity ldh - ncurEvals
 * v_max           number of columns of the search basis V
 *
 * Returns the iteration count reported by eigcg.  If g_debug_level > 0 and
 * eigcg ended with a nonzero flag, that flag is returned instead and neither
 * the eigenvector update nor the end-of-sequence cleanup is performed.
 */
int incr_eigcg(const int N, const int nrhs, const int nrhs1, spinor * const x, spinor * const b,
               const int ldh, matrix_mult f, const double eps_sq1, const double eps_sq, double restart_eps_sq,
               const int rand_guess_opt, const int rel_prec, const int maxit, int nev, const int v_max)
{
  /* Static variables and arrays. */
  static spinor **solver_field;       /* 4 spinor fields */
  static int ncurEvals=0;             /* current number of stored eigenvectors */
  static int ncurRHS=0;               /* current number of the system being solved */
  static int evecs_allocated=0;       /* nonzero while the deflation work arrays below are allocated */

  static spinor **evecs;              /* accumulated eigenvectors for deflation */

  static void *_evals;
  static double *evals;               /* Ritz values */

  static void *_v;
  static spinor *V;                   /* work array for the eigenvector search basis in eigCG */

  static void *_h;
  static _Complex double *H;          /* the ncurEvals^2 matrix H = evecs' * A * evecs */

  static void *_hu;
  static _Complex double *HU;         /* copy of H: overwritten by zgesv and zheev */

  static void *_initwork;
  static _Complex double *initwork;   /* vector of size ldh used by init-CG */

  static void *_ework;
  static _Complex double *ework;      /* work array of esize entries handed to eigcg */

  static void *_work;
  static _Complex double *work;       /* zheev workspace (lwork entries) */

  static void *_rwork;
  static double *rwork;               /* zheev real workspace */

  static void *_IPIV;
  static int *IPIV;                   /* pivot indices produced by zgesv */

  /* some constants */
  char cU='U'; char cN='N'; char cV='V';
  _Complex double tpone= 1.0e+00;
  _Complex double tzero= 0.0e+00;

  /* Timing vars */
  double wt1,wt2,wE,wI;
  double eps_sq_used;

  /* Variables */
  double machEps = 1e-15;
  double normb, normsq, tmpd,tmpd2;
  _Complex double tempz;
  int i,j, ONE = 1;
  int tmpsize,tmpi,info=0;
  int numIts, flag, nAdded, nev_used;
  int maxit_remain;
  int esize,nrsf;

  int parallel; /* for parallel processing of the scalar products */

  /* leading dimension for spinor vectors */
  int LDN;
  if(N==VOLUME)
    LDN = VOLUMEPLUSRAND;
  else
    LDN = VOLUMEPLUSRAND/2;

#ifdef MPI
  parallel=1;
#else
  parallel=0;
#endif

  /* think more about this */
  esize=2*12*N+4*nev*nev; /* fixed size for ework used for restarting in eigcg */

  nrsf=4; /* number of solver fields */

  int lwork=3*ldh;

  double cur_res; /* current residual squared (initial value will be computed in eigcg) */

  /* increment the RHS counter */
  ncurRHS = ncurRHS +1;

  /* set the tolerance to be used for this right-hand side */
  if(ncurRHS > nrhs1){
    eps_sq_used = eps_sq;
  }
  else{
    eps_sq_used = eps_sq1;
  }

  if(ncurRHS==1) /* If this is the first system, allocate needed memory for the solver */
  {
    init_solver_field(&solver_field, LDN, nrsf);
  }

  if(nev==0){ /* incremental eigcg is used as a cg solver. No need to restart forcing no-restart */
    if(g_proc_id == g_stdio_proc && g_debug_level > 0) {
      fprintf(stdout, "CG won't be restarted in this mode since no deflation will take place (nev=0)\n");
      fflush(stdout);
    }
    restart_eps_sq=0.0;
  }

  if((ncurRHS==1) && (nev >0) ) /* first right-hand side and eigenvectors are needed: allocate needed memory */
  {
    init_solver_field(&evecs, LDN, ldh);

#if (defined SSE || defined SSE2 || defined SSE3)
    /* Extra elements are needed for alignment */
    _v = calloc(LDN*v_max+ALIGN_BASE,sizeof(spinor));
    V = (spinor *)(((unsigned long int)(_v)+ALIGN_BASE)&~ALIGN_BASE);

    _h=calloc(ldh*ldh+ALIGN_BASE,sizeof(_Complex double ));
    H = (_Complex double *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE);

    _hu=calloc(ldh*ldh+ALIGN_BASE,sizeof(_Complex double ));
    HU = (_Complex double *)(((unsigned long int)(_hu)+ALIGN_BASE)&~ALIGN_BASE);

    _ework = calloc(esize+ALIGN_BASE,sizeof(_Complex double ));
    ework=(_Complex double *)(((unsigned long int)(_ework)+ALIGN_BASE)&~ALIGN_BASE);

    _initwork = calloc(ldh+ALIGN_BASE,sizeof(_Complex double ));
    initwork = (_Complex double *)(((unsigned long int)(_initwork)+ALIGN_BASE)&~ALIGN_BASE);

    _work = calloc(lwork+ALIGN_BASE,sizeof(_Complex double ));
    work = (_Complex double *)(((unsigned long int)(_work)+ALIGN_BASE)&~ALIGN_BASE);

    _rwork = calloc(3*ldh+ALIGN_BASE,sizeof(double));
    rwork = (double *)(((unsigned long int)(_rwork)+ALIGN_BASE)&~ALIGN_BASE);

    _IPIV = calloc(ldh+ALIGN_BASE,sizeof(int));
    IPIV = (int *)(((unsigned long int)(_IPIV)+ALIGN_BASE)&~ALIGN_BASE);

    _evals = calloc(ldh+ALIGN_BASE,sizeof(double));
    evals = (double *)(((unsigned long int)(_evals)+ALIGN_BASE)&~ALIGN_BASE);
#else
    V = (spinor *) calloc(LDN*v_max,sizeof(spinor));
    H = calloc(ldh*ldh, sizeof(_Complex double ));
    HU= calloc(ldh*ldh, sizeof(_Complex double ));
    initwork = calloc(ldh, sizeof(_Complex double ));
    ework = calloc(esize, sizeof(_Complex double ));
    work = calloc(lwork,sizeof(_Complex double ));
    rwork= calloc(3*ldh,sizeof(double));
    IPIV = calloc(ldh, sizeof(int));
    evals = (double *) calloc(ldh, sizeof(double));
#endif

    /* remember the allocation so that the cleanup at the end of the
       sequence does not depend on the (possibly clamped) value of nev */
    evecs_allocated = 1;
  } /* if(ncurRHS==1) */

  if(g_proc_id == g_stdio_proc && g_debug_level > 0) {
    fprintf(stdout, "System %d, eps_sq %e\n",ncurRHS,eps_sq_used);
    fflush(stdout);
  }

  /*---------------------------------------------------------------*/
  /* Call eigCG until this right-hand side converges               */
  /*---------------------------------------------------------------*/
  wE = 0.0; wI = 0.0;     /* Start accumulator timers */
  flag = -1;              /* First time through. Run eigCG regularly */
  maxit_remain = maxit;   /* Initialize Max and current # of iters */
  numIts = 0;

  while( flag == -1 || flag == 3)
  {
    if(ncurEvals > 0)
    {
      /* --------------------------------------------------------- */
      /* Perform init-CG with evecs vectors                        */
      /* xinit = xinit + evecs*Hinv*evec'*(b-Ax0)                  */
      /* --------------------------------------------------------- */
      wt1 = gettime();

      /* r0=b-Ax0; skip the operator application for a zero guess */
      normsq = square_norm(x,N,parallel);
      if(normsq>0.0)
      {
        f(solver_field[0],x);                       /* solver_field[0]= A*x */
        diff(solver_field[1],b,solver_field[0],N);  /* solver_field[1]=b-A*x */
      }
      else
        assign(solver_field[1],b,N);                /* solver_field[1]=b */

      /* apply the deflation using init-CG */
      /* evecs'*(b-Ax) */
      for(i=0; i<ncurEvals; i++)
      {
        initwork[i]= scalar_prod(evecs[i],solver_field[1],N,parallel);
      }

      /* solve the linear system H y = c */
      tmpsize=ldh*ncurEvals;
      _FT(zcopy) (&tmpsize,H,&ONE,HU,&ONE); /* copy H into HU */
      _FT(zgesv) (&ncurEvals,&ONE,HU,&ldh,IPIV,initwork,&ldh,&info);

      if(info != 0)
      {
        if(g_proc_id == g_stdio_proc) {
          fprintf(stderr, "Error in ZGESV:, info =  %d\n",info);
          fflush(stderr);
        }
        exit(1);
      }

      /* x = x + evecs*inv(H)*evecs'*r */
      for(i=0; i<ncurEvals; i++)
      {
        assign_add_mul(x,evecs[i],initwork[i],N);
      }

      /* compute elapsed time and add to accumulator */
      wt2 = gettime();
      wI = wI + wt2-wt1;
    } /* if(ncurEvals > 0) */

    /* ------------------------------------------------------------ */
    /* Adjust nev for eigcg according to available ldh/restart      */
    /* ------------------------------------------------------------ */
    if (flag == 3) { /* restart with the same rhs, set nev_used = 0 */
      nev_used = 0;
      /* if convergence seems before next restart do not restart again */
      if(rel_prec)
      {
        if (cur_res*(restart_eps_sq) < eps_sq*normb*normb)
          restart_eps_sq=0.0;
      }
      else
      {
        if (cur_res*(restart_eps_sq) < eps_sq)
          restart_eps_sq=0.0;
      } /* if(rel_prec) */
    }
    else
    {
      /* First time through this rhs. Find nev evecs */
      /* limited by the ldh evecs we can store in total */
      if (ldh-ncurEvals < nev)
        nev = ldh - ncurEvals;
      nev_used = nev;
    }

    /* ------------------------------------------------------------ */
    /* Solve Ax = b with x initial guess                            */
    /* ------------------------------------------------------------ */
    wt1 = gettime();

    eigcg( N, LDN, x, b, &normb, eps_sq_used, restart_eps_sq, rel_prec, maxit_remain,
           &numIts, &cur_res, &flag, solver_field, f,
           nev_used, v_max, V, esize, ework);

    wt2 = gettime();
    wE = wE + wt2-wt1;

    /* if flag == 3 update the remain max number of iterations */
    maxit_remain = maxit - numIts;
  }
  /* end while (flag ==-1 || flag == 3) */
  /* ------------------------------------------------ */

  /* ---------- */
  /* Reporting  */
  /* ---------- */
  /* compute the exact residual */
  f(solver_field[0],x);                       /* solver_field[0]= A*x */
  diff(solver_field[1],b,solver_field[0],N);  /* solver_field[1]=b-A*x */
  normsq=square_norm(solver_field[1],N,parallel);
  if(g_debug_level > 0 && g_proc_id == g_stdio_proc)
  {
    fprintf(stdout, "For this rhs:\n");
    fprintf(stdout, "Total initCG Wallclock : %-f\n", wI);
    fprintf(stdout, "Total eigpcg Wallclock : %-f\n", wE);
    fprintf(stdout, "Iterations: %-d\n", numIts);
    fprintf(stdout, "Residual: %e, Actual Resid of LinSys  : %e\n", cur_res,normsq);
    fflush(stdout);
  }

  /* Error exit.  This used to sit inside the stdio-rank-only block above,
     so a single rank left the function while the others went on to the
     update below, which applies f and computes scalar products.  Now every
     rank takes the same path, and stderr is flushed before returning.
     NOTE(review): assumes eigcg reports the same flag on every rank - confirm.
     NOTE(review): the dependence on g_debug_level is kept for backward
     compatibility; with g_debug_level == 0 a nonzero flag is still ignored. */
  if(g_debug_level > 0 && flag != 0) {
    if(g_proc_id == g_stdio_proc) {
      fprintf(stderr, "Error: eigcg returned with nonzero exit status\n");
      fflush(stderr);
    }
    return flag;
  }

  /* ------------------------------------------------------------------- */
  /* Update the evecs and the factorization of evecs'*A*evecs            */
  /* ------------------------------------------------------------------- */
  if (nev > 0)
  {
    wt1 = gettime();

    /* Append new Ritz vectors to the basis and orthogonalize them to evecs */
    for(i=0; i<nev_used; i++)
      assign(evecs[i+ncurEvals],&V[i*LDN],N);

    nAdded = ortho_new_vectors(evecs,N,ncurEvals,nev_used,machEps);

    /* expand H: fill the new columns and mirror them into the new rows */
    for(j=ncurEvals; j< (ncurEvals+nAdded); j++)
    {
      f(solver_field[0],evecs[j]);

      for(i=0; i<=j; i++)
      {
        H[i+j*ldh] = scalar_prod(evecs[i],solver_field[0],N,parallel);
        H[j+i*ldh]= conj(H[i+j*ldh]);
      }
    }

    /* update the number of vectors in the basis */
    ncurEvals = ncurEvals + nAdded;

    /* ---------- */
    /* Reporting  */
    /* ---------- */
    wt2 = gettime();

    if(g_proc_id == g_stdio_proc && g_debug_level > 0)
    {
      fprintf(stdout,"ncurRHS %d\n",ncurRHS);
      fprintf(stdout,"ncurEvals %d \n",ncurEvals);
      fprintf(stdout,"Update\n");
      fprintf(stdout,"Added %d vecs\n",nAdded);
      fprintf(stdout,"U Wallclock : %-f\n", wt2-wt1);
      fprintf(stdout,"Note: Update Wall time doesn't include time for computing eigenvalues and their residuals.\n");
      fflush(stdout);
    }

    if(g_debug_level > 3) /* compute eigenvalues and their residuals if requested */
    {
      /* copy H into HU */
      tmpsize=ldh*ncurEvals;
      _FT(zcopy) (&tmpsize,H,&ONE,HU,&ONE);

      /* compute eigenvalues and eigenvectors of HU (using V and spinor fields as tmp work spaces) */
      _FT(zheev)(&cV, &cU, &ncurEvals, HU, &ldh, evals, work, &lwork, rwork, &info,1,1);

      if(info != 0)
      {
        if(g_proc_id == g_stdio_proc)
        {
          fprintf(stderr,"Error in ZHEEV:, info =  %d\n",info);
          fflush(stderr);
        }
        exit(1);
      }

      /* compute residuals and print out results */
      for(i=0; i<ncurEvals; i++)
      {
        /* solver_field[0] = evecs * (i-th eigenvector of HU) */
        tmpi=12*N;
        tmpsize=12*LDN;

        _FT(zgemv)(&cN,&tmpi,&ncurEvals,&tpone,(_Complex double *)evecs[0],&tmpsize,
                   &HU[i*ldh], &ONE,&tzero,(_Complex double *) solver_field[0],&ONE,1);

        normsq=square_norm(solver_field[0],N,parallel);

        f(solver_field[1],solver_field[0]);

        /* Rayleigh quotient and residual norm of the Ritz pair */
        tempz = scalar_prod(solver_field[0],solver_field[1],N,parallel);
        evals[i] = creal(tempz)/normsq;

        mul_r(solver_field[2],evals[i],solver_field[0],N);
        diff(solver_field[3],solver_field[1],solver_field[2], N);

        tmpd2= square_norm(solver_field[3],N,parallel);
        tmpd= sqrt(tmpd2/normsq);

        if(g_proc_id == g_stdio_proc)
        {fprintf(stdout,"RR Eval[%d]: %22.15E rnorm: %22.15E\n", i+1, evals[i], tmpd); fflush(stdout);}
      }
    } /* if(g_debug_level > 3) */
  } /* if(nev>0) */

  /*--------------------------------------*/
  /* free memory that is no longer needed */
  /* and reset ncurRHS and ncurEvals      */
  /*--------------------------------------*/
  /* Latch the test BEFORE resetting ncurRHS: testing ncurRHS == nrhs again
     after the reset is false for any nrhs > 0, so the eigenvector arrays
     were never released. */
  const int last_system = (ncurRHS == nrhs);

  if(last_system) /* this was the last system to be solved */
  {
    ncurRHS=0;
    ncurEvals=0;
    finalize_solver(solver_field,nrsf);
  }

  if(last_system && evecs_allocated) /* last system and memory for eigenvector computation was allocated */
  {
    finalize_solver(evecs,ldh);
#if (defined SSE || defined SSE2 || defined SSE3)
    free(_v);
    free(_h);
    free(_hu);
    free(_ework);
    free(_initwork);
    free(_IPIV);
    free(_evals);
    free(_rwork);
    free(_work);
#else
    free(V);
    free(H);
    free(HU);
    free(ework);
    free(initwork);
    free(IPIV);
    free(evals);
    free(rwork);
    free(work);
#endif
    evecs_allocated = 0;
  }

  return numIts;
}
void DirectLightingIntegrator::add_emitting_triangle_sample_contribution( const LightSample& sample, const MISHeuristic mis_heuristic, const Dual3d& outgoing, Spectrum& radiance, SpectrumStack& aovs) const { const Material* material = sample.m_triangle->m_material; const Material::RenderData& material_data = material->get_render_data(); const EDF* edf = material_data.m_edf; // No contribution if we are computing indirect lighting but this light does not cast indirect light. if (m_indirect && !(edf->get_flags() & EDF::CastIndirectLight)) return; // Compute the incoming direction in world space. Vector3d incoming = sample.m_point - m_point; // Cull light samples behind the shading surface if the BSDF is either reflective or transmissive, // but not both. if (m_bsdf.get_type() != BSDF::AllBSDFTypes) { double cos_in = dot(incoming, m_shading_basis.get_normal()); if (m_bsdf.get_type() == BSDF::Transmissive) cos_in = -cos_in; if (cos_in <= 0.0) return; } // No contribution if the shading point is behind the light. double cos_on = dot(-incoming, sample.m_shading_normal); if (cos_on <= 0.0) return; // Compute the transmission factor between the light sample and the shading point. const double transmission = m_shading_context.get_tracer().trace_between( m_shading_point, sample.m_point, VisibilityFlags::ShadowRay); // Discard occluded samples. if (transmission == 0.0) return; // Compute the square distance between the light sample and the shading point. const double square_distance = square_norm(incoming); const double rcp_sample_square_distance = 1.0 / square_distance; const double rcp_sample_distance = sqrt(rcp_sample_square_distance); // Don't use this sample if we're closer than the light near start value. if (square_distance < square(edf->get_light_near_start())) return; // Normalize the incoming direction. incoming *= rcp_sample_distance; cos_on *= rcp_sample_distance; // Evaluate the BSDF. 
Spectrum bsdf_value; const double bsdf_prob = m_bsdf.evaluate( m_bsdf_data, false, // not adjoint true, // multiply by |cos(incoming, normal)| m_geometric_normal, m_shading_basis, outgoing.get_value(), incoming, m_light_sampling_modes, bsdf_value); if (bsdf_prob == 0.0) return; // Build a shading point on the light source. ShadingPoint light_shading_point; sample.make_shading_point( light_shading_point, sample.m_shading_normal, m_shading_context.get_intersector()); #ifdef APPLESEED_WITH_OSL if (material_data.m_shader_group) { m_shading_context.execute_osl_emission( *material_data.m_shader_group, light_shading_point); } #endif // Evaluate the EDF inputs. InputEvaluator edf_input_evaluator(m_shading_context.get_texture_cache()); edf->evaluate_inputs(edf_input_evaluator, light_shading_point); // Evaluate the EDF. Spectrum edf_value; edf->evaluate( edf_input_evaluator.data(), sample.m_geometric_normal, Basis3d(sample.m_shading_normal), -incoming, edf_value); const double g = cos_on * rcp_sample_square_distance; double weight = transmission * g / sample.m_probability; // Apply MIS weighting. weight *= mis( mis_heuristic, m_light_sample_count * sample.m_probability, m_bsdf_sample_count * bsdf_prob * g); // Add the contribution of this sample to the illumination. edf_value *= static_cast<float>(weight); edf_value *= bsdf_value; radiance += edf_value; aovs.add(edf->get_render_layer_index(), edf_value); }
bool DirectLightingIntegrator::compute_incoming_radiance( SamplingContext& sampling_context, Vector3d& incoming, double& incoming_prob, Spectrum& radiance) const { if (!m_light_sampler.has_lights_or_emitting_triangles()) return false; sampling_context.split_in_place(3, 1); const Vector3d s = sampling_context.next_vector2<3>(); LightSample sample; m_light_sampler.sample(m_time, s, sample); if (sample.m_triangle) { const Material* material = sample.m_triangle->m_material; const Material::RenderData& material_data = material->get_render_data(); const EDF* edf = material_data.m_edf; // No contribution if we are computing indirect lighting but this light does not cast indirect light. if (m_indirect && !(edf->get_flags() & EDF::CastIndirectLight)) return false; // Compute the incoming direction in world space. incoming = sample.m_point - m_point; // No contribution if the shading point is behind the light. double cos_on_light = dot(-incoming, sample.m_shading_normal); if (cos_on_light <= 0.0) return false; // Compute the transmission factor between the light sample and the shading point. const double transmission = m_shading_context.get_tracer().trace_between( m_shading_point, sample.m_point, VisibilityFlags::ShadowRay); // Discard occluded samples. if (transmission == 0.0) return false; // Don't use this sample if we're closer than the light near start value. const double square_distance = square_norm(incoming); if (square_distance < square(edf->get_light_near_start())) return false; // Normalize the incoming direction. const double rcp_square_distance = 1.0 / square_distance; const double rcp_distance = sqrt(rcp_square_distance); incoming *= rcp_distance; cos_on_light *= rcp_distance; // Build a shading point on the light source. 
ShadingPoint light_shading_point; sample.make_shading_point( light_shading_point, sample.m_shading_normal, m_shading_context.get_intersector()); #ifdef APPLESEED_WITH_OSL if (material_data.m_shader_group) { m_shading_context.execute_osl_emission( *material_data.m_shader_group, light_shading_point); } #endif // Evaluate the EDF inputs. InputEvaluator edf_input_evaluator(m_shading_context.get_texture_cache()); edf->evaluate_inputs(edf_input_evaluator, light_shading_point); // Evaluate the EDF. edf->evaluate( edf_input_evaluator.data(), sample.m_geometric_normal, Basis3d(sample.m_shading_normal), -incoming, radiance); // Compute probability with respect to solid angle of incoming direction. const double g = cos_on_light * rcp_square_distance; incoming_prob = sample.m_probability / g; // Compute and return the incoming radiance. radiance *= static_cast<float>(transmission * g / sample.m_probability); } else { const Light* light = sample.m_light; // No contribution if we are computing indirect lighting but this light does not cast indirect light. if (m_indirect && !(light->get_flags() & Light::CastIndirectLight)) return false; // Evaluate the light. InputEvaluator input_evaluator(m_shading_context.get_texture_cache()); Vector3d emission_position, emission_direction; light->evaluate( input_evaluator, sample.m_light_transform, m_point, emission_position, emission_direction, radiance); // Compute the transmission factor between the light sample and the shading point. const double transmission = m_shading_context.get_tracer().trace_between( m_shading_point, emission_position, VisibilityFlags::ShadowRay); // Discard occluded samples. if (transmission == 0.0) return false; // Compute the incoming direction in world space. incoming = -emission_direction; incoming_prob = BSDF::DiracDelta; // Compute and return the incoming radiance. 
const double attenuation = light->compute_distance_attenuation(m_point, emission_position); radiance *= static_cast<float>(transmission * attenuation / sample.m_probability); } return true; }
int arpack_cg( /* solver params */ const int N, /* (IN) Number of lattice sites for this process*/ solver_params_t solver_params, /* (IN) parameters for solver */ spinor * const x, /* (IN/OUT) initial guess on input, solution on output for this RHS*/ spinor * const b, /* (IN) right-hand side*/ matrix_mult f, /* (IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply in double precision */ matrix_mult f32, /* (IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply in single precision */ const double eps_sq, /* (IN) squared tolerance of convergence of the linear system for systems nrhs1+1 till nrhs*/ const int rel_prec, /* (IN) 0 for using absoute error for convergence 1 for using relative error for convergence*/ const int maxit, /* (IN) Maximum allowed number of iterations to solution for the linear system*/ matrix_mult f_final, /* (IN) final operator application during projection of type 1 */ matrix_mult f_initial /* (IN) initial operator application during projection of type 1 */ ) { /* Static variables and arrays. 
*/ static int ncurRHS=0; /* current number of the system being solved */ static void *_ax,*_r,*_tmps1,*_tmps2; static spinor *ax,*r,*tmps1,*tmps2; static _Complex double *evecs,*evals,*H,*HU,*Hinv,*initwork,*tmpv1; static _Complex double *zheev_work; static double *hevals,*zheev_rwork; static int *IPIV; static int info_arpack=0; static int nconv=0; /* number of converged eigenvectors as returned by arpack */ int i,j,tmpsize; char cV='V',cN='N', cU='U'; int ONE=1; int zheev_lwork,zheev_info; _Complex double c1, c2, c3, tpone=1.0,tzero=0.0; double d1,d2,d3; double et1,et2; /* timing variables */ char evecs_filename[500]; char howmny = 'P'; FILE *evecs_fs=NULL; size_t evecs_count; WRITER *evecs_writer=NULL; spinor *evecs_ptr0 = NULL, *evecs_ptr1 = NULL; paramsPropagatorFormat *evecs_propagatorFormat = NULL; void *evecs_io_buffer = NULL; int parallel; /* for parallel processing of the scalar products */ #ifdef TM_USE_MPI parallel=1; #else parallel=0; #endif /* leading dimension for spinor vectors */ int LDN; if(N==VOLUME) LDN = VOLUMEPLUSRAND; else LDN = VOLUMEPLUSRAND/2; /*(IN) Number of right-hand sides to be solved*/ const int nrhs = solver_params.arpackcg_nrhs; /*(IN) First number of right-hand sides to be solved using tolerance eps_sq1*/ const int nrhs1 = solver_params.arpackcg_nrhs1; /*(IN) squared tolerance of convergence of the linear system for systems 1 till nrhs1*/ const double eps_sq1 = solver_params.arpackcg_eps_sq1; /*(IN) suqared tolerance for restarting cg */ const double res_eps_sq = solver_params.arpackcg_res_eps_sq; /* parameters for arpack */ /*(IN) number of eigenvectors to be computed by arpack*/ const int nev = solver_params.arpackcg_nev; /*(IN) size of the subspace used by arpack with the condition (nev+1) =< ncv*/ const int ncv = solver_params.arpackcg_ncv; /*(IN) tolerance for computing eigenvalues with arpack */ double arpack_eig_tol = solver_params.arpackcg_eig_tol; /*(IN) maximum number of iterations to be used by arpack*/ int 
arpack_eig_maxiter = solver_params.arpackcg_eig_maxiter; /*(IN) 0 for eigenvalues with smallest real part "SR" 1 for eigenvalues with largest real part "LR" 2 for eigenvalues with smallest absolute value "SM" 3 for eigenvalues with largest absolute value "LM" 4 for eigenvalues with smallest imaginary part "SI" 5 for eigenvalues with largest imaginary part "LI"*/ int kind = solver_params.arpackcg_evals_kind; /*(IN) 0 don't compute the eiegnvalues and their residuals of the original system 1 compute the eigenvalues and the residuals for the original system (the orthonormal basis still be used in deflation and they are not overwritten).*/ int comp_evecs = solver_params.arpackcg_comp_evecs; /*(IN) 0 no polynomial acceleration; 1 use polynomial acceleration*/ int acc = solver_params.use_acc; /*(IN) degree of the Chebyshev polynomial (irrelevant if acc=0)*/ int cheb_k = solver_params.cheb_k; /*(IN) lower end of the interval where the acceleration will be used (irrelevant if acc=0)*/ double emin = solver_params.op_evmin; /*(IN) upper end of the interval where the acceleration will be used (irrelevant if acc=0)*/ double emax = solver_params.op_evmax; /*(IN) file name to be used for printing out debugging information from arpack*/ char *arpack_logfile = solver_params.arpack_logfile; /*(IN) read eigenvectors in Schur basis from file */ int arpack_read_ev = solver_params.arpackcg_read_ev; /*(IN) write eigenvectors in Schur basis to file */ int arpack_write_ev = solver_params.arpackcg_write_ev; /*(IN) file name to be used for reading and writing evecs from and to disc */ char *arpack_evecs_filename = solver_params.arpack_evecs_filename; /*(IN) precision used for writing eigenvectors */ int arpack_evecs_writeprec = solver_params.arpack_evecs_writeprec; /* how to project with approximate eigenvectors */ int projection_type = solver_params.projection_type; /* file format for evecs used by arpack */ char *arpack_evecs_fileformat = solver_params.arpack_evecs_fileformat; 
/*------------------------------------------------------------- if this is the first right hand side, allocate memory, call arpack, and compute resiudals of eigenvectors if needed -------------------------------------------------------------*/ if(ncurRHS==0){ #if (defined SSE || defined SSE2 || defined SSE3) _ax = malloc((LDN+ALIGN_BASE)*sizeof(spinor)); if(_ax==NULL) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for _ax inside arpack_cg.\n"); exit(1); } else {ax = (spinor *) ( ((unsigned long int)(_ax)+ALIGN_BASE)&~ALIGN_BASE);} _r = malloc((LDN+ALIGN_BASE)*sizeof(spinor)); if(_r==NULL) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for _r inside arpack_cg.\n"); exit(1); } else {r = (spinor *) ( ((unsigned long int)(_r)+ALIGN_BASE)&~ALIGN_BASE);} _tmps1 = malloc((LDN+ALIGN_BASE)*sizeof(spinor)); if(_tmps1==NULL) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for _tmps1 inside arpack_cg.\n"); exit(1); } else {tmps1 = (spinor *) ( ((unsigned long int)(_tmps1)+ALIGN_BASE)&~ALIGN_BASE);} _tmps2 = malloc((LDN+ALIGN_BASE)*sizeof(spinor)); if(_tmps2==NULL) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for _tmps2 inside arpack_cg.\n"); exit(1); } else {tmps2 = (spinor *) ( ((unsigned long int)(_tmps2)+ALIGN_BASE)&~ALIGN_BASE);} #else ax = (spinor *) malloc(LDN*sizeof(spinor)); r = (spinor *) malloc(LDN*sizeof(spinor)); tmps1 = (spinor *) malloc(LDN*sizeof(spinor)); tmps2 = (spinor *) malloc(LDN*sizeof(spinor)); if( (ax == NULL) || (r==NULL) || (tmps1==NULL) || (tmps2==NULL) ) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for ax,r,tmps1,tmps2 inside arpack_cg.\n"); exit(1); } #endif evecs = (_Complex double *) malloc(ncv*12*N*sizeof(_Complex double)); /* note: no extra buffer */ evals = (_Complex double *) malloc(ncv*sizeof(_Complex double)); tmpv1 = (_Complex double *) malloc(12*N*sizeof(_Complex double)); 
if((evecs == NULL) || (evals==NULL) || (tmpv1==NULL)) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for evecs and evals inside arpack_cg.\n"); exit(1); } if ( arpack_read_ev == 1) { if (strcmp(arpack_evecs_fileformat, "partfile") == 0) { /* set evec filenmae */ sprintf(evecs_filename, "%s.%.5d.pt%.2dpx%.2dpy%.2dpz%.2d", arpack_evecs_filename, nev, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]); evecs_fs = fopen(evecs_filename, "r"); if (evecs_fs == NULL) { fprintf(stderr, "[arpack_cg] (%.4d) Error, could not open file %s for reading\n", g_cart_id, evecs_filename); return(-2); } fprintf(stdout, "# [arpack_cg] reading eigenvectors from file %s\n", evecs_filename); if(arpack_evecs_writeprec == 64) { evecs_io_buffer = (void*)evecs; et1=gettime(); evecs_count = fread( evecs_io_buffer, sizeof(_Complex double), (size_t)nev*12*N, evecs_fs); et2=gettime(); } else { evecs_io_buffer = malloc(sizeof(_Complex double) * (size_t)nev*12*N ); if( evecs_io_buffer == NULL) { fprintf(stderr, "[arpack_cg] (%.4d) Error, could not allocate memory for evecs_io_buffer\n", g_cart_id); return(-42); } et1=gettime(); evecs_count = fread( evecs_io_buffer, sizeof(_Complex double)/2, (size_t)nev*12*N, evecs_fs); et2=gettime(); single2double(evecs, evecs_io_buffer, nev*24*N); free( evecs_io_buffer ); evecs_io_buffer = NULL; } if( evecs_count != ((size_t)nev*12*N) ) { fprintf(stderr, "[arpack_cg] (%.4d) Error, could not proper amount of data from file %s\n", g_cart_id, evecs_filename); return(-3); } fclose(evecs_fs); evecs_fs = NULL; if(g_proc_id == g_stdio_proc) { fprintf(stdout,"# [arpack_cg] ARPACK time for reading %d eigenvectors: %+e seconds\n", nev, et2-et1); } } else if(strcmp(arpack_evecs_fileformat, "single") == 0) { if(N==VOLUME) { for(i=0; i<nev; i++) { sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, i); evecs_ptr0 = (spinor*)&(evecs[i*12*N]); evecs_ptr1 = NULL; read_spinor(evecs_ptr0, evecs_ptr1, evecs_filename, 0); 
} /* end of loop on eigenvectors */ } else if(N==VOLUME/2) { for(i=0; i<nev/2; i++) { sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, 2*i); evecs_ptr0 = (spinor*)&(evecs[(2*i )*12*N]); evecs_ptr1 = (spinor*)&(evecs[(2*i+1)*12*N]); read_spinor(evecs_ptr0, evecs_ptr1, evecs_filename, 0); } /* end of loop on eigenvectors */ } } /* of if arpack_evecs_fileformat */ /* set info_arpack pro forma to SUCCESS */ nconv = nev; info_arpack = 0; } else { et1=gettime(); evals_arpack(N,nev,ncv,kind,howmny,acc,cheb_k,emin,emax,evals,evecs,arpack_eig_tol,arpack_eig_maxiter,f,&info_arpack,&nconv,arpack_logfile); et2=gettime(); if(info_arpack != 0){ /* arpack didn't converge */ if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] WARNING: ARPACK didn't converge. exiting..\n"); return -1; } if(g_proc_id == g_stdio_proc) { fprintf(stdout,"# [arpack_cg] ARPACK has computed %d eigenvectors\n",nconv); fprintf(stdout,"# [arpack_cg] ARPACK time: %+e\n",et2-et1); } if ( arpack_write_ev == 1) { if(strcmp(arpack_evecs_fileformat, "partfile") == 0 ) { if( g_cart_id == 0 ) fprintf(stdout, "# [arpack_cg] writing evecs in partfile format\n"); /* set evec filenmae */ sprintf(evecs_filename, "%s.%.5d.pt%.2dpx%.2dpy%.2dpz%.2d", arpack_evecs_filename, nconv, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]); evecs_fs = fopen(evecs_filename, "w"); if (evecs_fs == NULL) { fprintf(stderr, "[arpack_cg] (%.4d) Error, could not open file %s for writing\n", g_cart_id, evecs_filename); return(-4); } if(arpack_evecs_writeprec == 64) { evecs_io_buffer = (void*)evecs; et1=gettime(); evecs_count = fwrite( evecs_io_buffer, sizeof(_Complex double), (size_t)nconv*12*N, evecs_fs); et2=gettime(); } else { evecs_io_buffer = malloc(sizeof(_Complex double) * (size_t)nconv*12*N ); if( evecs_io_buffer == NULL) { fprintf(stderr, "[arpack_cg] (%.4d) Error, could not allocate memory for evecs_io_buffer\n", g_cart_id); return(-41); } double2single(evecs_io_buffer, evecs, nconv*24*N); 
et1=gettime(); evecs_count = fwrite( evecs_io_buffer, sizeof(_Complex double)/2, (size_t)nconv*12*N, evecs_fs); et2=gettime(); free(evecs_io_buffer); evecs_io_buffer = NULL; } if( evecs_count != ((size_t)nconv*12*N) ) { fprintf(stderr, "[arpack_cg] (%.4d) Error, could not write proper amount of data to file %s\n", g_cart_id, evecs_filename); return(-5); } fclose(evecs_fs); evecs_fs = NULL; if(g_proc_id == g_stdio_proc) { fprintf(stdout,"[arpack_cg] (%.4d) ARPACK time for writing %d eigenvectors: %+e seconds\n", g_cart_id, nconv, et2-et1); } } else if (strcmp(arpack_evecs_fileformat, "single") == 0) { if(N==VOLUME) { for(i=0; i<nconv; i++) { sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, i); construct_writer(&evecs_writer, evecs_filename, 0); evecs_propagatorFormat = construct_paramsPropagatorFormat(arpack_evecs_writeprec, 1); write_propagator_format(evecs_writer, evecs_propagatorFormat); free(evecs_propagatorFormat); evecs_ptr0 = (spinor*)&(evecs[i*12*N]); evecs_ptr1 = NULL; write_spinor(evecs_writer, &evecs_ptr0, &evecs_ptr1, 1, arpack_evecs_writeprec); destruct_writer(evecs_writer); evecs_writer=NULL; } /* end of loop on converged eigenvectors */ } else if(N==VOLUME/2) { for(i=0; i<nconv/2; i++) { sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, 2*i); construct_writer(&evecs_writer, evecs_filename, 0); evecs_propagatorFormat = construct_paramsPropagatorFormat(arpack_evecs_writeprec, 1); write_propagator_format(evecs_writer, evecs_propagatorFormat); free(evecs_propagatorFormat); evecs_ptr0 = (spinor*)&(evecs[(2*i )*12*N]); evecs_ptr1 = (spinor*)&(evecs[(2*i+1)*12*N]); write_spinor(evecs_writer, &evecs_ptr0, &evecs_ptr1,1, arpack_evecs_writeprec); destruct_writer(evecs_writer); evecs_writer=NULL; } /* end of loop on converged eigenvectors */ } /* end of if N == VOLUME */ } /* of if arpack_evecs_fileformat */ } /* end of if arpack_write_ev == 1 */ } /* end of if arpack_read_ev == 1 */ H = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex 
double)); Hinv = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); initwork = (_Complex double *) malloc(nconv*sizeof(_Complex double)); IPIV = (int *) malloc(nconv*sizeof(int)); zheev_lwork = 3*nconv; zheev_work = (_Complex double *) malloc(zheev_lwork*sizeof(_Complex double)); zheev_rwork = (double *) malloc(3*nconv*sizeof(double)); hevals = (double *) malloc(nconv*sizeof(double)); if((H==NULL) || (Hinv==NULL) || (initwork==NULL) || (IPIV==NULL) || (zheev_lwork==NULL) || (zheev_rwork==NULL) || (hevals==NULL)) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for H, Hinv, initwork, IPIV, zheev_lwork, zheev_rwork, hevals inside arpack_cg.\n"); exit(1); } et1=gettime(); /* compute the elements of the hermitian matrix H leading dimension is nconv and active dimension is nconv */ if( projection_type == 0) { for(i=0; i<nconv; i++) { assign_complex_to_spinor(r,&evecs[i*12*N],12*N); f(ax,r); c1 = scalar_prod(r,ax,N,parallel); H[i+nconv*i] = creal(c1); /* diagonal should be real */ for(j=i+1; j<nconv; j++) { assign_complex_to_spinor(r,&evecs[j*12*N],12*N); c1 = scalar_prod(r,ax,N,parallel); H[j+nconv*i] = c1; H[i+nconv*j] = conj(c1); /* enforce hermiticity */ } } } else if ( projection_type == 1 ) { for(i=0; i<nconv; i++) { assign_complex_to_spinor(tmps1, &evecs[i*12*N], 12*N); f_final(r, tmps1); f(ax,r); c1 = scalar_prod(r,ax,N,parallel); c2 = scalar_prod(r,r,N,parallel); H[i+nconv*i] = creal(c1) / creal(c2); /* diagonal should be real */ for(j=i+1; j<nconv; j++) { assign_complex_to_spinor(tmps1, &evecs[j*12*N], 12*N); f_final(r, tmps1); c1 = scalar_prod(r,ax,N,parallel); c3 = scalar_prod(r, r, N, parallel); H[j+nconv*i] = c1 / sqrt( creal(c2) * creal(c3) ); H[i+nconv*j] = conj(c1) / sqrt( creal(c2) * creal(c3) ); /* enforce hermiticity */ } } } et2=gettime(); if(g_proc_id == g_stdio_proc) { fprintf(stdout,"[arpack_cg] time to compute H: %+e\n",et2-et1); } /* if(g_cart_id == 0) { for(i=0; i<nconv; i++) { for(j=0; j<nconv; j++) 
{ fprintf(stdout, "# [arpack_cg] H[%d, %d] = %25.16e %25.16e\n", i, j, creal(H[i*nconv+j]), cimag(H[i*nconv+j])); }} } */ et1=gettime(); /* compute Ritz values and Ritz vectors if needed */ if( (nconv>0) && (comp_evecs !=0)) { HU = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); if( HU==NULL ) { if(g_proc_id == g_stdio_proc) fprintf(stderr,"[arpack_cg] insufficient memory for HU inside arpack_cg\n"); exit(2); } /* copy H into HU */ tmpsize=nconv*nconv; _FT(zcopy)(&tmpsize,H,&ONE,HU,&ONE); /* compute eigenvalues and eigenvectors of HU*/ /* SUBROUTINE ZHEEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK,INFO ) */ _FT(zheev)(&cV,&cU,&nconv,HU,&nconv,hevals,zheev_work,&zheev_lwork,zheev_rwork,&zheev_info,1,1); if(zheev_info != 0) { if(g_proc_id == g_stdio_proc) { fprintf(stderr,"[arpack_cg] Error in ZHEEV:, info = %d\n",zheev_info); fflush(stderr); } exit(1); } /* If you want to replace the schur (orthonormal) basis by eigen basis use something like this. It is better to use the schur basis because they are better conditioned. 
Use this part only to get the eigenvalues and their resduals for the operator (D^\daggerD) esize=(ncv-nconv)*12*N; Zrestart_X(evecs,12*N,HU,12*N,nconv,nconv,&evecs[nconv*N],esize); */ /* compute residuals and print out results */ if(g_proc_id == g_stdio_proc) {fprintf(stdout,"# [arpack_cg] Ritz values of A and their residulas (||A*x-lambda*x||/||x||\n"); fprintf(stdout,"# [arpack_cg] =============================================================\n"); fflush(stdout);} for(i=0; i<nconv; i++) { tmpsize=12*N; _FT(zgemv)(&cN,&tmpsize,&nconv,&tpone,evecs,&tmpsize, &HU[i*nconv],&ONE,&tzero,tmpv1,&ONE,1); assign_complex_to_spinor(r,tmpv1,12*N); d1=square_norm(r,N,parallel); f(ax,r); mul_r(tmps1,hevals[i],r,N); diff(tmps2,ax,tmps1,N); d2= square_norm(tmps2,N,parallel); d3= sqrt(d2/d1); if(g_proc_id == g_stdio_proc) {fprintf(stdout,"Eval[%06d]: %22.15E rnorm: %22.15E\n", i, hevals[i], d3); fflush(stdout);} } free( HU ); HU = NULL; } /* if( (nconv_arpack>0) && (comp_evecs !=0)) */ et2=gettime(); if(g_proc_id == g_stdio_proc) { fprintf(stdout,"[arpack_cg] time to compute eigenvectors: %+e\n",et2-et1); } } /* if(ncurRHS==0) */ double eps_sq_used,restart_eps_sq_used; /* tolerance squared for the linear system */ double cur_res; /* current residual squared */ /*increment the RHS counter*/ ncurRHS = ncurRHS +1; /* set the tolerance to be used for this right-hand side */ if(ncurRHS > nrhs1){ eps_sq_used = eps_sq; } else{ eps_sq_used = eps_sq1; } if(g_proc_id == g_stdio_proc && g_debug_level > 0) { fprintf(stdout, "# [arpack_cg] System %d, eps_sq %e, projection type %d\n",ncurRHS,eps_sq_used, projection_type); fflush(stdout); } /*---------------------------------------------------------------*/ /* Call init-CG until this right-hand side converges */ /*---------------------------------------------------------------*/ double wt1,wt2,wE,wI; double normsq,tol_sq; int flag,maxit_remain,numIts,its; int info_lapack; wE = 0.0; wI = 0.0; /* Start accumulator timers */ flag = -1; /* System has 
not converged yet */ maxit_remain = maxit; /* Initialize Max and current # of iters */ numIts = 0; restart_eps_sq_used=res_eps_sq; while( flag == -1 ) { if(nconv > 0) { /* --------------------------------------------------------- */ /* Perform init-CG with evecs vectors */ /* xinit = xinit + evecs*Hinv*evec'*(b-Ax0) */ /* --------------------------------------------------------- */ wt1 = gettime(); /*r0=b-Ax0*/ f(ax,x); /*ax = A*x */ diff(r,b,ax,N); /* r=b-A*x */ if( projection_type == 0) { /* x = x + evecs*inv(H)*evecs'*r */ for(int i=0; i < nconv; i++) { assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N); initwork[i]= scalar_prod(tmps1,r,N,parallel); } /* solve the linear system H y = c */ tmpsize=nconv*nconv; _FT(zcopy) (&tmpsize,H,&ONE,Hinv,&ONE); /* copy H into Hinv */ /* SUBROUTINE ZGESV( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) */ _FT(zgesv) (&nconv,&ONE,Hinv,&nconv,IPIV,initwork,&nconv,&info_lapack); if(info_lapack != 0) { if(g_proc_id == g_stdio_proc) { fprintf(stderr, "[arpack_cg] Error in ZGESV:, info = %d\n",info_lapack); fflush(stderr); } exit(1); } /* x = x + evecs*inv(H)*evecs'*r */ for(i=0; i<nconv; i++) { assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N); assign_add_mul(x,tmps1,initwork[i],N); } } else if ( projection_type == 1 ) { /* x = x + evecs*inv(H)*evecs'*r */ /* tmps2 = Q^+ r */ f_initial(tmps2, r); for(int i=0; i < nconv; i++) { /* tmps1 = v_i */ assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N); /* initwork_i = v_i^+ Q^+ r / lambda_i^2 */ initwork[i]= scalar_prod(tmps1, tmps2, N, parallel) / ( H[i*nconv+i] * H[i*nconv+i] ); } memset(tmps2, 0, N*sizeof(spinor) ); for(i=0; i<nconv; i++) { assign_complex_to_spinor(tmps1, &evecs[i*12*N], 12*N); assign_add_mul(tmps2, tmps1, initwork[i], N); } /* apply final operator */ f_final(tmps1, tmps2); assign_add_mul(x, tmps1, 1., N); } /* end of if projection type */ /* compute elapsed time and add to accumulator */ wt2 = gettime(); wI = wI + wt2-wt1; }/* if(nconv > 0) */ /* which tolerance to use */ 
if(eps_sq_used > restart_eps_sq_used) { tol_sq = eps_sq_used; flag = 1; /* shouldn't restart again */ } else { tol_sq = restart_eps_sq_used; } wt1 = gettime(); its = cg_her(x,b,maxit_remain,tol_sq,rel_prec,N,f); wt2 = gettime(); wE = wE + wt2-wt1; /* check convergence */ if(its == -1) { /* cg didn't converge */ if(g_proc_id == g_stdio_proc) { fprintf(stderr, "[arpack_cg] CG didn't converge within the maximum number of iterations in arpack_cg. Exiting...\n"); fflush(stderr); exit(1); } } else { numIts += its; maxit_remain = maxit - numIts; /* remaining number of iterations */ restart_eps_sq_used = restart_eps_sq_used*res_eps_sq; /* prepare for the next restart */ } } /* end while (flag ==-1) */ /* ---------- */ /* Reporting */ /* ---------- */ /* compute the exact residual */ f(ax,x); /* ax= A*x */ diff(r,b,ax,N); /* r=b-A*x */ normsq=square_norm(r,N,parallel); if(g_debug_level > 0 && g_proc_id == g_stdio_proc) { fprintf(stdout, "# [arpack_cg] For this rhs:\n"); fprintf(stdout, "# [arpack_cg] Total initCG Wallclock : %+e\n", wI); fprintf(stdout, "# [arpack_cg] Total cg Wallclock : %+e\n", wE); fprintf(stdout, "# [arpack_cg] Iterations: %-d\n", numIts); fprintf(stdout, "# [arpack_cg] Actual Resid of LinSys : %+e\n",normsq); } /* free memory if this was your last system to solve */ if(ncurRHS == nrhs){ #if ( (defined SSE) || (defined SSE2) || (defined SSE3)) free(_ax); free(_r); free(_tmps1); free(_tmps2); #else free(ax); free(r); free(tmps1); free(tmps2); #endif free(evecs); free(evals); free(H); free(Hinv); free(initwork); free(tmpv1); free(zheev_work); free(hevals); free(zheev_rwork); free(IPIV); } return numIts; }
/* Forwards q to square_norm() and hands back the result as a T. */
T quat_square_norm(const Quaternion<T>& q)
{
  T norm_sq = square_norm(q);
  return norm_sq;
}
/*
 * BiCGstab(l) iteration for the linear system f(x) = b.
 *
 * x0        in: start vector, out: updated in place
 * b         right-hand side
 * max_iter  no new sweep is started once the counter k has reached max_iter
 * eps_sq    target for the squared residual norm
 * rel_prec  0: stop when |r|^2 <= eps_sq
 *           1: stop when |r|^2 <= eps_sq * |b|^2
 * _l        number of BiCG steps per sweep; must satisfy 1 <= _l <= 4
 *           because r[], u[] and tau[][] below hold five entries
 * N         number of spinors passed to the linear algebra routines
 * f         operator application, called as f(out, in)
 *
 * The counter k starts at -l and grows by l per sweep, so after n sweeps
 * k = (n-1)*l.
 *
 * Returns k (clamped to >= 0) when the stopping criterion is fulfilled and
 * -1 when it is not, or when _l is outside the supported range. Success is
 * decided by the residual itself and not by comparing k with max_iter:
 * k advances in steps of l and may step over max_iter, and a run that
 * converges exactly when k == max_iter is not a failure.
 */
int bicgstabell(spinor * const x0, spinor * const b, const int max_iter,
                double eps_sq, const int rel_prec, const int _l, const int N, matrix_mult f) {
  double err;
  int i, j, k, l;
  int unconverged;
  double rho0, rho1, beta, alpha, omega, gamma0 = 0., squarenorm;
  spinor * r[5], * u[5], * r0_tilde, * x;
  double tau[5][5], gamma[25], gammap[25], gammapp[25], sigma[25];
  spinor ** solver_field = NULL;
  const int nr_sf = 2*(_l+1)+2;
  /* largest l for which r[l], u[l] and tau[l][l] are still valid entries */
  const int max_l = (int)(sizeof(r)/sizeof(r[0])) - 1;

  /* refuse values of l that would overrun the local arrays (l > max_l)
     or never advance the iteration counter (l < 1) */
  if(_l < 1 || _l > max_l) {
    if(g_proc_id == 0) {
      fprintf(stderr, "bicgstabell: l = %d is outside the supported range [1, %d]\n", _l, max_l);
      fflush(stderr);
    }
    return(-1);
  }

  l = _l;
  k = -l;
  if(N == VOLUME) {
    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
  }
  else {
    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
  }
  r0_tilde = solver_field[0];
  for(i = 0; i <= l; i++){
    r[i] = solver_field[2+2*i];
    u[i] = solver_field[3+2*i];
  }

  x = x0;
  /* r_0 = b - f(x); r0_tilde keeps a copy of this initial residual */
  assign(u[0], b, N);
  f(r0_tilde, x);
  diff(r[0], u[0], r0_tilde, N);
  zero_spinor_field(solver_field[1], N);
  assign(r0_tilde, r[0], N);
  squarenorm = square_norm(b, N, 1);

  rho0 = 1.;
  alpha = 0.;
  omega = 1.;
  err = square_norm(r0_tilde, N, 1);

  for(;;) {
    unconverged = ((err > eps_sq) && (rel_prec == 0))
      || ((err > eps_sq*squarenorm) && (rel_prec == 1));
    if(!unconverged || k >= max_iter) {
      break;
    }
    k+=l;

    /* The BiCG part */
    rho0 *= -omega;
    for(j = 0; j < l; j++) {
      rho1 = scalar_prod_r(r[j], r0_tilde, N, 1);
      beta = (rho1/rho0);
      beta *= alpha;
      rho0 = rho1;
      for(i = 0; i <= j; i++) {
        /* u_i = r_i - \beta u_i */
        assign_mul_add_r(u[i], -beta, r[i], N);
      }
      f(u[j+1], u[j]);
      gamma0 = scalar_prod_r(u[j+1], r0_tilde, N, 1);
      alpha = rho0/gamma0;
      /* r_i = r_i - \alpha u_{i+1} */
      for(i = 0; i <= j; i++) {
        assign_add_mul_r(r[i], u[i+1], -alpha, N);
      }
      f(r[j+1], r[j]);
      /* x = x + \alpha u_0 */
      assign_add_mul_r(x, u[0], alpha, N);
      err = square_norm(r[j+1], N, 1);
      if(g_proc_id == 0 && g_debug_level > 1) {
        printf("%d %d err = %e\n", k, j, err);
        fflush(stdout);
      }
    }

    /* The MR part */
    for(j = 1; j <= l; j++){
      for(i = 1; i < j; i++){
        tau[i][j] = scalar_prod_r(r[j], r[i], N, 1)/sigma[i];
        assign_add_mul_r(r[j], r[i], -tau[i][j], N);
      }
      sigma[j] = scalar_prod_r(r[j], r[j], N, 1);
      gammap[j] = scalar_prod_r(r[0], r[j], N, 1)/sigma[j];
    }
    gamma[l] = gammap[l];
    omega = gamma[l];
    for(j = l-1; j > 0; j--) {
      gamma[j] = gammap[j];
      for(i = j+1; i <= l; i++) {
        gamma[j] -= (tau[j][i]*gamma[i]);
      }
    }
    for(j = 1; j < l; j++) {
      gammapp[j] = gamma[j+1];
      for(i = j+1; i < l; i++){
        gammapp[j] += (tau[j][i]*gamma[i+1]);
      }
    }
    assign_add_mul_r(x, r[0], gamma[1], N);
    assign_add_mul_r(r[0], r[l], -gammap[l], N);
    for(j = 1; j < l; j++){
      assign_add_mul_r(x, r[j], gammapp[j], N);
      assign_add_mul_r(r[0], r[j], -gammap[j], N);
    }
    assign_add_mul_r(u[0], u[l], -gamma[l], N);
    for(j = 1; j < l; j++){
      assign_add_mul_r(u[0], u[j], -gamma[j], N);
    }
    err = square_norm(r[0], N, 1);
    if(g_proc_id == 0 && g_debug_level > 0){
      printf(" BiCGstabell iterated %d %d, %e rho0 = %e, alpha = %e, gamma0= %e\n", l, k, err, rho0, alpha, gamma0);
      fflush( stdout );
    }
  }
  finalize_solver(solver_field, nr_sf);

  if(unconverged) {
    return(-1);
  }
  /* no sweep was needed: do not hand the start value -l back to the
     caller, for l = 1 it would be mistaken for the failure code */
  if(k < 0) {
    k = 0;
  }
  return(k);
}
/*
 * Multi-shift CG.
 *
 * P                   output = solutions, one spinor field per shift
 * Q                   input = source
 * solver_params       read here: sdim, no_shifts, shifts[], max_iter,
 *                     squared_solver_prec, rel_prec and the operator M_psi
 * cgmms_reached_prec  output: squared residual norm of the zeroth system
 *                     when the iteration stopped (the squared norm of Q if
 *                     no iteration was performed)
 *
 * Works on the file-level arrays alphas, betas, sigma, zita, zitam1 and
 * ps_mms_solver, and resets g_sloppy_precision to 0 before returning.
 *
 * Returns the number of iterations performed if the stopping criterion was
 * met and -1 otherwise. A solve that meets the criterion on the very last
 * permitted iteration counts as converged.
 *
 * currently only implemented for P=0
 */
int cg_mms_tm(spinor ** const P, spinor * const Q,
              solver_params_t * solver_params, double * cgmms_reached_prec) {
  /* plain automatic variables: each one is written before it is read on
     every call, so nothing needs to survive between calls */
  double normsq, pro, err, squarenorm;
  double zita_new, alpham1;
  int iteration, N = solver_params->sdim, no_shifts = solver_params->no_shifts;
  int converged = 0;
  spinor ** solver_field = NULL;
  double atime, etime;
  const int nr_sf = 3;

  atime = gettime();
  if(solver_params->sdim == VOLUME) {
    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
    init_mms_tm(no_shifts, VOLUMEPLUSRAND);
  }
  else {
    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
    init_mms_tm(no_shifts, VOLUMEPLUSRAND/2);
  }

  zero_spinor_field(P[0], N);
  alphas[0] = 1.0;
  betas[0] = 0.0;
  sigma[0] = solver_params->shifts[0]*solver_params->shifts[0];
  if(g_proc_id == 0 && g_debug_level > 1) printf("# CGMMS: shift %d is %e\n", 0, sigma[0]);

  /* the remaining shifts are stored relative to the zeroth one */
  for(int im = 1; im < no_shifts; im++) {
    sigma[im] = solver_params->shifts[im]*solver_params->shifts[im] - sigma[0];
    if(g_proc_id == 0 && g_debug_level > 1) printf("# CGMMS: shift %d is %e\n", im, sigma[im]);
    // these will be the result spinor fields
    zero_spinor_field(P[im], N);
    // these are intermediate fields
    assign(ps_mms_solver[im-1], Q, N);
    zitam1[im] = 1.0;
    zita[im] = 1.0;
    alphas[im] = 1.0;
    betas[im] = 0.0;
  }

  /* currently only implemented for P=0 */
  squarenorm = square_norm(Q, N, 1);
  /* with a zero start vector the initial residual is the source itself;
     report that if the loop below never runs */
  *cgmms_reached_prec = squarenorm;
  /* if a starting solution vector equal to zero is chosen */
  /* solver_field[0]: residual r, solver_field[1]: search direction p */
  assign(solver_field[0], Q, N);
  assign(solver_field[1], Q, N);
  normsq = squarenorm;

  /* main loop */
  for(iteration = 0; iteration < solver_params->max_iter; iteration++) {

    /*   Q^2*p and then (p,Q^2*p)  */
    solver_params->M_psi(solver_field[2], solver_field[1]);
    // add the zero's shift
    assign_add_mul_r(solver_field[2], solver_field[1], sigma[0], N);
    pro = scalar_prod_r(solver_field[1], solver_field[2], N, 1);

    /* For the update of the coeff. of the shifted pol. we need alphas[0](i-1) and alpha_cg(i).
       This is the reason why we need this double definition of alpha */
    alpham1 = alphas[0];

    /* Compute alphas[0](i+1) */
    alphas[0] = normsq/pro;
    for(int im = 1; im < no_shifts; im++) {

      /* zita_new is a temp variable that corresponds to zita(i+1) */
      zita_new = zita[im]*alpham1/(alphas[0]*betas[0]*(1.-zita[im]/zitam1[im])
                                   + alpham1*(1.+sigma[im]*alphas[0]));

      // Now zita(i-1) is put equal to the old zita(i)
      zitam1[im] = zita[im];
      // Now zita(i+1) is updated
      zita[im] = zita_new;
      // Update of alphas(i) = alphas[0](i)*zita(i+1)/zita(i)
      alphas[im] = alphas[0]*zita[im]/zitam1[im];

      // Compute xs(i+1) = xs(i) + alphas(i)*ps(i)
      assign_add_mul_r(P[im], ps_mms_solver[im-1], alphas[im], N);
      // in the CG the corrections are decreasing with the iteration number increasing
      // therefore, we can remove shifts when the norm of the correction vector
      // falls below a threshold
      // this is useful for computing time and needed, because otherwise
      // zita might get smaller than DOUBLE_EPS and, hence, zero
      if(iteration > 0 && (iteration % 20 == 0) && (im == no_shifts-1)) {
        double sn = square_norm(ps_mms_solver[im-1], N, 1);
        if(alphas[no_shifts-1]*alphas[no_shifts-1]*sn <= solver_params->squared_solver_prec) {
          no_shifts--;
          if(g_debug_level > 2 && g_proc_id == 0) {
            printf("# CGMMS: at iteration %d removed one shift, %d remaining\n", iteration, no_shifts);
          }
        }
      }
    }

    /*  Compute x_(i+1) = x_i + alphas[0](i+1) p_i    */
    assign_add_mul_r(P[0], solver_field[1], alphas[0], N);
    /*  Compute r_(i+1) = r_i - alphas[0](i+1) Qp_i   */
    assign_add_mul_r(solver_field[0], solver_field[2], -alphas[0], N);

    /* Check whether the precision eps_sq is reached */
    err = square_norm(solver_field[0], N, 1);

    if(g_debug_level > 2 && g_proc_id == g_stdio_proc) {
      printf("# CGMMS iteration: %d residue: %g\n", iteration, err);
      fflush( stdout );
    }

    converged = ((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0))
      || ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0));

    if(converged || (iteration == solver_params->max_iter -1)) {
      /* FIXME temporary output of precision until a better solution can be found */
      *cgmms_reached_prec = err;
      break;
    }

    /* Compute betas[0](i+1) = (r(i+1),r(i+1))/(r(i),r(i))
       Compute p(i+1) = r(i+1) + beta(i+1)*p(i)  */
    betas[0] = err/normsq;
    assign_mul_add_r(solver_field[1], betas[0], solver_field[0], N);
    normsq = err;

    /* Compute betas(i+1) = betas[0](i+1)*(zita(i+1)*alphas(i))/(zita(i)*alphas[0](i))
       Compute ps(i+1) = zita(i+1)*r(i+1) + betas(i+1)*ps(i)  */
    for(int im = 1; im < no_shifts; im++) {
      betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]);
      assign_mul_add_mul_r(ps_mms_solver[im-1], solver_field[0], betas[im], zita[im], N);
    }
  }
  etime = gettime();
  g_sloppy_precision = 0;

  /* the return value depends on whether the criterion was met, not on
     which iteration the loop happened to stop at */
  if(converged) {
    iteration++;
  }
  else {
    iteration = -1;
  }

  if(g_debug_level > 0 && g_proc_id == 0) {
    printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_params->no_shifts, iteration, solver_params->squared_solver_prec, etime - atime);
  }

  finalize_solver(solver_field, nr_sf);
  return(iteration);
}
/*
 * Computes (or reloads) extremal eigenvalues and eigenvectors with the
 * Jacobi-Davidson routine jdher().
 *
 * nr_of_eigenvalues  in: number of eigenvalues requested,
 *                    out: number that converged or were read from file
 * max_iterations     iteration limit handed to jdher()
 * precision          requested precision, clamped from below to 1.e-14
 * maxmin             non-zero: maximal eigenvalues, zero: minimal ones
 * readwrite          0: compute only
 *                    1: read stored eigenvectors as start vectors, compute,
 *                       then write the eigenvectors
 *                    2: read stored eigenvectors and the eigenvalue file,
 *                       do not compute
 *                    the eigenvalue file is written by process 0 whenever
 *                    readwrite != 2
 * nstore             number used in the file names
 * even_odd_flag      non-zero: operator Qtm_pm_psi on VOLUME/2 sites,
 *                    zero: operator Q_pm_psi on VOLUME sites
 *
 * The largest eigenvalue is always computed first and stored in
 * max_eigenvalue. The function also sets the globals no_eigenvalues,
 * evlength, eigenvectors, eigenvls, inv_eigenvls, ev_minev, ev_qnorm and
 * eigenvalues_for_cg_computed.
 *
 * Returns eigenvls[0]; when compiled without HAVE_LAPACK an error message
 * is printed and 0 is returned.
 *
 * NOTE(review): eigenvectors/eigenvls/inv_eigenvls are allocated on the
 * first call only, sized by that call's *nr_of_eigenvalues; a later call
 * asking for more eigenvalues looks like it would overrun them - confirm
 * against the callers.
 */
double eigenvalues(int * nr_of_eigenvalues, const int max_iterations,
                   const double precision, const int maxmin,
                   const int readwrite, const int nstore,
                   const int even_odd_flag) {
  /* initialised so that the build without LAPACK returns a defined value */
  double returnvalue = 0.;
  complex norm2;
#ifdef HAVE_LAPACK
  static spinor * eigenvectors_ = NULL;
  static int allocated = 0;
  char filename[200];
  FILE * ofs;
#ifdef MPI
  double atime, etime;
#endif

  /**********************
   * For Jacobi-Davidson
   **********************/
  int verbosity = g_debug_level, converged = 0, blocksize = 1, blockwise = 0;
  int solver_it_max = 50, j_max, j_min, ii, jj;
  /*int it_max = 10000;*/
  /* complex *eigv_ = NULL, *eigv; */
  double decay_min = 1.7, decay_max = 1.5, prec,
    threshold_min = 1.e-3, threshold_max = 5.e-2;

  /* static int v0dim = 0; */
  int v0dim = 0;
  matrix_mult f;
  int N = (VOLUME)/2, N2 = (VOLUMEPLUSRAND)/2;
  spinor * max_eigenvector_ = NULL, * max_eigenvector;

  /**********************
   * General variables
   **********************/
  int returncode=0;
  int returncode2=0;

  char eigenvector_prefix[512];
  char eigenvalue_prefix[512];

  no_eigenvalues = *nr_of_eigenvalues;

  /* the doubled %% leave printf-style templates that are filled in later */
  sprintf(eigenvector_prefix,"eigenvector.%%s.%%.2d.%%.4d");
  sprintf(eigenvalue_prefix,"eigenvalues.%%s.%%.4d");

  if(!even_odd_flag) {
    N = (VOLUME);
    N2 = (VOLUMEPLUSRAND);
    f = &Q_pm_psi;
  }
  else {
    f = &Qtm_pm_psi;
  }
  evlength = N2;
  if(g_proc_id == g_stdio_proc && g_debug_level >0) {
    printf("Number of %s eigenvalues to compute = %d\n",
           maxmin ? "maximal" : "minimal",(*nr_of_eigenvalues));
    printf("Using Jacobi-Davidson method! \n");
  }

  if((*nr_of_eigenvalues) < 8){
    j_max = 15;
    j_min = 8;
  }
  else{
    j_max = 2*(*nr_of_eigenvalues);
    j_min = (*nr_of_eigenvalues);
  }
  if(precision < 1.e-14){
    prec = 1.e-14;
  }
  else{
    prec = precision;
  }

#if (defined SSE || defined SSE2 || defined SSE3)
  max_eigenvector_ = calloc(N2+1, sizeof(spinor));
  max_eigenvector = (spinor *)(((unsigned long int)(max_eigenvector_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  max_eigenvector_= calloc(N2, sizeof(spinor));
  max_eigenvector = max_eigenvector_;
#endif

  if(allocated == 0) {
    allocated = 1;
#if (defined SSE || defined SSE2 || defined SSE3)
    eigenvectors_ = calloc(N2*(*nr_of_eigenvalues)+1, sizeof(spinor));
    eigenvectors = (spinor *)(((unsigned long int)(eigenvectors_)+ALIGN_BASE)&~ALIGN_BASE);
#else
    eigenvectors_= calloc(N2*(*nr_of_eigenvalues), sizeof(spinor));
    eigenvectors = eigenvectors_;
#endif
    eigenvls = (double*)malloc((*nr_of_eigenvalues)*sizeof(double));
    inv_eigenvls = (double*)malloc((*nr_of_eigenvalues)*sizeof(double));
  }

  solver_it_max = 50;
  /* compute the maximal one first */
  jdher(N*sizeof(spinor)/sizeof(complex), N2*sizeof(spinor)/sizeof(complex),
        50., 1.e-12,
        1, 15, 8, max_iterations, 1, 0, 0, NULL,
        CG, solver_it_max,
        threshold_max, decay_max, verbosity,
        &converged, (complex*) max_eigenvector, (double*) &max_eigenvalue,
        &returncode2, JD_MAXIMAL, 1,
        f);

  /* try to load stored eigenvectors; v0dim counts how many were found */
  if(readwrite) {
    if(even_odd_flag){
      for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) {
        snprintf(filename, sizeof(filename), eigenvector_prefix , maxmin ? "max" : "min", v0dim, nstore);
        if((read_eospinor(&eigenvectors[v0dim*N2], filename)) != 0) {
          break;
        }
      }
    }
    else {
      FILE *testfile;
      spinor *s;
      double sqnorm;
      for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) {
        snprintf(filename, sizeof(filename), eigenvector_prefix, maxmin ? "max" : "min", v0dim, nstore);

        printf("reading eigenvectors ... ");
        /* opened only to find out whether the file exists */
        testfile=fopen(filename,"r");
        if( testfile != NULL){
          fclose(testfile);
          s=(spinor*)&eigenvectors[v0dim*N2];
          read_spinor(s,NULL, filename,0);
          sqnorm=square_norm(s,VOLUME,1);
          printf(" has | |^2 = %e \n",sqnorm);
        }
        else {
          printf(" no more eigenvectors \n");
          break;
        }
      }
    }
  }

  if(readwrite != 2) {
#ifdef MPI
    atime = MPI_Wtime();
#endif
    /* (re-) compute minimal eigenvalues */
    converged = 0;
    solver_it_max = 200;

    if(maxmin)
      jdher(N*sizeof(spinor)/sizeof(complex), N2*sizeof(spinor)/sizeof(complex),
            50., prec,
            (*nr_of_eigenvalues), j_max, j_min,
            max_iterations, blocksize, blockwise, v0dim, (complex*) eigenvectors,
            CG, solver_it_max,
            threshold_max, decay_max, verbosity,
            &converged, (complex*) eigenvectors, eigenvls,
            &returncode, JD_MAXIMAL, 1,
            f);
    else
      jdher(N*sizeof(spinor)/sizeof(complex), N2*sizeof(spinor)/sizeof(complex),
            0., prec,
            (*nr_of_eigenvalues), j_max, j_min,
            max_iterations, blocksize, blockwise, v0dim, (complex*) eigenvectors,
            CG, solver_it_max,
            threshold_min, decay_min, verbosity,
            &converged, (complex*) eigenvectors, eigenvls,
            &returncode, JD_MINIMAL, 1,
            f);

#ifdef MPI
    etime = MPI_Wtime();
    if(g_proc_id == 0) {
      printf("Eigenvalues computed in %e sec. (MPI_Wtime)\n", etime-atime);
    }
#endif
  }
  else {
    snprintf(filename, sizeof(filename), eigenvalue_prefix, maxmin ? "max" : "min", nstore);
    /* count what is actually read; do not keep the value left behind by
       the jdher() call for the maximal eigenvalue */
    converged = 0;
    if((ofs = fopen(filename, "r")) != (FILE*) NULL) {
      double value;
      /* every line is "<index> <eigenvalue>"; the values are stored in file
         order. Testing the fscanf result (instead of feof afterwards) keeps
         the last entry and stops cleanly on malformed input. */
      while(converged < (*nr_of_eigenvalues) && fscanf(ofs, "%*d %lf", &value) == 1) {
        eigenvls[converged] = value;
        converged++;
      }
      /* close only a stream that was successfully opened */
      fclose(ofs);
    }
  }

  (*nr_of_eigenvalues) = converged;
  no_eigenvalues = converged;
  /* without any eigenvalue there is no last element to take */
  if(converged > 0) {
    ev_minev = eigenvls[(*nr_of_eigenvalues)-1];
  }
  eigenvalues_for_cg_computed = converged;

  /* print all scalar products < ii | jj > as an orthonormality check; the
     vectors hold N sites each and are stored with stride N2 */
  for (ii = 0; ii < (*nr_of_eigenvalues); ii++){
    for (jj = 0; jj <= ii; jj++){
      norm2 = scalar_prod(&(eigenvectors[ii*N2]),&(eigenvectors[jj*N2]), N, 1);
      if(g_proc_id == g_stdio_proc){
        printf("< %d | %d> =\t %e  +i * %e \n", ii+1, jj+1, norm2.re, norm2.im);
        fflush(stdout);
      }
    }
  }

  if(readwrite == 1 ) {
    if(even_odd_flag)
      for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) {
        snprintf(filename, sizeof(filename), eigenvector_prefix, maxmin ? "max" : "min", v0dim, nstore);
        if((write_eospinor(&eigenvectors[v0dim*N2], filename, eigenvls[v0dim], prec, nstore)) != 0) {
          break;
        }
      }
    else{
      WRITER *writer=NULL;
      spinor *s;
      double sqnorm;
      paramsPropagatorFormat *propagatorFormat = NULL;

      for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) {
        snprintf(filename, sizeof(filename), eigenvector_prefix, maxmin ? "max" : "min", v0dim, nstore);

        construct_writer(&writer, filename, 0);
        /* todo write propagator format */
        propagatorFormat = construct_paramsPropagatorFormat(64, 1);
        write_propagator_format(writer, propagatorFormat);
        free(propagatorFormat);

        s=(spinor*)&eigenvectors[v0dim*N2];
        write_spinor(writer, &s,NULL, 1, 64);
        destruct_writer(writer);
        writer=NULL;
        sqnorm=square_norm(s,VOLUME,1);
        printf(" wrote eigenvector | |^2 = %e \n",sqnorm);
      }
    }
  }

  if(g_proc_id == 0 && readwrite != 2) {
    snprintf(filename, sizeof(filename), eigenvalue_prefix , maxmin ? "max" : "min", nstore);
    ofs = fopen(filename, "w");
    if(ofs != (FILE*) NULL) {
      for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) {
        fprintf(ofs, "%d %e\n", v0dim, eigenvls[v0dim]);
      }
      fclose(ofs);
    }
    else {
      fprintf(stderr, "eigenvalues: could not open file %s for writing\n", filename);
    }
  }

  for(v0dim = 0; v0dim < converged; v0dim++) {
    inv_eigenvls[v0dim] = 1./eigenvls[v0dim];
  }

  ev_qnorm=1.0/(sqrt(max_eigenvalue)+0.1);
  /* rescale only a value that was set above */
  if(converged > 0) {
    ev_minev*=ev_qnorm*ev_qnorm;
  }
  /* ov_n_cheby is initialized in Dov_psi.c */
  returnvalue=eigenvls[0];
  free(max_eigenvector_);
#else
  fprintf(stderr, "lapack not available, so JD method for EV computation not available \n");
#endif
  return(returnvalue);
}
/*
 * Acceptance step of the rational-correction monomial with index id.
 *
 * energy1 starts as |pf|^2 + |pf2|^2; then terms
 * c_i * (phi * Z^i * phi), i = 1, 2, ..., are added, where Z is applied
 * through apply_Z_ndpsi(). Even powers are obtained as the squared norm of
 * the current vector, odd powers as a scalar product with one more
 * application of Z. The sum is cut off as soon as the square of the last
 * term drops below mnl->accprec, and at the latest after c_7.
 *
 * id   index into monomial_list
 * hf   Hamiltonian field; its gauge field is used for the clover term when
 *      the monomial type is NDCLOVERRATCOR
 *
 * Fills mnl->solver_params, sets g_mu3 to 0 and returns
 * mnl->energy1 - mnl->energy0.
 */
double ndratcor_acc(const int id, hamiltonian_field_t * const hf) {
  monomial * mnl = &monomial_list[id];
  double atime, etime, delta;
  spinor * up0, * dn0, * up1, * dn1, * tup, * tdn;
  /* c_1 ... c_7, continuing the pattern of the binomial series of
     (1+z)^(-1/2). The loop below reaches i = 7 and reads coefs[6], so the
     table must hold seven entries. */
  double coefs[7] = {-1./2., 3./8., -5./16., 35./128., -63./256., 231./1024., -429./2048.};
  const int ncoefs = (int)(sizeof(coefs)/sizeof(coefs[0]));
  atime = gettime();
  nd_set_global_parameter(mnl);
  g_mu3 = 0.;
  if(mnl->type == NDCLOVERRATCOR) {
    sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
    sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
    copy_32_sw_fields();
  }
  mnl->energy1 = square_norm(mnl->pf, VOLUME/2, 1) + square_norm(mnl->pf2, VOLUME/2, 1);

  mnl->solver_params.max_iter = mnl->maxiter;
  mnl->solver_params.squared_solver_prec = mnl->accprec;
  mnl->solver_params.no_shifts = mnl->rat.np;
  mnl->solver_params.shifts = mnl->rat.mu;
  mnl->solver_params.type = mnl->solver;
  mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
  mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32;
  if(mnl->type == NDCLOVERRATCOR) {
    mnl->solver_params.M_ndpsi = &Qsw_pm_ndpsi;
    mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
  }
  mnl->solver_params.sdim = VOLUME/2;
  mnl->solver_params.rel_prec = g_relative_precision_flag;

  // apply (Q R)^(-1) to pseudo-fermion fields
  up0 = mnl->w_fields[0]; dn0 = mnl->w_fields[1];
  up1 = mnl->w_fields[2]; dn1 = mnl->w_fields[3];

  apply_Z_ndpsi(up0, dn0, mnl->pf, mnl->pf2, id, hf, &(mnl->solver_params));
  delta = coefs[0]*(scalar_prod_r(mnl->pf, up0, VOLUME/2, 1) + scalar_prod_r(mnl->pf2, dn0, VOLUME/2, 1));
  mnl->energy1 += delta;
  if(g_debug_level > 2 && g_proc_id == 0)
    printf("# NDRATCOR acc step: c_%d*(phi * Z^%d * phi) = %e\n", 1, 1, delta);

  /* each pass handles an even order i and then the odd order i+1 */
  for(int i = 2; i <= ncoefs; i++) {
    if(delta*delta < mnl->accprec) break;

    /* even order: Z is applied half as often on each side */
    delta = coefs[i-1]*(square_norm(up0, VOLUME/2, 1) + square_norm(dn0, VOLUME/2, 1));
    mnl->energy1 += delta;
    if(g_debug_level > 2 && g_proc_id == 0)
      printf("# NDRATCOR acc step: c_%d*(phi * Z^%d * phi) = %e\n", i, i, delta);
    i++; //incrementing i
    if(delta*delta < mnl->accprec) break;
    /* never index past the end of the coefficient table */
    if(i > ncoefs) break;

    /* odd order: one further application of Z */
    apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &(mnl->solver_params));
    delta = coefs[i-1]*(scalar_prod_r(up0, up1, VOLUME/2, 1) + scalar_prod_r(dn0, dn1, VOLUME/2, 1));
    mnl->energy1 += delta;
    if(g_debug_level > 2 && g_proc_id == 0)
      printf("# NDRATCOR acc step: c_%d*(phi * Z^%d * phi) = %e\n", i, i, delta);

    /* swap the two field pairs so that up0/dn0 hold the latest vector */
    tup = up0; tdn = dn0;
    up0 = up1; dn0 = dn1;
    up1 = tup; dn1 = tdn;
  }
  etime = gettime();
  if(g_proc_id == 0) {
    if(g_debug_level > 1) {
      printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
    }
    if(g_debug_level > 3) { // should be 3
      printf("called ndratcor_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0);
    }
  }
  return(mnl->energy1 - mnl->energy0);
}
/*
 * k output , l input
 *
 * Runs at most ITER_MAX_BCG BiCGstab iterations with Mtm_plus_psi, using
 * gamma5 applied to l as right-hand side and six work fields starting at
 * g_spinor_field[DUM_SOLVER]. If ITER_MAX_BCG is not positive, or the
 * iteration limit is reached, solve_cg(k, l, ...) is called instead and
 * Qtm_minus_psi is applied to its result.
 *
 * eps_sq    target for the squared residual
 * rel_prec  0: compare the squared residual with eps_sq,
 *           1: compare it with eps_sq times the squared norm of l
 *
 * Returns the BiCGstab iteration count, or whatever solve_cg() returned
 * when the fallback was taken.
 */
int bicg(spinor * const k, spinor * const l, double eps_sq, const int rel_prec) {
  double res_sq, t_sq, src_sq = 0.;
  complex rho_prev, rho_next, omega, alpha, beta, nom, denom;
  int iteration, N = VOLUME/2;
  spinor * res, * dir, * Adir, * shadow, * s, * t;

  if(ITER_MAX_BCG <= 0) {
    /* BiCGstab switched off: prepare k and go straight to the fallback */
    iteration = ITER_MAX_BCG;
    gamma5(k, l, VOLUME/2);
  }
  else {
    shadow = g_spinor_field[DUM_SOLVER];
    res    = g_spinor_field[DUM_SOLVER+1];
    Adir   = g_spinor_field[DUM_SOLVER+2];
    dir    = g_spinor_field[DUM_SOLVER+3];
    s      = g_spinor_field[DUM_SOLVER+4];
    t      = g_spinor_field[DUM_SOLVER+5];

    src_sq = square_norm(l, VOLUME/2, 1);

    /* initial residual gamma5*l - M*k, copied to res, dir and shadow */
    Mtm_plus_psi(res, k);
    gamma5(shadow, l, VOLUME/2);
    diff(dir, shadow, res, N);
    assign(res, dir, N);
    assign(shadow, dir, N);
    rho_prev = scalar_prod(shadow, res, N, 1);

    iteration = 0;
    while(iteration < ITER_MAX_BCG) {
      res_sq = square_norm(res, N, 1);
      if(g_proc_id == g_stdio_proc && g_debug_level > 1) {
        printf("BiCGstab: iterations: %d res^2 %e\n", iteration, res_sq);
        fflush(stdout);
      }

      /* stopping criterion; any other rel_prec value never stops here */
      if(rel_prec == 0) {
        if(res_sq <= eps_sq) break;
      }
      else if(rel_prec == 1) {
        if(res_sq <= eps_sq*src_sq) break;
      }

      Mtm_plus_psi(Adir, dir);
      denom = scalar_prod(shadow, Adir, N, 1);
      _div_complex(alpha, rho_prev, denom);

      assign(s, res, N);
      assign_diff_mul(s, Adir, alpha, N);

      Mtm_plus_psi(t, s);
      omega = scalar_prod(t, s, N, 1);
      t_sq = square_norm(t, N, 1);
      omega.re = omega.re/t_sq;
      omega.im = omega.im/t_sq;

      assign_add_mul_add_mul(k, dir, s, alpha, omega, N);

      assign(res, s, N);
      assign_diff_mul(res, t, omega, N);

      rho_next = scalar_prod(shadow, res, N, 1);
      _mult_assign_complex(nom, alpha, rho_next);
      _mult_assign_complex(denom, omega, rho_prev);
      _div_complex(beta, nom, denom);

      /* the direction update takes -omega */
      omega.re = -omega.re;
      omega.im = -omega.im;
      assign_mul_bra_add_mul_ket_add(dir, Adir, res, omega, beta, N);

      rho_prev.re = rho_next.re;
      rho_prev.im = rho_next.im;

      iteration++;
    }

    if(g_proc_id==0 && g_debug_level > 0) {
      printf("BiCGstab: iterations: %d eps_sq: %1.4e\n", iteration, eps_sq);
    }
  }

  /* if bicg fails, redo with conjugate gradient */
  if(iteration >= ITER_MAX_BCG) {
    iteration = solve_cg(k, l, eps_sq, rel_prec);
    /* Save the solution for reuse! not needed since Chronological inverter is there */
    /* assign(g_spinor_field[DUM_DERI+6], k, VOLUME/2); */
    Qtm_minus_psi(k, k);
  }
  return iteration;
}
// computes ||(1 - C^dagger R C) phi|| void check_C_ndpsi(spinor * const k_up, spinor * const k_dn, spinor * const l_up, spinor * const l_dn, const int id, hamiltonian_field_t * const hf, solver_params_t * solver_params) { monomial * mnl = &monomial_list[id]; mnl->iter0 = solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, l_up, l_dn, solver_params); assign(k_up, l_up, VOLUME/2); assign(k_dn, l_dn, VOLUME/2); // apply C to the random field to generate pseudo-fermion fields for(int j = (mnl->rat.np-1); j > -1; j--) { // Q_h * tau^1 - i nu_j // this needs phmc_Cpol = 1 to work! if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) { Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], I*mnl->rat.nu[j], 1., mnl->EVMaxInv); } else { Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], I*mnl->rat.nu[j], 1., mnl->EVMaxInv); } assign_add_mul(k_up, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); assign_add_mul(k_dn, g_chi_dn_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); } //apply R solver_params->shifts = mnl->rat.mu; solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, k_up, k_dn, solver_params); for(int j = (mnl->rat.np-1); j > -1; j--) { assign_add_mul_r(k_up, g_chi_up_spinor_field[j], mnl->rat.rmu[j], VOLUME/2); assign_add_mul_r(k_dn, g_chi_dn_spinor_field[j], mnl->rat.rmu[j], VOLUME/2); } // apply C^dagger solver_params->shifts = mnl->rat.nu; solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, k_up, k_dn, solver_params); for(int j = (mnl->rat.np-1); j > -1; j--) { // Q_h * tau^1 + i nu_j if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) { Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], -I*mnl->rat.nu[j], 1., 
mnl->EVMaxInv); } else { Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], -I*mnl->rat.nu[j], 1., mnl->EVMaxInv); } assign_add_mul(k_up, g_chi_up_spinor_field[mnl->rat.np], -I*mnl->rat.rnu[j], VOLUME/2); assign_add_mul(k_dn, g_chi_dn_spinor_field[mnl->rat.np], -I*mnl->rat.rnu[j], VOLUME/2); } diff(k_up, k_up, l_up, VOLUME/2); diff(k_dn, k_dn, l_dn, VOLUME/2); double resi = square_norm(k_up, VOLUME/2, 1); resi += square_norm(k_dn, VOLUME/2, 1); if(g_proc_id == 0) printf("|| (1-C^dagger R C)*phi|| = %e\n", resi); return; }
/*
 * Stopping test used for every residual check in solve_cg():
 * absolute (res_sq <= eps_sq) when rel_prec == 0, relative to ref_sq when
 * rel_prec == 1, and never satisfied for any other rel_prec.
 */
static int solve_cg_converged(const double res_sq, const double eps_sq,
                              const double ref_sq, const int rel_prec) {
  if(rel_prec == 0) {
    return (res_sq <= eps_sq);
  }
  if(rel_prec == 1) {
    return (res_sq <= eps_sq*ref_sq);
  }
  return 0;
}

/*
 * Conjugate-gradient solver for the operator applied by Qtm_pm_psi().
 *
 * k output , l input
 *
 * k        start vector on entry, updated in place
 * l        source field
 * eps_sq   threshold for the squared residual
 * rel_prec 0: absolute threshold, 1: threshold scaled by ||l||^2
 *
 * With g_sloppy_precision_flag == 1 the routine runs up to 20 restarts of
 * an inner CG executed with g_sloppy_precision = 1, recomputing the true
 * residual with g_sloppy_precision = 0 after each restart. Otherwise a
 * single plain CG loop of at most ITER_MAX_CG steps is run.
 *
 * Returns the accumulated iteration counter, or 0 if the start vector
 * already satisfies the stopping test in the sloppy branch.
 * g_sloppy_precision is restored to its entry value before the normal
 * return. Work fields g_spinor_field[DUM_SOLVER .. DUM_SOLVER+5] are
 * overwritten.
 */
int solve_cg(spinor * const k, spinor * const l, double eps_sq, const int rel_prec) {
  static double normsq, pro, err, alpha_cg, beta_cg, squarenorm, sqnrm, sqnrm2;
  int iteration = 0;
  int restart, inner;
  const int save_sloppy = g_sloppy_precision;
  double atime, etime, flops;
  /* w receives the operator applied to p and is then updated in place to
     the new residual; r_old keeps the residual of the previous step */
  spinor * const w     = g_spinor_field[DUM_SOLVER];
  spinor * const r_old = g_spinor_field[DUM_SOLVER+1];
  spinor * const p     = g_spinor_field[DUM_SOLVER+2];

#ifdef MPI
  atime = MPI_Wtime();
#else
  atime = ((double)clock())/((double)(CLOCKS_PER_SEC));
#endif

  squarenorm = square_norm(l, VOLUME/2, 1);

  if(g_sloppy_precision_flag == 1) {
    spinor * const delta = g_spinor_field[DUM_SOLVER+3];
    spinor * const x     = g_spinor_field[DUM_SOLVER+4];
    spinor * const y     = g_spinor_field[DUM_SOLVER+5];

    /* true residual of the start vector; the assign is overwritten by
       the diff two lines below */
    assign(delta, l, VOLUME/2);
    Qtm_pm_psi(y, k);
    diff(delta, l, y, VOLUME/2);
    sqnrm = square_norm(delta, VOLUME/2, 1);
    if(solve_cg_converged(sqnrm, eps_sq, squarenorm, rel_prec)) {
      return(0);
    }

    for(restart = 0; restart < 20; restart++) {
      g_sloppy_precision = 1;

      /* main CG loop in lower precision, solving for the correction x */
      zero_spinor_field(x, VOLUME/2);
      assign(r_old, delta, VOLUME/2);
      assign(p, delta, VOLUME/2);
      sqnrm2 = sqnrm;
      for(inner = 0; inner <= ITER_MAX_CG; inner++) {
        Qtm_pm_psi(w, p);
        pro = scalar_prod_r(p, w, VOLUME/2, 1);
        alpha_cg = sqnrm2 / pro;
        assign_add_mul_r(x, p, alpha_cg, VOLUME/2);
        assign_mul_add_r(w, -alpha_cg, r_old, VOLUME/2);
        err = square_norm(w, VOLUME/2, 1);

        if(g_proc_id == g_stdio_proc && g_debug_level > 1) {
          printf("inner CG: %d res^2 %g\n", iteration+inner+1, err);
          fflush(stdout);
        }
        if(solve_cg_converged(err, eps_sq, squarenorm, rel_prec)) {
          break;
        }

        beta_cg = err / sqnrm2;
        assign_mul_add_r(p, beta_cg, w, VOLUME/2);
        assign(r_old, w, VOLUME/2);
        sqnrm2 = err;
      }
      /* end main CG loop */
      iteration += inner;

      /* back to full precision: add the correction and recompute the
         true residual */
      g_sloppy_precision = 0;
      add(k, k, x, VOLUME/2);
      Qtm_pm_psi(y, x);
      diff(delta, delta, y, VOLUME/2);
      sqnrm = square_norm(delta, VOLUME/2, 1);

      if(g_debug_level > 0 && g_proc_id == g_stdio_proc) {
        printf("mixed CG(linsolve): true residue %d\t%g\t\n",iteration, sqnrm);
        fflush( stdout);
      }
      if(solve_cg_converged(sqnrm, eps_sq, squarenorm, rel_prec)) {
        break;
      }
      iteration++;
    }
  }
  else {
    /* initialize residue r and search vector p */
    Qtm_pm_psi(w, k);
    diff(r_old, l, w, VOLUME/2);
    assign(p, r_old, VOLUME/2);
    normsq = square_norm(r_old, VOLUME/2, 1);

    /* main loop */
    for(iteration = 1; iteration <= ITER_MAX_CG; iteration++) {
      Qtm_pm_psi(w, p);
      pro = scalar_prod_r(p, w, VOLUME/2, 1);
      alpha_cg = normsq/pro;
      assign_add_mul_r(k, p, alpha_cg, VOLUME/2);
      assign_mul_add_r(w, -alpha_cg, r_old, VOLUME/2);
      err = square_norm(w, VOLUME/2, 1);

      if(g_proc_id == g_stdio_proc && g_debug_level > 1) {
        printf("CG (linsolve): iterations: %d res^2 %e\n", iteration, err);
        fflush(stdout);
      }
      if(solve_cg_converged(err, eps_sq, squarenorm, rel_prec)) {
        break;
      }

      beta_cg = err/normsq;
      assign_mul_add_r(p, beta_cg, w, VOLUME/2);
      assign(r_old, w, VOLUME/2);
      normsq = err;
    }
  }

#ifdef MPI
  etime = MPI_Wtime();
#else
  etime = ((double)clock())/((double)(CLOCKS_PER_SEC));
#endif

  /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
  /* 2*1320.0 because the linalg is over VOLUME/2 */
  flops = (2*(2*1320.0+2*3*4) + 2*3*4 + iteration*(2.*(2*1320.0+2*3*4) + 10*3*4))*VOLUME/2/1.0e6f;
  if(g_proc_id==0 && g_debug_level > 0) {
    printf("CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iteration, eps_sq, etime-atime);
    printf("CG: flopcount: t/s: %1.4e mflops_local: %.1f mflops: %.1f\n",
           etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime));
  }

  g_sloppy_precision = save_sloppy;
  return(iteration);
}
/*
 * Heatbath step of the ND rational-correction monomial.
 *
 * id  index of the monomial in monomial_list
 * hf  Hamiltonian field; hf->gaugefield and hf->traj_counter are read
 *
 * Draws two Gaussian random fields into mnl->pf / mnl->pf2, stores their
 * summed square norm in mnl->energy0, fills mnl->solver_params and then
 * adds to pf / pf2 the terms coefs[i-1] * Z^i * R of a truncated series,
 * where one application of Z is apply_Z_ndpsi(). The series stops once the
 * squared estimate of the next energy correction drops below
 * mnl->accprec, or after all tabulated orders have been used.
 *
 * Fixes relative to the previous version:
 *  - the order loop ran up to i = 7 and therefore read coefs[6] and
 *    coefs_check[6], one element past the end of the 6-element tables;
 *  - the down-flavour part of the stopping estimate paired Zup with dn0,
 *    while Zdn was stored and never read; it now pairs Zdn with dn0;
 *  - the debug output inside the loop printed order 1 on every pass.
 */
void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
  monomial * mnl = &monomial_list[id];
  double atime, etime, delta;
  spinor * up0, * dn0, * up1, * dn1, * tup, * tdn, * Zup, * Zdn;
  // series of (1+x)^(1/4)
  const double coefs[6] = {1./4., -3./32., 7./128., -77./2048., 231./8192., -1463./65536.};
  // series of (1+x)^(1/2)
  const double coefs_check[6] = {1./2., -1./8., 1./16., -5./128., 7./256., -21./1024.};
  // highest order available in the two tables above
  const int max_order = (int)(sizeof(coefs)/sizeof(coefs[0]));

  atime = gettime();
  nd_set_global_parameter(mnl);
  g_mu3 = 0.;
  mnl->iter0 = 0;
  if(mnl->type == NDCLOVERRATCOR) {
    init_sw_fields();
    sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw);
    sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
    copy_32_sw_fields();
  }

  // we measure before the trajectory!
  // NOTE(review): this tests against NDCLOVERRAT while the rest of the
  // function tests against NDCLOVERRATCOR -- confirm which is intended.
  if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
    if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
    else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
  }

  // the Gaussian distributed random fields
  mnl->energy0 = 0.;
  random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS);
  mnl->energy0 = square_norm(mnl->pf, VOLUME/2, 1);

  random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS);
  mnl->energy0 += square_norm(mnl->pf2, VOLUME/2, 1);

  mnl->solver_params.max_iter = mnl->maxiter;
  mnl->solver_params.squared_solver_prec = mnl->accprec;
  mnl->solver_params.no_shifts = mnl->rat.np;
  mnl->solver_params.shifts = mnl->rat.mu;
  mnl->solver_params.type = mnl->solver;
  mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
  mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32;
  if(mnl->type == NDCLOVERRATCOR) {
    mnl->solver_params.M_ndpsi = &Qsw_pm_ndpsi;
    mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
  }
  mnl->solver_params.sdim = VOLUME/2;
  mnl->solver_params.rel_prec = g_relative_precision_flag;

  // apply B to the random field to generate pseudo-fermion fields
  up0 = mnl->w_fields[0];
  dn0 = mnl->w_fields[1];
  up1 = mnl->w_fields[2];
  dn1 = mnl->w_fields[3];
  Zup = mnl->w_fields[4];
  Zdn = mnl->w_fields[5];

  // (up0, dn0) = Z * R
  apply_Z_ndpsi(up0, dn0, mnl->pf, mnl->pf2, id, hf, &(mnl->solver_params));

  // computing correction to energy1
  delta = coefs_check[0]*(scalar_prod_r(mnl->pf, up0, VOLUME/2, 1) + scalar_prod_r(mnl->pf2, dn0, VOLUME/2, 1));
  if(g_debug_level > 2 && g_proc_id == 0)
    printf("# NDRATCOR heatbath: c_%d*(R * Z^%d * R) = %e\n", 1, 1, delta);
  // debug for showing that the old check was giving a smaller delta
  if(g_debug_level > 3) {
    double delta_old = square_norm(up0, VOLUME/2, 1) + square_norm(dn0, VOLUME/2, 1);
    if(g_proc_id == 0) {
      printf("# NDRATCOR old check: || Z^%d * R ||^2 = %e\n", 1, delta_old);
      printf("# NDRATCOR new check: (c_%d*(R * Z^%d * R))^2 = %e\n", 1, 1, delta*delta);
    }
  }

  if(delta*delta > mnl->accprec) {
    assign_add_mul_r(mnl->pf, up0, coefs[0], VOLUME/2);
    assign_add_mul_r(mnl->pf2, dn0, coefs[0], VOLUME/2);

    // saving first application: (Zup, Zdn) = Z * R
    assign(Zup, up0, VOLUME/2);
    assign(Zdn, dn0, VOLUME/2);

    // On entry to pass i, (up0, dn0) holds Z^(i-1) * R.
    for(int i = 2; i <= max_order; i++) {
      // computing next order correction to energy1:
      // (Z R) . (Z^(i-1) R), flavour by flavour
      delta = coefs_check[i-1]*(scalar_prod_r(Zup, up0, VOLUME/2, 1) + scalar_prod_r(Zdn, dn0, VOLUME/2, 1));
      if(g_debug_level > 2 && g_proc_id == 0)
        printf("# NDRATCOR heatbath: c_%d*(R * Z^%d * R) = %e\n", i, i, delta);
      // debug for showing that the old check was giving a smaller delta
      if(g_debug_level > 3) {
        double delta_old = square_norm(up0, VOLUME/2, 1) + square_norm(dn0, VOLUME/2, 1);
        if(g_proc_id == 0) {
          printf("# NDRATCOR old check: || Z^%d * R ||^2 = %e\n", i-1, delta_old);
          printf("# NDRATCOR new check: (c_%d*(R * Z^%d * R))^2 = %e\n", i, i, delta*delta);
        }
      }
      if(delta*delta < mnl->accprec) break;

      // (up1, dn1) = Z^i * R, added with the i-th series coefficient
      apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &(mnl->solver_params));

      assign_add_mul_r(mnl->pf, up1, coefs[i-1], VOLUME/2);
      assign_add_mul_r(mnl->pf2, dn1, coefs[i-1], VOLUME/2);

      // swap the two field pairs so that (up0, dn0) is the newest power
      tup = up0; tdn = dn0;
      up0 = up1; dn0 = dn1;
      up1 = tup; dn1 = tdn;
    }
  }

  etime = gettime();
  if(g_proc_id == 0) {
    if(g_debug_level > 1) {
      printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime);
    }
    if(g_debug_level > 3) {
      printf("called ndratcor_heatbath for id %d energy %f\n", id, mnl->energy0);
    }
  }
  return;
}
void gtrafo_eo_nd(spinor * const Even_s, spinor * const Odd_s, spinor * const Even_c, spinor * const Odd_c, spinor * const Even_new_s, spinor * const Odd_new_s, spinor * const Even_new_c, spinor * const Odd_new_c, GTRAFO_TYPE type){ /* initialize temporal gauge here */ int retval; double dret1, dret2; static double plaquette1 = 0.0; static double plaquette2 = 0.0; if(type==GTRAFO_APPLY){ /* need VOLUME here (not N=VOLUME/2)*/ if ((retval = init_temporalgauge_trafo(VOLUME, g_gauge_field)) != 0 ) { // initializes the transformation matrices if (g_proc_id == 0) printf("Error while gauge fixing to temporal gauge. Aborting...\n"); // g_tempgauge_field as a copy of g_gauge_field exit(200); } /* do trafo */ plaquette1 = measure_plaquette(g_gauge_field); apply_gtrafo(g_gauge_field, g_trafo); // transformation of the gauge field plaquette2 = measure_plaquette(g_gauge_field); if (g_proc_id == 0) printf("\tPlaquette before gauge fixing: %.16e\n", plaquette1/6./VOLUME); if (g_proc_id == 0) printf("\tPlaquette after gauge fixing: %.16e\n", plaquette2/6./VOLUME); /* do trafo to odd_s part of source */ dret1 = square_norm(Odd_s, VOLUME/2 , 1); apply_gtrafo_spinor_odd(Odd_s, g_trafo); // odd spinor transformation, strange dret2 = square_norm(Odd_s, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); /* do trafo to odd_c part of source */ dret1 = square_norm(Odd_c, VOLUME/2 , 1); apply_gtrafo_spinor_odd(Odd_c, g_trafo); // odd spinor transformation, charm dret2 = square_norm(Odd_c, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); /* do trafo to even_s part of source */ dret1 = square_norm(Even_s, VOLUME/2 , 1); apply_gtrafo_spinor_even(Even_s, g_trafo); // even spinor transformation, strange dret2 = square_norm(Even_s, VOLUME/2, 1); if 
(g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); /* do trafo to even_c part of source */ dret1 = square_norm(Even_c, VOLUME/2 , 1); apply_gtrafo_spinor_even(Even_c, g_trafo); // even spinor transformation, charm dret2 = square_norm(Even_c, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); } else { /* undo trafo */ /* apply_inv_gtrafo(g_gauge_field, g_trafo);*/ /* copy back the saved original field located in g_tempgauge_field -> update necessary*/ plaquette1 = measure_plaquette(g_gauge_field); copy_gauge_field(g_gauge_field, g_tempgauge_field); g_update_gauge_copy = 1; plaquette2 = measure_plaquette(g_gauge_field); if (g_proc_id == 0) printf("\tPlaquette before inverse gauge fixing: %.16e\n", plaquette1/6./VOLUME); if (g_proc_id == 0) printf("\tPlaquette after inverse gauge fixing: %.16e\n", plaquette2/6./VOLUME); /* undo trafo to source Even_s */ dret1 = square_norm(Even_s, VOLUME/2 , 1); apply_inv_gtrafo_spinor_even(Even_s, g_trafo); dret2 = square_norm(Even_s, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); /* undo trafo to source Even_c */ dret1 = square_norm(Even_c, VOLUME/2 , 1); apply_inv_gtrafo_spinor_even(Even_c, g_trafo); dret2 = square_norm(Even_c, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); /* undo trafo to source Odd_s */ dret1 = square_norm(Odd_s, VOLUME/2 , 1); apply_inv_gtrafo_spinor_odd(Odd_s, g_trafo); dret2 = square_norm(Odd_s, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm 
after gauge fixing: %.16e\n", dret2); /* undo trafo to source Odd_c */ dret1 = square_norm(Odd_c, VOLUME/2 , 1); apply_inv_gtrafo_spinor_odd(Odd_c, g_trafo); dret2 = square_norm(Odd_c, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); // Even_new_s dret1 = square_norm(Even_new_s, VOLUME/2 , 1); apply_inv_gtrafo_spinor_even(Even_new_s, g_trafo); dret2 = square_norm(Even_new_s, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); // Even_new_c dret1 = square_norm(Even_new_c, VOLUME/2 , 1); apply_inv_gtrafo_spinor_even(Even_new_c, g_trafo); dret2 = square_norm(Even_new_c, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); // Odd_new_s dret1 = square_norm(Odd_new_s, VOLUME/2 , 1); apply_inv_gtrafo_spinor_odd(Odd_new_s, g_trafo); dret2 = square_norm(Odd_new_s, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); // Odd_new_c dret1 = square_norm(Odd_new_c, VOLUME/2 , 1); apply_inv_gtrafo_spinor_odd(Odd_new_c, g_trafo); dret2 = square_norm(Odd_new_c, VOLUME/2, 1); if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); finalize_temporalgauge(); } # ifdef TM_USE_MPI xchange_gauge(g_gauge_field); # endif }
int main(int argc,char *argv[]) { FILE *parameterfile = NULL; char datafilename[206]; char parameterfilename[206]; char conf_filename[50]; char scalar_filename[50]; char * input_filename = NULL; char * filename = NULL; double plaquette_energy; #ifdef _USE_HALFSPINOR #undef _USE_HALFSPINOR printf("# WARNING: USE_HALFSPINOR will be ignored (not supported here).\n"); #endif if(even_odd_flag) { even_odd_flag=0; printf("# WARNING: even_odd_flag will be ignored (not supported here).\n"); } int j,j_max,k,k_max = 2; _Complex double * drvsc; #ifdef HAVE_LIBLEMON paramsXlfInfo *xlfInfo; #endif int status = 0; static double t1,t2,dt,sdt,dts,qdt,sqdt; double antioptaway=0.0; #ifdef MPI static double dt2; DUM_DERI = 6; DUM_SOLVER = DUM_DERI+2; DUM_MATRIX = DUM_SOLVER+6; NO_OF_SPINORFIELDS = DUM_MATRIX+2; #ifdef OMP int mpi_thread_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided); #else MPI_Init(&argc, &argv); #endif MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); #else g_proc_id = 0; #endif g_rgi_C1 = 1.; process_args(argc,argv,&input_filename,&filename); set_default_filenames(&input_filename, &filename); /* Read the input file */ if( (j = read_input(input_filename)) != 0) { fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename); exit(-1); } if(g_proc_id==0) { printf("parameter rho_BSM set to %f\n", rho_BSM); printf("parameter eta_BSM set to %f\n", eta_BSM); printf("parameter m0_BSM set to %f\n", m0_BSM); } #ifdef OMP init_openmp(); #endif tmlqcd_mpi_init(argc, argv); if(g_proc_id==0) { #ifdef SSE printf("# The code was compiled with SSE instructions\n"); #endif #ifdef SSE2 printf("# The code was compiled with SSE2 instructions\n"); #endif #ifdef SSE3 printf("# The code was compiled with SSE3 instructions\n"); #endif #ifdef P4 printf("# The code was compiled for Pentium4\n"); #endif #ifdef OPTERON printf("# The code was compiled for AMD Opteron\n"); #endif #ifdef _GAUGE_COPY printf("# The code was compiled with 
-D_GAUGE_COPY\n"); #endif #ifdef BGL printf("# The code was compiled for Blue Gene/L\n"); #endif #ifdef BGP printf("# The code was compiled for Blue Gene/P\n"); #endif #ifdef _USE_HALFSPINOR printf("# The code was compiled with -D_USE_HALFSPINOR\n"); #endif #ifdef _USE_SHMEM printf("# The code was compiled with -D_USE_SHMEM\n"); #ifdef _PERSISTENT printf("# The code was compiled for persistent MPI calls (halfspinor only)\n"); #endif #endif #ifdef MPI #ifdef _NON_BLOCKING printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n"); #endif #endif printf("\n"); fflush(stdout); } #ifdef _GAUGE_COPY init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); #else init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); #endif init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); j = init_bispinor_field(VOLUMEPLUSRAND, 12); if ( j!= 0) { fprintf(stderr, "Not enough memory for bispinor fields! Aborting...\n"); exit(0); } j = init_spinor_field(VOLUMEPLUSRAND, 12); if ( j!= 0) { fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); exit(0); } int numbScalarFields = 4; j = init_scalar_field(VOLUMEPLUSRAND, numbScalarFields); if ( j!= 0) { fprintf(stderr, "Not enough memory for scalar fields! Aborting...\n"); exit(0); } drvsc = malloc(18*VOLUMEPLUSRAND*sizeof(_Complex double)); if(g_proc_id == 0) { fprintf(stdout,"# The number of processes is %d \n",g_nproc); printf("# The lattice size is %d x %d x %d x %d\n", (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); printf("# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY),(int) LZ); fflush(stdout); } /* define the geometry */ geometry(); j = init_bsm_2hop_lookup(VOLUME); if ( j!= 0) { // this should not be reached since the init function calls fatal_error anyway fprintf(stderr, "Not enough memory for BSM2b 2hop lookup table! 
Aborting...\n"); exit(0); } /* define the boundary conditions for the fermion fields */ /* for the actual inversion, this is done in invert.c as the operators are iterated through */ // // For the BSM operator we don't use kappa normalisation, // as a result, when twisted boundary conditions are applied this needs to be unity. // In addition, unlike in the Wilson case, the hopping term comes with a plus sign. // However, in boundary(), the minus sign for the Wilson case is implicitly included. // We therefore use -1.0 here. boundary(-1.0); status = check_geometry(); if (status != 0) { fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n"); exit(1); } #if (defined MPI && !(defined _USE_SHMEM)) // fails, we're not using spinor fields // check_xchange(); #endif start_ranlux(1, 123456); // read gauge field if( strcmp(gauge_input_filename, "create_random_gaugefield") == 0 ) { random_gauge_field(reproduce_randomnumber_flag, g_gauge_field); } else { sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore); if (g_cart_id == 0) { printf("#\n# Trying to read gauge field from file %s in %s precision.\n", conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double")); fflush(stdout); } int i; if( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) { fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename); exit(-2); } if (g_cart_id == 0) { printf("# Finished reading gauge field.\n"); fflush(stdout); } } // read scalar field if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) { for( int s=0; s<numbScalarFields; s++ ) ranlxd(g_scalar_field[s], VOLUME); } else { sprintf(scalar_filename, "%s.%d", scalar_input_filename, nscalar); if (g_cart_id == 0) { printf("#\n# Trying to read scalar field from file %s in %s precision.\n", scalar_filename, (scalar_precision_read_flag == 32 ? 
"single" : "double")); fflush(stdout); } int i; if( (i = read_scalar_field(scalar_filename,g_scalar_field)) !=0) { fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename); exit(-2); } if (g_cart_id == 0) { printf("# Finished reading scalar field.\n"); fflush(stdout); } } #ifdef MPI xchange_gauge(g_gauge_field); #endif /*compute the energy of the gauge field*/ plaquette_energy = measure_plaquette( (const su3**) g_gauge_field); if (g_cart_id == 0) { printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc)); fflush(stdout); } #ifdef MPI for( int s=0; s<numbScalarFields; s++ ) generic_exchange(g_scalar_field[s], sizeof(scalar)); #endif /*initialize the bispinor fields*/ j_max=1; sdt=0.; // w random_spinor_field_lexic( (spinor*)(g_bispinor_field[4]), reproduce_randomnumber_flag, RN_GAUSS); random_spinor_field_lexic( (spinor*)(g_bispinor_field[4])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS); // for the D^\dagger test: // v random_spinor_field_lexic( (spinor*)(g_bispinor_field[5]), reproduce_randomnumber_flag, RN_GAUSS); random_spinor_field_lexic( (spinor*)(g_bispinor_field[5])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS); #if defined MPI generic_exchange(g_bispinor_field[4], sizeof(bispinor)); #endif // print L2-norm of source: double squarenorm = square_norm((spinor*)g_bispinor_field[4], 2*VOLUME, 1); if(g_proc_id==0) { printf("\n# square norm of the source: ||w||^2 = %e\n\n", squarenorm); fflush(stdout); } double t_MG, t_BK; /* inversion needs to be done first because it uses loads of the g_bispinor_fields internally */ #if TEST_INVERSION if(g_proc_id==1) printf("Testing inversion\n"); // Bartek's operator t1 = gettime(); cg_her_bi(g_bispinor_field[9], g_bispinor_field[4], 25000, 1.0e-14, 0, VOLUME, &Q2_psi_BSM2b); t_BK = gettime() - t1; // Marco's operator t1 = gettime(); cg_her_bi(g_bispinor_field[8], g_bispinor_field[4], 25000, 1.0e-14, 0, VOLUME, &Q2_psi_BSM2m); t_MG = gettime() - t1; 
if(g_proc_id==0) printf("Operator inversion time: t_MG = %f sec \t t_BK = %f sec\n\n", t_MG, t_BK); #endif /* now apply the operators to the same bispinor field and do various comparisons */ // Marco's operator #ifdef MPI MPI_Barrier(MPI_COMM_WORLD); #endif t_MG = 0.0; t1 = gettime(); D_psi_BSM2m(g_bispinor_field[0], g_bispinor_field[4]); t1 = gettime() - t1; #ifdef MPI MPI_Allreduce (&t1, &t_MG, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); #else t_MG = t1; #endif // Bartek's operator #ifdef MPI MPI_Barrier(MPI_COMM_WORLD); #endif t_BK = 0.0; t1 = gettime(); D_psi_BSM2b(g_bispinor_field[1], g_bispinor_field[4]); t1 = gettime() - t1; #ifdef MPI MPI_Allreduce (&t1, &t_BK, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); #else t_BK = t1; #endif if(g_proc_id==0) printf("Operator application time: t_MG = %f sec \t t_BK = %f sec\n\n", t_MG, t_BK); squarenorm = square_norm((spinor*)g_bispinor_field[0], 2*VOLUME, 1); if(g_proc_id==0) { printf("# || D_MG w ||^2 = %.16e\n", squarenorm); fflush(stdout); } squarenorm = square_norm((spinor*)g_bispinor_field[1], 2*VOLUME, 1); if(g_proc_id==0) { printf("# || D_BK w ||^2 = %.16e\n\n\n", squarenorm); fflush(stdout); } diff( (spinor*)g_bispinor_field[3], (spinor*)g_bispinor_field[0], (spinor*)g_bispinor_field[1], 2*VOLUME); printf("element-wise difference between (D_BK w) and (D_MG w)\n"); printf("( D_MG w - M_BK w )->sp_up.s0.c0= %.16e + I*(%.16e)\n\n", creal(g_bispinor_field[3][0].sp_up.s0.c0), cimag(g_bispinor_field[3][0].sp_up.s0.c0) ); double diffnorm = square_norm( (spinor*) g_bispinor_field[3], 2*VOLUME, 1 ); if(g_proc_id==0){ printf("Square norm of the difference\n"); printf("|| D_MG w - D_BK w ||^2 = %.16e \n\n\n", diffnorm); } // < D w, v > printf("Check consistency of D and D^dagger\n"); _Complex double prod1_MG = scalar_prod( (spinor*)g_bispinor_field[0], (spinor*)g_bispinor_field[5], 2*VOLUME, 1 ); if(g_proc_id==0) printf("< D_MG w, v > = %.16e + I*(%.16e)\n", creal(prod1_MG), cimag(prod1_MG)); _Complex double prod1_BK = 
scalar_prod( (spinor*)g_bispinor_field[1], (spinor*)g_bispinor_field[5], 2*VOLUME, 1 ); if(g_proc_id==0) printf("< D_BK w, v > = %.16e + I*(%.16e)\n\n", creal(prod1_BK), cimag(prod1_BK)); // < w, D^\dagger v > t_MG = gettime(); D_psi_dagger_BSM2m(g_bispinor_field[6], g_bispinor_field[5]); t_MG = gettime()-t_MG; t_BK = gettime(); D_psi_dagger_BSM2b(g_bispinor_field[7], g_bispinor_field[5]); t_BK = gettime() - t_BK; if(g_proc_id==0) printf("Operator dagger application time: t_MG = %f sec \t t_BK = %f sec\n\n", t_MG, t_BK); _Complex double prod2_MG = scalar_prod((spinor*)g_bispinor_field[4], (spinor*)g_bispinor_field[6], 2*VOLUME, 1); _Complex double prod2_BK = scalar_prod((spinor*)g_bispinor_field[4], (spinor*)g_bispinor_field[7], 2*VOLUME, 1); if( g_proc_id == 0 ){ printf("< w, D_MG^dagger v > = %.16e + I*(%.16e)\n", creal(prod2_MG), cimag(prod2_MG)); printf("< w, D_BK^dagger v > = %.16e + I*(%.16e)\n", creal(prod2_BK), cimag(prod2_BK)); printf("\n| < D_MG w, v > - < w, D_MG^dagger v > | = %.16e\n",cabs(prod2_MG-prod1_MG)); printf("| < D_BK w, v > - < w, D_BK^dagger v > | = %.16e\n\n",cabs(prod2_BK-prod1_BK)); } #if TEST_INVERSION // check result of inversion Q2_psi_BSM2m(g_bispinor_field[10], g_bispinor_field[8]); Q2_psi_BSM2b(g_bispinor_field[11], g_bispinor_field[8]); assign_diff_mul((spinor*)g_bispinor_field[10], (spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); assign_diff_mul((spinor*)g_bispinor_field[11], (spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); double squarenorm_MGMG = square_norm((spinor*)g_bispinor_field[10], 2*VOLUME, 1); double squarenorm_BKMG = square_norm((spinor*)g_bispinor_field[11], 2*VOLUME, 1); if(g_proc_id==0) { printf("# ||Q2_MG*(Q2_MG)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_MGMG); printf("# ||Q2_BK*(Q2_MG)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_BKMG); fflush(stdout); } Q2_psi_BSM2b(g_bispinor_field[10], g_bispinor_field[9]); Q2_psi_BSM2m(g_bispinor_field[11], g_bispinor_field[9]); assign_diff_mul((spinor*)g_bispinor_field[10], 
(spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); assign_diff_mul((spinor*)g_bispinor_field[11], (spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); double squarenorm_BKBK = square_norm((spinor*)g_bispinor_field[10], 2*VOLUME, 1); double squarenorm_MGBK = square_norm((spinor*)g_bispinor_field[11], 2*VOLUME, 1); if(g_proc_id==0) { printf("# ||Q2_BK*(Q2_BK)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_BKBK); printf("# ||Q2_MG*(Q2_BK)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_MGBK); fflush(stdout); } #endif #ifdef OMP free_omp_accumulators(); #endif free_gauge_field(); free_geometry_indices(); free_bispinor_field(); free_scalar_field(); #ifdef MPI MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); #endif return(0); }
int update() //Basic HMC update step { double squnrm; int i, acc; double exphdiff; /* the new impulses and the 'generator' of the arbitrary pseudofield */ /* calculate the hamiltonian of this state: new impulses + action */ /* g_X is ab-used a bit - here it is \xi = (gamma5 D)^{-1} \phi */ ham_old = s_g_old; for(i=0; i<GRIDPOINTS; i++) { gp1[i] = gauss(); gp2[i] = gauss(); ham_old += 0.5*(gp1[i]*gp1[i] + gp2[i]*gp2[i]); } /* Now create the field and calculate its contributions to the action (end of the 'misuse') */ /* squnrm is the fermion part of the action : */ /* S = R^dagger * R = g_fermion^dag * D^{-1 dag} * D^{-1} * g_fermion = g_fermion Q^-1 g_fermion */ /* PF1 det(1/(Q^2 + mu^2)) */ for(i=0; i<GRIDPOINTS; i++) { g_X[i].s1 = (gauss() + I*gauss())/sqrt(2); //Gaussian fields R g_X[i].s2 = (gauss() + I*gauss())/sqrt(2); } squnrm = square_norm(g_X); // step iv): g_fermion = \phi = K^dag * g_X = K^dag * \xi gam5D_wilson(g_fermion, g_X); assign_diff_mul(g_fermion, g_X, 0.+I*sqrt(g_musqr)); ham_old += squnrm; /* PF2 det((Q^2 + mu^2)/Q^2) */ if(no_timescales > 2) { for(i=0; i<GRIDPOINTS; i++) { g_X[i].s1 = (gauss() + I*gauss())/sqrt(2); //Gaussian fields R g_X[i].s2 = (gauss() + I*gauss())/sqrt(2); } squnrm = square_norm(g_X); cg(g_fermion2, g_X, ITER_MAX, DELTACG, &gam5D_SQR_musqr_wilson); gam5D_wilson(g_gam5DX, g_fermion2); assign_add_mul(g_gam5DX, g_fermion2, 0.+I*sqrt(g_musqr)); gam5D_wilson(g_fermion2, g_gam5DX); ham_old += squnrm; } // Add the part for the fermion fields // Do the molecular dynamic chain /* the simple LF scheme */ /* the second order minimal norm multi-timescale integrator*/ /* MN2_integrator(g_steps, 2, g_steps*g_stepsize, 0.2); */ /* This is the recursive implementation */ /* in can be found in rec_lf_integrator.c|h */ if (no_timescales == 1) leapfrog(n_steps[0], tau/n_steps[0]); else integrate_leap_frog(tau/n_steps[no_timescales-1], no_timescales-1, no_timescales, n_steps, 1, up_momenta); // Calculate the new action and hamiltonian ham = 0; 
s_g = 0; for (i=0; i<GRIDPOINTS; i++) { s_g += S_G(i); ham += 0.5*(gp1[i]*gp1[i] + gp2[i]*gp2[i]); } /* Sum_ij [(g_fermion^*)_i (Q^-1)_ij (g_fermion)_j] = Sum_ij [(g_fermion^*)_i (g_X)_i] */ ham += s_g; // add in the part for the fermion fields. cg(g_X, g_fermion, ITER_MAX, DELTACG, &gam5D_SQR_musqr_wilson); ham += scalar_prod_r(g_fermion, g_X); if(no_timescales > 2) { cg(g_gam5DX, g_fermion2, ITER_MAX, DELTACG, &gam5D_SQR_wilson); gam5D_SQR_musqr_wilson(g_X, g_temp, g_gam5DX); ham += scalar_prod_r(g_fermion2, g_X); } exphdiff = exp(ham_old-ham); acc = accept(exphdiff); for(i=0; i<GRIDPOINTS; i++) { gauge1_old[i]=gauge1[i]; gauge2_old[i]=gauge2[i]; } s_g_old = s_g; return(acc); }
/*
 * Minimal-residual iteration on a single block.
 *
 * P         output: zeroed on entry, then updated with the iterate
 * Q         source field
 * max_iter  maximum number of iterations
 * eps_sq    stop once the squared residual is <= eps_sq
 * rel_prec  accepted for interface compatibility; not used, the stopping
 *           test is always the absolute one
 * N         number of sites of the block fields
 * f         block operator, called as f(out, in, blk)
 * blk       block index handed through to f
 *
 * Returns the number of iterations performed, or -1 if the squared
 * residual is still above eps_sq afterwards or if the work space could
 * not be allocated (P is then left zeroed).
 *
 * The work space is a function-static buffer of three fields with N+1
 * spinors each plus one spare spinor; it is re-allocated whenever N
 * differs from the previous call and is never released.
 * All norms and scalar products are taken with parallel = 0.
 */
int mrblk(spinor * const P, spinor * const Q,
          const int max_iter, const double eps_sq,
          const int rel_prec, const int N,
          matrix_mult_blk f, const int blk) {
  static int mr_init=0;
  static spinor *s_=NULL;
  static int N_;
  int i = 0;
  double norm_r,beta;
  _Complex double alpha;
  spinor * r;
  const int parallel = 0;
  spinor * s[3];

  if(mr_init == 0 || N != N_) {
    /* s_ is NULL before the first successful allocation; free(NULL) is a no-op */
    free(s_);
    s_ = calloc(3*(N+1)+1, sizeof(spinor));
    if(s_ == NULL) {
      /* previously the NULL pointer was used unchecked and mr_init stayed
         set, so later calls with the same N dereferenced it as well */
      mr_init = 0;
      fprintf(stderr, "mrblk: could not allocate work space for N = %d\n", N);
      zero_spinor_field(P, N);
      return(-1);
    }
    N_ = N;
    mr_init = 1;
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  /* round the base address up to the next ALIGN_BASE+1 boundary;
     the spare spinor in the allocation provides the slack */
  s[0] = (spinor *)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  s[0] = s_;
#endif
  s[1] = s[0] + N + 1;
  s[2] = s[1] + N + 1;

  r = s[0];

  /* start from P = 0: r = Q - f(P), s[2] tracks f(P) */
  zero_spinor_field(P, N);
  f(s[2], P, blk);
  diff(r, Q, s[2], N);
  norm_r = square_norm(r, N, parallel);
  if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) {
    printf("MRblk iteration= %d  |res|^2= %e\n", i, norm_r);
    fflush( stdout );
  }

  while((norm_r > eps_sq) && (i < max_iter)){
    i++;
    /* alpha = <f(r), r> / ||f(r)||^2 */
    f(s[1], r, blk);
    alpha = scalar_prod(s[1], r, N, parallel);
    beta = square_norm(s[1], N, parallel);
    alpha /= beta;
    assign_add_mul(P, r, alpha, N);
    /* every 50th step f(P) is recomputed from scratch, otherwise it is
       updated with alpha * f(r) */
    if(i%50 == 0) {
      f(s[2], P,blk);
    }
    else{
      assign_add_mul(s[2], s[1], alpha, N);
    }

    diff(r, Q, s[2], N);
    norm_r = square_norm(r, N, parallel);
    if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) {
      printf("MRblk iteration= %d  |res|^2= %g\n", i, norm_r);
      fflush(stdout);
    }
  }

  if(norm_r > eps_sq){
    return(-1);
  }
  return(i);
}