void Potential::generate_local_potential() { PROFILE_WITH_TIMER("sirius::Potential::generate_local_potential"); mdarray<double, 2> vloc_radial_integrals(unit_cell_.num_atom_types(), ctx_.gvec().num_shells()); /* split G-shells between MPI ranks */ splindex<block> spl_gshells(ctx_.gvec().num_shells(), comm_.size(), comm_.rank()); #pragma omp parallel { /* splines for all atom types */ std::vector< Spline<double> > sa(unit_cell_.num_atom_types()); for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) sa[iat] = Spline<double>(unit_cell_.atom_type(iat).radial_grid()); #pragma omp for for (int igsloc = 0; igsloc < spl_gshells.local_size(); igsloc++) { int igs = spl_gshells[igsloc]; for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) { auto& atom_type = unit_cell_.atom_type(iat); if (igs == 0) { for (int ir = 0; ir < atom_type.num_mt_points(); ir++) { double x = atom_type.radial_grid(ir); sa[iat][ir] = (x * atom_type.uspp().vloc[ir] + atom_type.zn()) * x; } vloc_radial_integrals(iat, igs) = sa[iat].interpolate().integrate(0); } else { double g = ctx_.gvec().shell_len(igs); double g2 = std::pow(g, 2); for (int ir = 0; ir < atom_type.num_mt_points(); ir++) { double x = atom_type.radial_grid(ir); sa[iat][ir] = (x * atom_type.uspp().vloc[ir] + atom_type.zn() * gsl_sf_erf(x)) * std::sin(g * x); } vloc_radial_integrals(iat, igs) = (sa[iat].interpolate().integrate(0) / g - atom_type.zn() * std::exp(-g2 / 4) / g2); } } } } int ld = unit_cell_.num_atom_types(); comm_.allgather(vloc_radial_integrals.at<CPU>(), ld * spl_gshells.global_offset(), ld * spl_gshells.local_size()); auto v = unit_cell_.make_periodic_function(vloc_radial_integrals, ctx_.gvec()); ctx_.fft().prepare(ctx_.gvec().partition()); ctx_.fft().transform<1>(ctx_.gvec().partition(), &v[ctx_.gvec().partition().gvec_offset_fft()]); ctx_.fft().output(&local_potential_->f_rg(0)); ctx_.fft().dismiss(); }
void Density::generate(K_set& ks__) { PROFILE_WITH_TIMER("sirius::Density::generate"); generate_valence(ks__); if (ctx_.full_potential()) { generate_core_charge_density(); /* add core contribution */ for (int ialoc = 0; ialoc < (int)unit_cell_.spl_num_atoms().local_size(); ialoc++) { int ia = unit_cell_.spl_num_atoms(ialoc); for (int ir = 0; ir < unit_cell_.atom(ia).num_mt_points(); ir++) rho_->f_mt<local>(0, ir, ialoc) += unit_cell_.atom(ia).symmetry_class().core_charge_density(ir) / y00; } /* synchronize muffin-tin part */ rho_->sync_mt(); for (int j = 0; j < ctx_.num_mag_dims(); j++) magnetization_[j]->sync_mt(); } double nel = 0; if (ctx_.full_potential()) { std::vector<double> nel_mt; double nel_it; nel = rho_->integrate(nel_mt, nel_it); } else { nel = rho_->f_pw(0).real() * unit_cell_.omega(); } if (std::abs(nel - unit_cell_.num_electrons()) > 1e-5) { std::stringstream s; s << "wrong charge density after k-point summation" << std::endl << "obtained value : " << nel << std::endl << "target value : " << unit_cell_.num_electrons() << std::endl << "difference : " << fabs(nel - unit_cell_.num_electrons()) << std::endl; if (ctx_.full_potential()) { s << "total core leakage : " << core_leakage(); for (int ic = 0; ic < unit_cell_.num_atom_symmetry_classes(); ic++) s << std::endl << " atom class : " << ic << ", core leakage : " << core_leakage(ic); } WARNING(s); } #ifdef __PRINT_OBJECT_HASH DUMP("hash(rhomt): %16llX", rho_->f_mt().hash()); DUMP("hash(rhoit): %16llX", rho_->f_it().hash()); #endif //if (debug_level > 1) check_density_continuity_at_mt(); }
inline void Density::add_k_point_contribution_rg(K_point* kp__) { PROFILE_WITH_TIMER("sirius::Density::add_k_point_contribution_rg"); int nfv = ctx_.num_fv_states(); double omega = unit_cell_.omega(); mdarray<double, 2> density_rg(ctx_.fft().local_size(), ctx_.num_mag_dims() + 1); density_rg.zero(); #ifdef __GPU if (ctx_.fft().hybrid()) { density_rg.allocate(memory_t::device); density_rg.zero_on_device(); } #endif ctx_.fft().prepare(kp__->gkvec().partition()); /* non-magnetic or collinear case */ if (ctx_.num_mag_dims() != 3) { for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) { if (!kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().global_index_size()) { continue; } #pragma omp for schedule(dynamic, 1) for (int i = 0; i < kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().local_size(); i++) { int j = kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col()[i]; double w = kp__->band_occupancy(j + ispn * nfv) * kp__->weight() / omega; /* transform to real space; in case of GPU wave-function stays in GPU memory */ if (ctx_.fft().gpu_only()) { ctx_.fft().transform<1>(kp__->gkvec().partition(), kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<GPU>(0, i)); } else { ctx_.fft().transform<1>(kp__->gkvec().partition(), kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i)); } if (ctx_.fft().hybrid()) { #ifdef __GPU update_density_rg_1_gpu(ctx_.fft().local_size(), ctx_.fft().buffer<GPU>(), w, density_rg.at<GPU>(0, ispn)); #else TERMINATE_NO_GPU #endif } else { #pragma omp parallel for for (int ir = 0; ir < ctx_.fft().local_size(); ir++) { auto z = ctx_.fft().buffer(ir); density_rg(ir, ispn) += w * (std::pow(z.real(), 2) + std::pow(z.imag(), 2)); } } } }
inline void Band::set_fv_h_o<CPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__, Periodic_function<double>* effective_potential__, dmatrix<double_complex>& h__, dmatrix<double_complex>& o__) const { PROFILE_WITH_TIMER("sirius::Band::set_fv_h_o"); h__.zero(); o__.zero(); double_complex zone(1, 0); int num_atoms_in_block = 2 * omp_get_max_threads(); int nblk = unit_cell_.num_atoms() / num_atoms_in_block + std::min(1, unit_cell_.num_atoms() % num_atoms_in_block); DUMP("nblk: %i", nblk); int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size(); DUMP("max_mt_aw: %i", max_mt_aw); mdarray<double_complex, 2> alm_row(kp__->num_gkvec_row(), max_mt_aw); mdarray<double_complex, 2> alm_col(kp__->num_gkvec_col(), max_mt_aw); mdarray<double_complex, 2> halm_col(kp__->num_gkvec_col(), max_mt_aw); runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm"); for (int iblk = 0; iblk < nblk; iblk++) { int num_mt_aw = 0; std::vector<int> offsets(num_atoms_in_block); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); offsets[ia - iblk * num_atoms_in_block] = num_mt_aw; num_mt_aw += type.mt_aw_basis_size(); } #ifdef __PRINT_OBJECT_CHECKSUM alm_row.zero(); alm_col.zero(); halm_col.zero(); #endif #pragma omp parallel { int tid = omp_get_thread_num(); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { if (ia % omp_get_num_threads() == tid) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_row(), type.mt_aw_basis_size()); mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size()); mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size()); kp__->alm_coeffs_row()->generate(ia, alm_row_tmp); for (int xi = 0; xi < type.mt_aw_basis_size(); xi++) { for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi)); } kp__->alm_coeffs_col()->generate(ia, alm_col_tmp); apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp); /* setup apw-lo and lo-apw blocks */ set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__); } } } #ifdef __PRINT_OBJECT_CHECKSUM double_complex z1 = alm_row.checksum(); double_complex z2 = alm_col.checksum(); double_complex z3 = halm_col.checksum(); DUMP("checksum(alm_row): %18.10f %18.10f", std::real(z1), std::imag(z1)); DUMP("checksum(alm_col): %18.10f %18.10f", std::real(z2), std::imag(z2)); DUMP("checksum(halm_col): %18.10f %18.10f", std::real(z3), std::imag(z3)); #endif linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, alm_row.at<CPU>(), alm_row.ld(), alm_col.at<CPU>(), alm_col.ld(), zone, o__.at<CPU>(), o__.ld()); linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, alm_row.at<CPU>(), alm_row.ld(), halm_col.at<CPU>(), halm_col.ld(), zone, h__.at<CPU>(), h__.ld()); } double tval = t1.stop(); if (kp__->comm().rank() == 0) { DUMP("effective zgemm performance: %12.6f GFlops", 2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval); } /* add interstitial contributon */ set_fv_h_o_it(kp__, effective_potential__, h__, o__); /* setup lo-lo block */ set_fv_h_o_lo_lo(kp__, h__, o__); }
void Density::generate_valence(K_set& ks__) { PROFILE_WITH_TIMER("sirius::Density::generate_valence"); double wt = 0.0; double ot = 0.0; for (int ik = 0; ik < ks__.num_kpoints(); ik++) { wt += ks__[ik]->weight(); for (int j = 0; j < ctx_.num_bands(); j++) ot += ks__[ik]->weight() * ks__[ik]->band_occupancy(j); } if (std::abs(wt - 1.0) > 1e-12) TERMINATE("K_point weights don't sum to one"); if (std::abs(ot - unit_cell_.num_valence_electrons()) > 1e-8) { std::stringstream s; s << "wrong occupancies" << std::endl << " computed : " << ot << std::endl << " required : " << unit_cell_.num_valence_electrons() << std::endl << " difference : " << std::abs(ot - unit_cell_.num_valence_electrons()); WARNING(s); } /* swap wave functions */ for (int ikloc = 0; ikloc < ks__.spl_num_kpoints().local_size(); ikloc++) { int ik = ks__.spl_num_kpoints(ikloc); auto kp = ks__[ik]; for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) { if (ctx_.full_potential()) { kp->spinor_wave_functions<true>(ispn).swap_forward(0, kp->num_occupied_bands(ispn)); } else { kp->spinor_wave_functions<false>(ispn).swap_forward(0, kp->num_occupied_bands(ispn), kp->gkvec_fft_distr()); } } } /* zero density and magnetization */ zero(); ctx_.fft().prepare(); /* interstitial part is independent of basis type */ generate_valence_density_it(ks__); /* for muffin-tin part */ switch (ctx_.esm_type()) { case full_potential_lapwlo: { generate_valence_density_mt(ks__); break; } case full_potential_pwlo: { STOP(); } default: { break; } } #if (__VERIFICATION > 0) for (int ir = 0; ir < ctx_.fft(0)->local_size(); ir++) { if (rho_->f_it(ir) < 0) TERMINATE("density is wrong"); } #endif //== double nel = 0; //== for (int ir = 0; ir < ctx_.fft().local_size(); ir++) //== { //== nel += rho_->f_rg(ir); //== } //== ctx_.mpi_grid().communicator(1 << _dim_row_).allreduce(&nel, 1); //== nel = nel * unit_cell_.omega() / ctx_.fft().size(); //== printf("number of electrons: %f\n", nel); /* get rho(G) and mag(G) */ rho_->fft_transform(-1); for (int j = 0; j < ctx_.num_mag_dims(); j++) magnetization_[j]->fft_transform(-1); //== printf("number of electrons: %f\n", rho_->f_pw(0).real() * unit_cell_.omega()); //== STOP(); ctx_.fft().dismiss(); if (ctx_.esm_type() == ultrasoft_pseudopotential) augment(ks__); }
inline void K_point::generate_fv_states() { PROFILE_WITH_TIMER("sirius::K_point::generate_fv_states"); if (!ctx_.full_potential()) { return; } mdarray<double_complex, 2> pw_coeffs; mdarray<double_complex, 2> mt_coeffs; int nbnd_loc; /* in both cases eigen-vectors are redistributed to the same "full column" storage */ if (ctx_.iterative_solver_input_section().type_ == "exact") { fv_eigen_vectors_->remap_forward(0, ctx_.num_fv_states()); /* local number of bands */ nbnd_loc = fv_eigen_vectors_->spl_num_col().local_size(); if (nbnd_loc) { pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(), gklo_basis_size(), nbnd_loc); mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(num_gkvec(), 0), gklo_basis_size(), nbnd_loc); } } else { fv_eigen_vectors_slab_->remap_to_full_column_distr(ctx_.num_fv_states()); assert(fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size() == fv_eigen_vectors_slab_->mt_coeffs().spl_num_col().local_size()); /* local number of bands */ nbnd_loc = fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size(); if (nbnd_loc) { pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->pw_coeffs().extra().at<CPU>(), num_gkvec(), nbnd_loc); mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->mt_coeffs().extra().at<CPU>(), unit_cell_.mt_lo_basis_size(), nbnd_loc); } } #ifdef __GPU if (ctx_.processing_unit() == GPU) { pw_coeffs.allocate(memory_t::device); pw_coeffs.copy_to_device(); } #endif fv_states().prepare_full_column_distr(ctx_.num_fv_states()); assert(nbnd_loc == fv_states().pw_coeffs().spl_num_col().local_size()); assert(nbnd_loc == fv_states().mt_coeffs().spl_num_col().local_size()); #pragma omp parallel { /* get thread id */ #ifdef __GPU int tid = omp_get_thread_num(); #endif mdarray<double_complex, 2> alm(num_gkvec(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned); mdarray<double_complex, 2> tmp; #ifdef __GPU if (ctx_.processing_unit() == GPU) { alm.allocate(memory_t::device); tmp = mdarray<double_complex, 2>(unit_cell_.max_mt_aw_basis_size(), nbnd_loc, memory_t::device); } #endif #pragma omp for for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) { /* number of alm coefficients for atom */ int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size(); /* offset in wave-function */ int offset_wf = unit_cell_.atom(ia).offset_mt_coeffs(); /* generate matching coefficients for all G-vectors */ alm_coeffs_->generate(ia, alm); /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */ if (ctx_.processing_unit() == CPU) { /* multiply eigen-vectors and matching coefficients */ linalg<CPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(), alm.at<CPU>(), alm.ld(), pw_coeffs.at<CPU>(), pw_coeffs.ld(), fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld()); } #ifdef __GPU if (ctx_.processing_unit() == GPU) { /* multiply eigen-vectors and matching coefficients */ alm.async_copy_to_device(tid); linalg<GPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(), alm.at<GPU>(), alm.ld(), pw_coeffs.at<GPU>(), pw_coeffs.ld(), tmp.at<GPU>(), tmp.ld(), tid); acc::copyout(fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld(), tmp.at<GPU>(), tmp.ld(), mt_aw_size, nbnd_loc, tid); acc::sync_stream(tid); } #endif for (int i = 0; i < nbnd_loc; i++) { /* lo block */ std::memcpy(fv_states().mt_coeffs().extra().at<CPU>(offset_wf + mt_aw_size, i), mt_coeffs.at<CPU>(unit_cell_.atom(ia).offset_lo(), i), unit_cell_.atom(ia).mt_lo_basis_size() * sizeof(double_complex)); } } #pragma omp for for (int i = 0; i < nbnd_loc; i++) { /* G+k block */ std::memcpy(fv_states().pw_coeffs().extra().at<CPU>(0, i), pw_coeffs.at<CPU>(0, i), num_gkvec() * sizeof(double_complex)); } } fv_states().remap_to_prime_distr(ctx_.num_fv_states()); }