inline void K_point::generate_fv_states() { PROFILE_WITH_TIMER("sirius::K_point::generate_fv_states"); if (!ctx_.full_potential()) { return; } mdarray<double_complex, 2> pw_coeffs; mdarray<double_complex, 2> mt_coeffs; int nbnd_loc; /* in both cases eigen-vectors are redistributed to the same "full column" storage */ if (ctx_.iterative_solver_input_section().type_ == "exact") { fv_eigen_vectors_->remap_forward(0, ctx_.num_fv_states()); /* local number of bands */ nbnd_loc = fv_eigen_vectors_->spl_num_col().local_size(); if (nbnd_loc) { pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(), gklo_basis_size(), nbnd_loc); mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(num_gkvec(), 0), gklo_basis_size(), nbnd_loc); } } else { fv_eigen_vectors_slab_->remap_to_full_column_distr(ctx_.num_fv_states()); assert(fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size() == fv_eigen_vectors_slab_->mt_coeffs().spl_num_col().local_size()); /* local number of bands */ nbnd_loc = fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size(); if (nbnd_loc) { pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->pw_coeffs().extra().at<CPU>(), num_gkvec(), nbnd_loc); mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->mt_coeffs().extra().at<CPU>(), unit_cell_.mt_lo_basis_size(), nbnd_loc); } } #ifdef __GPU if (ctx_.processing_unit() == GPU) { pw_coeffs.allocate(memory_t::device); pw_coeffs.copy_to_device(); } #endif fv_states().prepare_full_column_distr(ctx_.num_fv_states()); assert(nbnd_loc == fv_states().pw_coeffs().spl_num_col().local_size()); assert(nbnd_loc == fv_states().mt_coeffs().spl_num_col().local_size()); #pragma omp parallel { /* get thread id */ #ifdef __GPU int tid = omp_get_thread_num(); #endif mdarray<double_complex, 2> alm(num_gkvec(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned); mdarray<double_complex, 2> tmp; #ifdef __GPU if (ctx_.processing_unit() == GPU) { alm.allocate(memory_t::device); tmp = mdarray<double_complex, 2>(unit_cell_.max_mt_aw_basis_size(), nbnd_loc, memory_t::device); } #endif #pragma omp for for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) { /* number of alm coefficients for atom */ int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size(); /* offset in wave-function */ int offset_wf = unit_cell_.atom(ia).offset_mt_coeffs(); /* generate matching coefficients for all G-vectors */ alm_coeffs_->generate(ia, alm); /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */ if (ctx_.processing_unit() == CPU) { /* multiply eigen-vectors and matching coefficients */ linalg<CPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(), alm.at<CPU>(), alm.ld(), pw_coeffs.at<CPU>(), pw_coeffs.ld(), fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld()); } #ifdef __GPU if (ctx_.processing_unit() == GPU) { /* multiply eigen-vectors and matching coefficients */ alm.async_copy_to_device(tid); linalg<GPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(), alm.at<GPU>(), alm.ld(), pw_coeffs.at<GPU>(), pw_coeffs.ld(), tmp.at<GPU>(), tmp.ld(), tid); acc::copyout(fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld(), tmp.at<GPU>(), tmp.ld(), mt_aw_size, nbnd_loc, tid); acc::sync_stream(tid); } #endif for (int i = 0; i < nbnd_loc; i++) { /* lo block */ std::memcpy(fv_states().mt_coeffs().extra().at<CPU>(offset_wf + mt_aw_size, i), mt_coeffs.at<CPU>(unit_cell_.atom(ia).offset_lo(), i), unit_cell_.atom(ia).mt_lo_basis_size() * sizeof(double_complex)); } } #pragma omp for for (int i = 0; i < nbnd_loc; i++) { /* G+k block */ std::memcpy(fv_states().pw_coeffs().extra().at<CPU>(0, i), pw_coeffs.at<CPU>(0, i), num_gkvec() * sizeof(double_complex)); } } fv_states().remap_to_prime_distr(ctx_.num_fv_states()); }
inline void K_point::generate_fv_states() { PROFILE("sirius::K_point::generate_fv_states"); if (!ctx_.full_potential()) { return; } #ifdef __GPU if (ctx_.processing_unit() == GPU) { fv_eigen_vectors_slab().pw_coeffs().allocate_on_device(); fv_eigen_vectors_slab().pw_coeffs().copy_to_device(0, ctx_.num_fv_states()); } #endif mdarray<double_complex, 2> alm(num_gkvec_loc(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned); mdarray<double_complex, 2> tmp(unit_cell_.max_mt_aw_basis_size(), ctx_.num_fv_states()); #ifdef __GPU if (ctx_.processing_unit() == GPU) { alm.allocate(memory_t::device); tmp.allocate(memory_t::device); } #endif for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) { auto location = fv_eigen_vectors_slab().spl_num_atoms().location(ia); /* number of alm coefficients for atom */ int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size(); int mt_lo_size = unit_cell_.atom(ia).mt_lo_basis_size(); /* generate matching coefficients for all G-vectors */ alm_coeffs_loc_->generate(ia, alm); double_complex* tmp_ptr_gpu = (ctx_.processing_unit() == GPU) ? tmp.at<GPU>() : nullptr; mdarray<double_complex, 2> tmp1(tmp.at<CPU>(), tmp_ptr_gpu, mt_aw_size, ctx_.num_fv_states()); /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */ if (ctx_.processing_unit() == CPU) { linalg<CPU>::gemm(1, 0, mt_aw_size, ctx_.num_fv_states(), num_gkvec_loc(), alm.at<CPU>(), alm.ld(), fv_eigen_vectors_slab().pw_coeffs().prime().at<CPU>(), fv_eigen_vectors_slab().pw_coeffs().prime().ld(), tmp1.at<CPU>(), tmp1.ld()); } #ifdef __GPU if (ctx_.processing_unit() == GPU) { alm.copy_to_device(mt_aw_size * num_gkvec_loc()); linalg<GPU>::gemm(1, 0, mt_aw_size, ctx_.num_fv_states(), num_gkvec_loc(), alm.at<GPU>(), alm.ld(), fv_eigen_vectors_slab().pw_coeffs().prime().at<GPU>(), fv_eigen_vectors_slab().pw_coeffs().prime().ld(), tmp1.at<GPU>(), tmp1.ld()); tmp1.copy_to_host(); } #endif comm_.reduce(tmp1.at<CPU>(), static_cast<int>(tmp1.size()), location.rank); #ifdef __PRINT_OBJECT_CHECKSUM auto z1 = tmp1.checksum(); DUMP("checksum(tmp1): %18.10f %18.10f", std::real(z1), std::imag(z1)); #endif if (location.rank == comm_.rank()) { int offset1 = fv_states().offset_mt_coeffs(location.local_index); int offset2 = fv_eigen_vectors_slab().offset_mt_coeffs(location.local_index); for (int i = 0; i < ctx_.num_fv_states(); i++) { /* aw block */ std::memcpy(fv_states().mt_coeffs().prime().at<CPU>(offset1, i), tmp1.at<CPU>(0, i), mt_aw_size * sizeof(double_complex)); /* lo block */ if (mt_lo_size) { std::memcpy(fv_states().mt_coeffs().prime().at<CPU>(offset1 + mt_aw_size, i), fv_eigen_vectors_slab().mt_coeffs().prime().at<CPU>(offset2, i), mt_lo_size * sizeof(double_complex)); } } } } #pragma omp parallel for for (int i = 0; i < ctx_.num_fv_states(); i++) { /* G+k block */ std::memcpy(fv_states().pw_coeffs().prime().at<CPU>(0, i), fv_eigen_vectors_slab().pw_coeffs().prime().at<CPU>(0, i), num_gkvec_loc() * sizeof(double_complex)); } #ifdef __GPU if (ctx_.processing_unit() == GPU) { fv_eigen_vectors_slab().pw_coeffs().deallocate_on_device(); } #endif }