inline void orthogonalize(int N__, int n__, std::vector<wave_functions*> wfs__, int idx_bra__, int idx_ket__, dmatrix<T>& o__, wave_functions& tmp__) { PROFILE("sddk::wave_functions::orthogonalize"); auto pu = wfs__[0]->pu(); /* project out the old subspace: * |\tilda phi_new> = |phi_new> - |phi_old><phi_old|phi_new> */ if (N__ > 0) { inner(*wfs__[idx_bra__], 0, N__, *wfs__[idx_ket__], N__, n__, 0.0, o__, 0, 0); transform(pu, -1.0, wfs__, 0, N__, o__, 0, 0, 1.0, wfs__, N__, n__); } /* orthogonalize new n__ x n__ block */ inner(*wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, 0.0, o__, 0, 0); /* single MPI rank */ if (o__.blacs_grid().comm().size() == 1) { bool use_magma{false}; #if defined(__GPU) && defined(__MAGMA) if (pu == GPU) { use_magma = true; } #endif if (use_magma) { #ifdef __GPU /* Cholesky factorization */ if (int info = linalg<GPU>::potrf(n__, o__.template at<GPU>(), o__.ld())) { std::stringstream s; s << "error in GPU factorization, info = " << info; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<GPU>::trtri(n__, o__.template at<GPU>(), o__.ld())) { TERMINATE("error in inversion"); } #endif } else { /* CPU version */ //check_hermitian("OVLP", o__, n__); //o__.serialize("overlap.dat", n__); /* Cholesky factorization */ if (int info = linalg<CPU>::potrf(n__, &o__(0, 0), o__.ld())) { std::stringstream s; s << "error in factorization, info = " << info << std::endl << "number of existing states: " << N__ << std::endl << "number of new states: " << n__ << std::endl << "number of wave_functions: " << wfs__.size() << std::endl << "idx_bra: " << idx_bra__ << " " << "idx_ket:" << idx_ket__; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<CPU>::trtri(n__, &o__(0, 0), o__.ld())) { TERMINATE("error in inversion"); } if (pu == GPU) { #ifdef __GPU acc::copyin(o__.template at<GPU>(), o__.ld(), o__.template at<CPU>(), o__.ld(), n__, n__); #endif } } /* CPU version */ if (pu == CPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { /* wave functions are complex, transformation matrix is complex */ if (std::is_same<T, double_complex>::value) { linalg<CPU>::trmm('R', 'U', 'N', e->pw_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), e->pw_coeffs().prime().at<CPU>(0, N__), e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', e->mt_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), e->mt_coeffs().prime().at<CPU>(0, N__), e->mt_coeffs().prime().ld()); } } /* wave functions are real (psi(G) = psi^{*}(-G)), transformation matrix is real */ if (std::is_same<T, double>::value) { linalg<CPU>::trmm('R', 'U', 'N', 2 * e->pw_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(e->pw_coeffs().prime().at<CPU>(0, N__)), 2 * e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', 2 * e->mt_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(e->mt_coeffs().prime().at<CPU>(0, N__)), 2 * e->mt_coeffs().prime().ld()); } } } } #ifdef __GPU if (pu == GPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { if (std::is_same<T, double_complex>::value) { double_complex alpha(1, 0); linalg<GPU>::trmm('R', 'U', 'N', e->pw_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), e->pw_coeffs().prime().at<GPU>(0, N__), e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', e->mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), e->mt_coeffs().prime().at<GPU>(0, N__), e->mt_coeffs().prime().ld()); } /* alpha should not go out of the scope, so wait */ acc::sync_stream(-1); } if (std::is_same<T, double>::value) { double alpha{1}; linalg<GPU>::trmm('R', 'U', 'N', 2 * e->pw_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(e->pw_coeffs().prime().at<GPU>(0, N__)), 2 * e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', 2 * e->mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(e->mt_coeffs().prime().at<GPU>(0, N__)), 2 * e->mt_coeffs().prime().ld()); } acc::sync_stream(-1); } } acc::sync_stream(-1); } #endif } else { /* parallel transformation */ sddk::timer t1("sddk::wave_functions::orthogonalize|potrf"); if (int info = linalg<CPU>::potrf(n__, o__)) { std::stringstream s; s << "error in factorization, info = " << info; TERMINATE(s); } t1.stop(); sddk::timer t2("sddk::wave_functions::orthogonalize|trtri"); if (linalg<CPU>::trtri(n__, o__)) { TERMINATE("error in inversion"); } t2.stop(); /* o is upper triangular matrix */ for (int i = 0; i < n__; i++) { for (int j = i + 1; j < n__; j++) { o__.set(j, i, 0); } } /* phi is transformed into phi, so we can't use it as the output buffer; use tmp instead and then overwrite phi */ for (auto& e: wfs__) { transform(pu, *e, N__, n__, o__, 0, 0, tmp__, 0, n__); e->copy_from(tmp__, 0, n__, N__, pu); } } }
inline void Band::set_fv_h_o<CPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__, Periodic_function<double>* effective_potential__, dmatrix<double_complex>& h__, dmatrix<double_complex>& o__) const { PROFILE_WITH_TIMER("sirius::Band::set_fv_h_o"); h__.zero(); o__.zero(); double_complex zone(1, 0); int num_atoms_in_block = 2 * omp_get_max_threads(); int nblk = unit_cell_.num_atoms() / num_atoms_in_block + std::min(1, unit_cell_.num_atoms() % num_atoms_in_block); DUMP("nblk: %i", nblk); int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size(); DUMP("max_mt_aw: %i", max_mt_aw); mdarray<double_complex, 2> alm_row(kp__->num_gkvec_row(), max_mt_aw); mdarray<double_complex, 2> alm_col(kp__->num_gkvec_col(), max_mt_aw); mdarray<double_complex, 2> halm_col(kp__->num_gkvec_col(), max_mt_aw); runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm"); for (int iblk = 0; iblk < nblk; iblk++) { int num_mt_aw = 0; std::vector<int> offsets(num_atoms_in_block); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); offsets[ia - iblk * num_atoms_in_block] = num_mt_aw; num_mt_aw += type.mt_aw_basis_size(); } #ifdef __PRINT_OBJECT_CHECKSUM alm_row.zero(); alm_col.zero(); halm_col.zero(); #endif #pragma omp parallel { int tid = omp_get_thread_num(); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { if (ia % omp_get_num_threads() == tid) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_row(), type.mt_aw_basis_size()); mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size()); mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size()); kp__->alm_coeffs_row()->generate(ia, alm_row_tmp); for (int xi = 0; xi < type.mt_aw_basis_size(); xi++) { for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi)); } kp__->alm_coeffs_col()->generate(ia, alm_col_tmp); apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp); /* setup apw-lo and lo-apw blocks */ set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__); } } } #ifdef __PRINT_OBJECT_CHECKSUM double_complex z1 = alm_row.checksum(); double_complex z2 = alm_col.checksum(); double_complex z3 = halm_col.checksum(); DUMP("checksum(alm_row): %18.10f %18.10f", std::real(z1), std::imag(z1)); DUMP("checksum(alm_col): %18.10f %18.10f", std::real(z2), std::imag(z2)); DUMP("checksum(halm_col): %18.10f %18.10f", std::real(z3), std::imag(z3)); #endif linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, alm_row.at<CPU>(), alm_row.ld(), alm_col.at<CPU>(), alm_col.ld(), zone, o__.at<CPU>(), o__.ld()); linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, alm_row.at<CPU>(), alm_row.ld(), halm_col.at<CPU>(), halm_col.ld(), zone, h__.at<CPU>(), h__.ld()); } double tval = t1.stop(); if (kp__->comm().rank() == 0) { DUMP("effective zgemm performance: %12.6f GFlops", 2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval); } /* add interstitial contributon */ set_fv_h_o_it(kp__, effective_potential__, h__, o__); /* setup lo-lo block */ set_fv_h_o_lo_lo(kp__, h__, o__); }
inline void orthogonalize(device_t pu__, int num_sc__, int N__, int n__, std::vector<Wave_functions*> wfs__, int idx_bra__, int idx_ket__, dmatrix<T>& o__, wave_functions& tmp__) { PROFILE("sddk::wave_functions::orthogonalize"); /* project out the old subspace: * |\tilda phi_new> = |phi_new> - |phi_old><phi_old|phi_new> */ if (N__ > 0) { inner(num_sc__, *wfs__[idx_bra__], 0, N__, *wfs__[idx_ket__], N__, n__, o__, 0, 0); transform(pu__, -1.0, wfs__, 0, N__, o__, 0, 0, 1.0, wfs__, N__, n__); } //if (true) { // inner(num_sc__, *wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, o__, 0, 0); // linalg<CPU>::geqrf(n__, n__, o__, 0, 0); // auto diag = o__.get_diag(n__); // if (o__.blacs_grid().comm().rank() == 0) { // printf("diagonal of R-factor\n"); // for (int i = 0; i < n__; i++) { // if (std::abs(diag[i]) < 1e-6) { // std::cout << "small norm: " << i << " " << diag[i] << std::endl; // } // } // } // //std::vector<double> eo(n__); // //dmatrix<T> evec(o__.num_rows(), o__.num_cols(), o__.blacs_grid(), o__.bs_row(), o__.bs_col()); // //Eigenproblem_elpa1 evs(o__.blacs_grid(), o__.bs_row()); // //evs.solve(n__, n__, o__.template at<CPU>(), o__.ld(), eo.data(), evec.template at<CPU>(), evec.ld(), // // o__.num_rows_local(), o__.num_cols_local()); // //if (o__.blacs_grid().comm().rank() == 0) { // // std::cout << "smallest ev of the new n x x block: " << eo[0] << std::endl; // //} //} /* orthogonalize new n__ x n__ block */ inner(num_sc__, *wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, o__, 0, 0); /* single MPI rank */ if (o__.blacs_grid().comm().size() == 1) { bool use_magma{false}; #if defined(__GPU) && defined(__MAGMA) if (pu__ == GPU) { use_magma = true; } #endif if (use_magma) { #ifdef __GPU /* Cholesky factorization */ if (int info = linalg<GPU>::potrf(n__, o__.template at<GPU>(), o__.ld())) { std::stringstream s; s << "error in GPU factorization, info = " << info; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<GPU>::trtri(n__, o__.template at<GPU>(), o__.ld())) { TERMINATE("error in inversion"); } #endif } else { /* CPU version */ //check_hermitian("OVLP", o__, n__); //o__.serialize("overlap.dat", n__); /* Cholesky factorization */ if (int info = linalg<CPU>::potrf(n__, &o__(0, 0), o__.ld())) { std::stringstream s; s << "error in factorization, info = " << info << std::endl << "number of existing states: " << N__ << std::endl << "number of new states: " << n__ << std::endl << "number of wave_functions: " << wfs__.size() << std::endl << "idx_bra: " << idx_bra__ << " " << "idx_ket:" << idx_ket__; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<CPU>::trtri(n__, &o__(0, 0), o__.ld())) { TERMINATE("error in inversion"); } if (pu__ == GPU) { #ifdef __GPU acc::copyin(o__.template at<GPU>(), o__.ld(), o__.template at<CPU>(), o__.ld(), n__, n__); #endif } } for (int isc = 0; isc < num_sc__; isc++) { /* CPU version */ if (pu__ == CPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { /* alias for spin component of wave-functions */ auto& wfsc = e->component(isc); /* wave functions are complex, transformation matrix is complex */ if (std::is_same<T, double_complex>::value) { linalg<CPU>::trmm('R', 'U', 'N', wfsc.pw_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), wfsc.pw_coeffs().prime().at<CPU>(0, N__), e->component(isc).pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', wfsc.mt_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), wfsc.mt_coeffs().prime().at<CPU>(0, N__), wfsc.mt_coeffs().prime().ld()); } } /* wave functions are real (psi(G) = psi^{*}(-G)), transformation matrix is real */ if (std::is_same<T, double>::value) { linalg<CPU>::trmm('R', 'U', 'N', 2 * wfsc.pw_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.pw_coeffs().prime().at<CPU>(0, N__)), 2 * wfsc.pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', 2 * wfsc.mt_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.mt_coeffs().prime().at<CPU>(0, N__)), 2 * wfsc.mt_coeffs().prime().ld()); } } } } #ifdef __GPU if (pu__ == GPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { auto& wfsc = e->component(isc); if (std::is_same<T, double_complex>::value) { double_complex alpha(1, 0); linalg<GPU>::trmm('R', 'U', 'N', wfsc.pw_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), wfsc.pw_coeffs().prime().at<GPU>(0, N__), wfsc.pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', wfsc.mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), wfsc.mt_coeffs().prime().at<GPU>(0, N__), wfsc.mt_coeffs().prime().ld()); } /* alpha should not go out of the scope, so wait */ acc::sync_stream(-1); } if (std::is_same<T, double>::value) { double alpha{1}; linalg<GPU>::trmm('R', 'U', 'N', 2 * wfsc.pw_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.pw_coeffs().prime().at<GPU>(0, N__)), 2 * wfsc.pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', 2 * wfsc.mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.mt_coeffs().prime().at<GPU>(0, N__)), 2 * wfsc.mt_coeffs().prime().ld()); } acc::sync_stream(-1); } } acc::sync_stream(-1); } #endif } } else { /* parallel transformation */ sddk::timer t1("sddk::wave_functions::orthogonalize|potrf"); if (int info = linalg<CPU>::potrf(n__, o__)) { std::stringstream s; s << "error in factorization, info = " << info; TERMINATE(s); } t1.stop(); sddk::timer t2("sddk::wave_functions::orthogonalize|trtri"); if (linalg<CPU>::trtri(n__, o__)) { TERMINATE("error in inversion"); } t2.stop(); /* o is upper triangular matrix */ for (int i = 0; i < n__; i++) { for (int j = i + 1; j < n__; j++) { o__.set(j, i, 0); } } /* phi is transformed into phi, so we can't use it as the output buffer; use tmp instead and then overwrite phi */ for (auto& e: wfs__) { for (int isc = 0; isc < num_sc__; isc++) { transform(pu__, e->component(isc), N__, n__, o__, 0, 0, tmp__, 0, n__); e->component(isc).copy_from(tmp__, 0, n__, N__, pu__); } } } }
inline void Band::set_fv_h_o<GPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__, Periodic_function<double>* effective_potential__, dmatrix<double_complex>& h__, dmatrix<double_complex>& o__) const { runtime::Timer t("sirius::Band::set_fv_h_o"); runtime::Timer t2("sirius::Band::set_fv_h_o|alloc"); h__.zero(); h__.allocate(memory_t::device); h__.zero_on_device(); o__.zero(); o__.allocate(memory_t::device); o__.zero_on_device(); double_complex zone(1, 0); int num_atoms_in_block = 2 * omp_get_max_threads(); int nblk = unit_cell_.num_atoms() / num_atoms_in_block + std::min(1, unit_cell_.num_atoms() % num_atoms_in_block); DUMP("nblk: %i", nblk); int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size(); DUMP("max_mt_aw: %i", max_mt_aw); mdarray<double_complex, 3> alm_row(kp__->num_gkvec_row(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); mdarray<double_complex, 3> alm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); mdarray<double_complex, 3> halm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); t2.stop(); runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm"); for (int iblk = 0; iblk < nblk; iblk++) { int num_mt_aw = 0; std::vector<int> offsets(num_atoms_in_block); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); offsets[ialoc] = num_mt_aw; num_mt_aw += type.mt_aw_basis_size(); } int s = iblk % 2; #pragma omp parallel { int tid = omp_get_thread_num(); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { if (ia % omp_get_num_threads() == tid) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc], s), alm_row.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_row(), type.mt_aw_basis_size()); mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc], s), alm_col.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_col(), type.mt_aw_basis_size()); mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc], s), halm_col.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_col(), type.mt_aw_basis_size()); kp__->alm_coeffs_row()->generate(ia, alm_row_tmp); for (int xi = 0; xi < type.mt_aw_basis_size(); xi++) { for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) { alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi)); } } alm_row_tmp.async_copy_to_device(tid); kp__->alm_coeffs_col()->generate(ia, alm_col_tmp); alm_col_tmp.async_copy_to_device(tid); apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp); halm_col_tmp.async_copy_to_device(tid); /* setup apw-lo and lo-apw blocks */ set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__); } } acc::sync_stream(tid); } acc::sync_stream(omp_get_max_threads()); linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, alm_row.at<GPU>(0, 0, s), alm_row.ld(), alm_col.at<GPU>(0, 0, s), alm_col.ld(), &zone, o__.at<GPU>(), o__.ld(), omp_get_max_threads()); linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, alm_row.at<GPU>(0, 0, s), alm_row.ld(), halm_col.at<GPU>(0, 0, s), halm_col.ld(), &zone, h__.at<GPU>(), h__.ld(), omp_get_max_threads()); } acc::copyout(h__.at<CPU>(), h__.ld(), h__.at<GPU>(), h__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col()); acc::copyout(o__.at<CPU>(), o__.ld(), o__.at<GPU>(), o__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col()); double tval = t1.stop(); if (kp__->comm().rank() == 0) { DUMP("effective zgemm performance: %12.6f GFlops", 2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval); } /* add interstitial contributon */ set_fv_h_o_it(kp__, effective_potential__, h__, o__); /* setup lo-lo block */ set_fv_h_o_lo_lo(kp__, h__, o__); h__.deallocate_on_device(); o__.deallocate_on_device(); }