cltudecomp(int lb, int ub):indx(lb, ub), indx2(lb, ub) { ivector iv(lb + 1, ub); iv.fill_seqadd(lb, 1); L.allocate(lb + 1, ub, lb, iv); ivector iv1(lb, ub); iv1.fill_seqadd(lb, 1); U.allocate(lb, ub, lb, iv1); indx2.fill_seqadd(lb, 1); }
void allocate(int lb, int ub) { indx.allocate(lb, ub); indx2.allocate(lb, ub); ivector iv(lb + 1, ub); iv.fill_seqadd(lb, 1); L.allocate(lb + 1, ub, lb, iv); ivector iv1(lb, ub); iv1.fill_seqadd(lb, 1); U.allocate(lb, ub, lb, iv1); indx2.fill_seqadd(lb, 1); }
inline void Band::set_fv_h_o<GPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__, Periodic_function<double>* effective_potential__, dmatrix<double_complex>& h__, dmatrix<double_complex>& o__) const { runtime::Timer t("sirius::Band::set_fv_h_o"); runtime::Timer t2("sirius::Band::set_fv_h_o|alloc"); h__.zero(); h__.allocate(memory_t::device); h__.zero_on_device(); o__.zero(); o__.allocate(memory_t::device); o__.zero_on_device(); double_complex zone(1, 0); int num_atoms_in_block = 2 * omp_get_max_threads(); int nblk = unit_cell_.num_atoms() / num_atoms_in_block + std::min(1, unit_cell_.num_atoms() % num_atoms_in_block); DUMP("nblk: %i", nblk); int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size(); DUMP("max_mt_aw: %i", max_mt_aw); mdarray<double_complex, 3> alm_row(kp__->num_gkvec_row(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); mdarray<double_complex, 3> alm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); mdarray<double_complex, 3> halm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); t2.stop(); runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm"); for (int iblk = 0; iblk < nblk; iblk++) { int num_mt_aw = 0; std::vector<int> offsets(num_atoms_in_block); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); offsets[ialoc] = num_mt_aw; num_mt_aw += type.mt_aw_basis_size(); } int s = iblk % 2; #pragma omp parallel { int tid = omp_get_thread_num(); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { if (ia % omp_get_num_threads() == tid) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc], s), alm_row.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_row(), type.mt_aw_basis_size()); mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc], s), alm_col.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_col(), type.mt_aw_basis_size()); mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc], s), halm_col.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_col(), type.mt_aw_basis_size()); kp__->alm_coeffs_row()->generate(ia, alm_row_tmp); for (int xi = 0; xi < type.mt_aw_basis_size(); xi++) { for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) { alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi)); } } alm_row_tmp.async_copy_to_device(tid); kp__->alm_coeffs_col()->generate(ia, alm_col_tmp); alm_col_tmp.async_copy_to_device(tid); apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp); halm_col_tmp.async_copy_to_device(tid); /* setup apw-lo and lo-apw blocks */ set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__); } } acc::sync_stream(tid); } acc::sync_stream(omp_get_max_threads()); linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, alm_row.at<GPU>(0, 0, s), alm_row.ld(), alm_col.at<GPU>(0, 0, s), alm_col.ld(), &zone, o__.at<GPU>(), o__.ld(), omp_get_max_threads()); linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, alm_row.at<GPU>(0, 0, s), alm_row.ld(), halm_col.at<GPU>(0, 0, s), halm_col.ld(), &zone, h__.at<GPU>(), h__.ld(), omp_get_max_threads()); } acc::copyout(h__.at<CPU>(), h__.ld(), h__.at<GPU>(), h__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col()); acc::copyout(o__.at<CPU>(), o__.ld(), o__.at<GPU>(), o__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col()); double tval = t1.stop(); if (kp__->comm().rank() == 0) { DUMP("effective zgemm performance: %12.6f GFlops", 2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval); } /* add interstitial contributon */ set_fv_h_o_it(kp__, effective_potential__, h__, o__); /* setup lo-lo block */ set_fv_h_o_lo_lo(kp__, h__, o__); h__.deallocate_on_device(); o__.deallocate_on_device(); }