Example #1
0
inline void orthogonalize(int N__,
                          int n__,
                          std::vector<wave_functions*> wfs__,
                          int idx_bra__,
                          int idx_ket__,
                          dmatrix<T>& o__,
                          wave_functions& tmp__)
{
    PROFILE("sddk::wave_functions::orthogonalize");

    auto pu = wfs__[0]->pu();
        
    /* project out the old subspace:
     * |\tilda phi_new> = |phi_new> - |phi_old><phi_old|phi_new> */
    if (N__ > 0) {
        inner(*wfs__[idx_bra__], 0, N__, *wfs__[idx_ket__], N__, n__, 0.0, o__, 0, 0);
        transform(pu, -1.0, wfs__, 0, N__, o__, 0, 0, 1.0, wfs__, N__, n__);
    }

    /* orthogonalize new n__ x n__ block */
    inner(*wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, 0.0, o__, 0, 0);

    /* single MPI rank */
    if (o__.blacs_grid().comm().size() == 1) {
        bool use_magma{false};
        #if defined(__GPU) && defined(__MAGMA)
        if (pu == GPU) {
            use_magma = true;
        }
        #endif

        if (use_magma) {
            #ifdef __GPU
            /* Cholesky factorization */
            if (int info = linalg<GPU>::potrf(n__, o__.template at<GPU>(), o__.ld())) {
                std::stringstream s;
                s << "error in GPU factorization, info = " << info;
                TERMINATE(s);
            }
            /* inversion of triangular matrix */
            if (linalg<GPU>::trtri(n__, o__.template at<GPU>(), o__.ld())) {
                TERMINATE("error in inversion");
            }
            #endif
        } else { /* CPU version */
            //check_hermitian("OVLP", o__, n__);
            //o__.serialize("overlap.dat", n__);
            /* Cholesky factorization */
            if (int info = linalg<CPU>::potrf(n__, &o__(0, 0), o__.ld())) {
                std::stringstream s;
                s << "error in factorization, info = " << info << std::endl
                  << "number of existing states: " << N__ << std::endl
                  << "number of new states: " << n__ << std::endl
                  << "number of wave_functions: " << wfs__.size() << std::endl
                  << "idx_bra: " << idx_bra__ << " " << "idx_ket:" << idx_ket__;
                TERMINATE(s);
            }
            /* inversion of triangular matrix */
            if (linalg<CPU>::trtri(n__, &o__(0, 0), o__.ld())) {
                TERMINATE("error in inversion");
            }
            if (pu == GPU) {
                #ifdef __GPU
                acc::copyin(o__.template at<GPU>(), o__.ld(), o__.template at<CPU>(), o__.ld(), n__, n__);
                #endif
            }
        }

        /* CPU version */
        if (pu == CPU) {
            /* multiplication by triangular matrix */
            for (auto& e: wfs__) {
                /* wave functions are complex, transformation matrix is complex */
                if (std::is_same<T, double_complex>::value) {
                    linalg<CPU>::trmm('R', 'U', 'N', e->pw_coeffs().num_rows_loc(), n__, double_complex(1, 0),
                                      reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(),
                                      e->pw_coeffs().prime().at<CPU>(0, N__), e->pw_coeffs().prime().ld());

                    if (e->has_mt() && e->mt_coeffs().num_rows_loc()) {
                        linalg<CPU>::trmm('R', 'U', 'N', e->mt_coeffs().num_rows_loc(), n__, double_complex(1, 0),
                                          reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(),
                                          e->mt_coeffs().prime().at<CPU>(0, N__), e->mt_coeffs().prime().ld());
                    }
                }
                /* wave functions are real (psi(G) = psi^{*}(-G)), transformation matrix is real */
                if (std::is_same<T, double>::value) {
                    linalg<CPU>::trmm('R', 'U', 'N', 2 * e->pw_coeffs().num_rows_loc(), n__, 1.0,
                                      reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(),
                                      reinterpret_cast<double*>(e->pw_coeffs().prime().at<CPU>(0, N__)), 2 * e->pw_coeffs().prime().ld());

                    if (e->has_mt() && e->mt_coeffs().num_rows_loc()) {
                        linalg<CPU>::trmm('R', 'U', 'N', 2 * e->mt_coeffs().num_rows_loc(), n__, 1.0,
                                          reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(),
                                          reinterpret_cast<double*>(e->mt_coeffs().prime().at<CPU>(0, N__)), 2 * e->mt_coeffs().prime().ld());
                    }
                }
            }
        }
        #ifdef __GPU
        if (pu == GPU) {
            /* multiplication by triangular matrix */
            for (auto& e: wfs__) {
                if (std::is_same<T, double_complex>::value) {
                    double_complex alpha(1, 0);

                    linalg<GPU>::trmm('R', 'U', 'N', e->pw_coeffs().num_rows_loc(), n__, &alpha,
                                      reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(),
                                      e->pw_coeffs().prime().at<GPU>(0, N__), e->pw_coeffs().prime().ld());

                    if (e->has_mt() && e->mt_coeffs().num_rows_loc()) {
                        linalg<GPU>::trmm('R', 'U', 'N', e->mt_coeffs().num_rows_loc(), n__, &alpha,
                                          reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(),
                                          e->mt_coeffs().prime().at<GPU>(0, N__), e->mt_coeffs().prime().ld());
                    }
                    /* alpha should not go out of the scope, so wait */
                    acc::sync_stream(-1);
                }
                if (std::is_same<T, double>::value) {
                    double alpha{1};

                    linalg<GPU>::trmm('R', 'U', 'N', 2 * e->pw_coeffs().num_rows_loc(), n__, &alpha,
                                      reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(),
                                      reinterpret_cast<double*>(e->pw_coeffs().prime().at<GPU>(0, N__)), 2 * e->pw_coeffs().prime().ld());

                    if (e->has_mt() && e->mt_coeffs().num_rows_loc()) {
                        linalg<GPU>::trmm('R', 'U', 'N', 2 * e->mt_coeffs().num_rows_loc(), n__, &alpha,
                                          reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(),
                                          reinterpret_cast<double*>(e->mt_coeffs().prime().at<GPU>(0, N__)), 2 * e->mt_coeffs().prime().ld());
                    }
                    acc::sync_stream(-1);
                }
            }
            acc::sync_stream(-1);
        }
        #endif
    } else { /* parallel transformation */
        sddk::timer t1("sddk::wave_functions::orthogonalize|potrf");
        if (int info = linalg<CPU>::potrf(n__, o__)) {
            std::stringstream s;
            s << "error in factorization, info = " << info;
            TERMINATE(s);
        }
        t1.stop();

        sddk::timer t2("sddk::wave_functions::orthogonalize|trtri");
        if (linalg<CPU>::trtri(n__, o__)) {
            TERMINATE("error in inversion");
        }
        t2.stop();

        /* o is upper triangular matrix */
        for (int i = 0; i < n__; i++) {
            for (int j = i + 1; j < n__; j++) {
                o__.set(j, i, 0);
            }
        }

        /* phi is transformed into phi, so we can't use it as the output buffer; use tmp instead and then overwrite phi */
        for (auto& e: wfs__) {
            transform(pu, *e, N__, n__, o__, 0, 0, tmp__, 0, n__);
            e->copy_from(tmp__, 0, n__, N__, pu);
        }
    }
}
inline void Band::set_fv_h_o<CPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__,
                                                                                        Periodic_function<double>* effective_potential__,
                                                                                        dmatrix<double_complex>& h__,
                                                                                        dmatrix<double_complex>& o__) const
{
    PROFILE_WITH_TIMER("sirius::Band::set_fv_h_o");
    
    h__.zero();
    o__.zero();

    double_complex zone(1, 0);
    
    int num_atoms_in_block = 2 * omp_get_max_threads();
    int nblk = unit_cell_.num_atoms() / num_atoms_in_block +
               std::min(1, unit_cell_.num_atoms() % num_atoms_in_block);
    DUMP("nblk: %i", nblk);

    int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size();
    DUMP("max_mt_aw: %i", max_mt_aw);

    mdarray<double_complex, 2> alm_row(kp__->num_gkvec_row(), max_mt_aw);
    mdarray<double_complex, 2> alm_col(kp__->num_gkvec_col(), max_mt_aw);
    mdarray<double_complex, 2> halm_col(kp__->num_gkvec_col(), max_mt_aw);

    runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm");
    for (int iblk = 0; iblk < nblk; iblk++)
    {
        int num_mt_aw = 0;
        std::vector<int> offsets(num_atoms_in_block);
        for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++)
        {
            auto& atom = unit_cell_.atom(ia);
            auto& type = atom.type();
            offsets[ia - iblk * num_atoms_in_block] = num_mt_aw;
            num_mt_aw += type.mt_aw_basis_size();
        }
        
        #ifdef __PRINT_OBJECT_CHECKSUM
        alm_row.zero();
        alm_col.zero();
        halm_col.zero();
        #endif

        #pragma omp parallel
        {
            int tid = omp_get_thread_num();
            for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++)
            {
                if (ia % omp_get_num_threads() == tid)
                {
                    int ialoc = ia - iblk * num_atoms_in_block;
                    auto& atom = unit_cell_.atom(ia);
                    auto& type = atom.type();

                    mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_row(), type.mt_aw_basis_size());
                    mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size());
                    mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size());

                    kp__->alm_coeffs_row()->generate(ia, alm_row_tmp);
                    for (int xi = 0; xi < type.mt_aw_basis_size(); xi++)
                    {
                        for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi));
                    }
                    kp__->alm_coeffs_col()->generate(ia, alm_col_tmp);
                    apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp);

                    /* setup apw-lo and lo-apw blocks */
                    set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__);
                }
            }
        }
        #ifdef __PRINT_OBJECT_CHECKSUM
        double_complex z1 = alm_row.checksum();
        double_complex z2 = alm_col.checksum();
        double_complex z3 = halm_col.checksum();
        DUMP("checksum(alm_row): %18.10f %18.10f", std::real(z1), std::imag(z1));
        DUMP("checksum(alm_col): %18.10f %18.10f", std::real(z2), std::imag(z2));
        DUMP("checksum(halm_col): %18.10f %18.10f", std::real(z3), std::imag(z3));
        #endif
        linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone,
                          alm_row.at<CPU>(), alm_row.ld(), alm_col.at<CPU>(), alm_col.ld(), zone, 
                          o__.at<CPU>(), o__.ld());

        linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, 
                          alm_row.at<CPU>(), alm_row.ld(), halm_col.at<CPU>(), halm_col.ld(), zone,
                          h__.at<CPU>(), h__.ld());
    }
    double tval = t1.stop();
    if (kp__->comm().rank() == 0)
    {
        DUMP("effective zgemm performance: %12.6f GFlops",
             2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval);
    }

    /* add interstitial contributon */
    set_fv_h_o_it(kp__, effective_potential__, h__, o__);

    /* setup lo-lo block */
    set_fv_h_o_lo_lo(kp__, h__, o__);
}
Example #3
0
inline void orthogonalize(device_t                     pu__,
                          int                          num_sc__,
                          int                          N__,
                          int                          n__,
                          std::vector<Wave_functions*> wfs__,
                          int                          idx_bra__,
                          int                          idx_ket__,
                          dmatrix<T>&                  o__,
                          wave_functions&              tmp__)
{
    PROFILE("sddk::wave_functions::orthogonalize");

    /* project out the old subspace:
     * |\tilda phi_new> = |phi_new> - |phi_old><phi_old|phi_new> */
    if (N__ > 0) {
        inner(num_sc__, *wfs__[idx_bra__], 0, N__, *wfs__[idx_ket__], N__, n__, o__, 0, 0);
        transform(pu__, -1.0, wfs__, 0, N__, o__, 0, 0, 1.0, wfs__, N__, n__);
    }

    //if (true) {

    //    inner(num_sc__, *wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, o__, 0, 0);

    //    linalg<CPU>::geqrf(n__, n__, o__, 0, 0);
    //    auto diag = o__.get_diag(n__);
    //    if (o__.blacs_grid().comm().rank() == 0) {
    //        printf("diagonal of R-factor\n");
    //        for (int i = 0; i < n__; i++) {
    //            if (std::abs(diag[i]) < 1e-6) {
    //                std::cout << "small norm: " << i << " " << diag[i] << std::endl;
    //            }
    //        }
    //    }

    //    //std::vector<double> eo(n__);
    //    //dmatrix<T> evec(o__.num_rows(), o__.num_cols(), o__.blacs_grid(), o__.bs_row(), o__.bs_col());

    //    //Eigenproblem_elpa1 evs(o__.blacs_grid(), o__.bs_row());
    //    //evs.solve(n__, n__, o__.template at<CPU>(), o__.ld(), eo.data(), evec.template at<CPU>(), evec.ld(),
    //    //          o__.num_rows_local(), o__.num_cols_local());

    //    //if (o__.blacs_grid().comm().rank() == 0) { 
    //    //    std::cout << "smallest ev of the new n x x block: " << eo[0] << std::endl;
    //    //}
    //}

    /* orthogonalize new n__ x n__ block */
    inner(num_sc__, *wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, o__, 0, 0);

    /* single MPI rank */
    if (o__.blacs_grid().comm().size() == 1) {
        bool use_magma{false};
        #if defined(__GPU) && defined(__MAGMA)
        if (pu__ == GPU) {
            use_magma = true;
        }
        #endif

        if (use_magma) {
            #ifdef __GPU
            /* Cholesky factorization */
            if (int info = linalg<GPU>::potrf(n__, o__.template at<GPU>(), o__.ld())) {
                std::stringstream s;
                s << "error in GPU factorization, info = " << info;
                TERMINATE(s);
            }
            /* inversion of triangular matrix */
            if (linalg<GPU>::trtri(n__, o__.template at<GPU>(), o__.ld())) {
                TERMINATE("error in inversion");
            }
            #endif
        } else { /* CPU version */
            //check_hermitian("OVLP", o__, n__);
            //o__.serialize("overlap.dat", n__);
            /* Cholesky factorization */
            if (int info = linalg<CPU>::potrf(n__, &o__(0, 0), o__.ld())) {
                std::stringstream s;
                s << "error in factorization, info = " << info << std::endl
                  << "number of existing states: " << N__ << std::endl
                  << "number of new states: " << n__ << std::endl
                  << "number of wave_functions: " << wfs__.size() << std::endl
                  << "idx_bra: " << idx_bra__ << " " << "idx_ket:" << idx_ket__;
                TERMINATE(s);
            }
            /* inversion of triangular matrix */
            if (linalg<CPU>::trtri(n__, &o__(0, 0), o__.ld())) {
                TERMINATE("error in inversion");
            }
            if (pu__ == GPU) {
                #ifdef __GPU
                acc::copyin(o__.template at<GPU>(), o__.ld(), o__.template at<CPU>(), o__.ld(), n__, n__);
                #endif
            }
        }

        for (int isc = 0; isc < num_sc__; isc++) {
            /* CPU version */
            if (pu__ == CPU) {
                /* multiplication by triangular matrix */
                for (auto& e: wfs__) {
                    /* alias for spin component of wave-functions */
                    auto& wfsc = e->component(isc);
                    /* wave functions are complex, transformation matrix is complex */
                    if (std::is_same<T, double_complex>::value) {
                        linalg<CPU>::trmm('R', 'U', 'N', wfsc.pw_coeffs().num_rows_loc(), n__, double_complex(1, 0),
                                          reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(),
                                          wfsc.pw_coeffs().prime().at<CPU>(0, N__), e->component(isc).pw_coeffs().prime().ld());

                        if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) {
                            linalg<CPU>::trmm('R', 'U', 'N', wfsc.mt_coeffs().num_rows_loc(), n__, double_complex(1, 0),
                                              reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(),
                                              wfsc.mt_coeffs().prime().at<CPU>(0, N__), wfsc.mt_coeffs().prime().ld());
                        }
                    }
                    /* wave functions are real (psi(G) = psi^{*}(-G)), transformation matrix is real */
                    if (std::is_same<T, double>::value) {
                        linalg<CPU>::trmm('R', 'U', 'N', 2 * wfsc.pw_coeffs().num_rows_loc(), n__, 1.0,
                                          reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(),
                                          reinterpret_cast<double*>(wfsc.pw_coeffs().prime().at<CPU>(0, N__)), 2 * wfsc.pw_coeffs().prime().ld());

                        if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) {
                            linalg<CPU>::trmm('R', 'U', 'N', 2 * wfsc.mt_coeffs().num_rows_loc(), n__, 1.0,
                                              reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(),
                                              reinterpret_cast<double*>(wfsc.mt_coeffs().prime().at<CPU>(0, N__)), 2 * wfsc.mt_coeffs().prime().ld());
                        }
                    }
                }
            }
            #ifdef __GPU
            if (pu__ == GPU) {
                /* multiplication by triangular matrix */
                for (auto& e: wfs__) {
                    auto& wfsc = e->component(isc);
                    if (std::is_same<T, double_complex>::value) {
                        double_complex alpha(1, 0);

                        linalg<GPU>::trmm('R', 'U', 'N', wfsc.pw_coeffs().num_rows_loc(), n__, &alpha,
                                          reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(),
                                          wfsc.pw_coeffs().prime().at<GPU>(0, N__), wfsc.pw_coeffs().prime().ld());

                        if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) {
                            linalg<GPU>::trmm('R', 'U', 'N', wfsc.mt_coeffs().num_rows_loc(), n__, &alpha,
                                              reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(),
                                              wfsc.mt_coeffs().prime().at<GPU>(0, N__), wfsc.mt_coeffs().prime().ld());
                        }
                        /* alpha should not go out of the scope, so wait */
                        acc::sync_stream(-1);
                    }
                    if (std::is_same<T, double>::value) {
                        double alpha{1};

                        linalg<GPU>::trmm('R', 'U', 'N', 2 * wfsc.pw_coeffs().num_rows_loc(), n__, &alpha,
                                          reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(),
                                          reinterpret_cast<double*>(wfsc.pw_coeffs().prime().at<GPU>(0, N__)), 2 * wfsc.pw_coeffs().prime().ld());

                        if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) {
                            linalg<GPU>::trmm('R', 'U', 'N', 2 * wfsc.mt_coeffs().num_rows_loc(), n__, &alpha,
                                              reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(),
                                              reinterpret_cast<double*>(wfsc.mt_coeffs().prime().at<GPU>(0, N__)), 2 * wfsc.mt_coeffs().prime().ld());
                        }
                        acc::sync_stream(-1);
                    }
                }
                acc::sync_stream(-1);
            }
            #endif
        }
    } else { /* parallel transformation */
        sddk::timer t1("sddk::wave_functions::orthogonalize|potrf");
        if (int info = linalg<CPU>::potrf(n__, o__)) {
            std::stringstream s;
            s << "error in factorization, info = " << info;
            TERMINATE(s);
        }
        t1.stop();

        sddk::timer t2("sddk::wave_functions::orthogonalize|trtri");
        if (linalg<CPU>::trtri(n__, o__)) {
            TERMINATE("error in inversion");
        }
        t2.stop();

        /* o is upper triangular matrix */
        for (int i = 0; i < n__; i++) {
            for (int j = i + 1; j < n__; j++) {
                o__.set(j, i, 0);
            }
        }

        /* phi is transformed into phi, so we can't use it as the output buffer; use tmp instead and then overwrite phi */
        for (auto& e: wfs__) {
            for (int isc = 0; isc < num_sc__; isc++) {
                transform(pu__, e->component(isc), N__, n__, o__, 0, 0, tmp__, 0, n__);
                e->component(isc).copy_from(tmp__, 0, n__, N__, pu__);
            }
        }
    }
}
inline void Band::set_fv_h_o<GPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__,
                                                                                        Periodic_function<double>* effective_potential__,
                                                                                        dmatrix<double_complex>& h__,
                                                                                        dmatrix<double_complex>& o__) const
{
    runtime::Timer t("sirius::Band::set_fv_h_o");
    
    runtime::Timer t2("sirius::Band::set_fv_h_o|alloc");
    h__.zero();
    h__.allocate(memory_t::device);
    h__.zero_on_device();

    o__.zero();
    o__.allocate(memory_t::device);
    o__.zero_on_device();

    double_complex zone(1, 0);

    int num_atoms_in_block = 2 * omp_get_max_threads();
    int nblk = unit_cell_.num_atoms() / num_atoms_in_block +
               std::min(1, unit_cell_.num_atoms() % num_atoms_in_block);
    DUMP("nblk: %i", nblk);

    int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size();
    DUMP("max_mt_aw: %i", max_mt_aw);

    mdarray<double_complex, 3> alm_row(kp__->num_gkvec_row(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device);

    mdarray<double_complex, 3> alm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device);

    mdarray<double_complex, 3> halm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device);
    t2.stop();

    runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm");
    for (int iblk = 0; iblk < nblk; iblk++) {
        int num_mt_aw = 0;
        std::vector<int> offsets(num_atoms_in_block);
        for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) {
            int ialoc = ia - iblk * num_atoms_in_block;
            auto& atom = unit_cell_.atom(ia);
            auto& type = atom.type();
            offsets[ialoc] = num_mt_aw;
            num_mt_aw += type.mt_aw_basis_size();
        }

        int s = iblk % 2;
            
        #pragma omp parallel
        {
            int tid = omp_get_thread_num();
            for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) {
                if (ia % omp_get_num_threads() == tid) {
                    int ialoc = ia - iblk * num_atoms_in_block;
                    auto& atom = unit_cell_.atom(ia);
                    auto& type = atom.type();

                    mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc], s),
                                                           alm_row.at<GPU>(0, offsets[ialoc], s),
                                                           kp__->num_gkvec_row(), type.mt_aw_basis_size());

                    mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc], s),
                                                           alm_col.at<GPU>(0, offsets[ialoc], s),
                                                           kp__->num_gkvec_col(), type.mt_aw_basis_size());
                    
                    mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc], s),
                                                            halm_col.at<GPU>(0, offsets[ialoc], s),
                                                            kp__->num_gkvec_col(), type.mt_aw_basis_size());

                    kp__->alm_coeffs_row()->generate(ia, alm_row_tmp);
                    for (int xi = 0; xi < type.mt_aw_basis_size(); xi++) {
                        for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) {
                            alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi));
                        }
                    }
                    alm_row_tmp.async_copy_to_device(tid);

                    kp__->alm_coeffs_col()->generate(ia, alm_col_tmp);
                    alm_col_tmp.async_copy_to_device(tid);

                    apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp);
                    halm_col_tmp.async_copy_to_device(tid);

                    /* setup apw-lo and lo-apw blocks */
                    set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__);
                }
            }
            acc::sync_stream(tid);
        }
        acc::sync_stream(omp_get_max_threads());
        linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, 
                          alm_row.at<GPU>(0, 0, s), alm_row.ld(), alm_col.at<GPU>(0, 0, s), alm_col.ld(), &zone, 
                          o__.at<GPU>(), o__.ld(), omp_get_max_threads());

        linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, 
                          alm_row.at<GPU>(0, 0, s), alm_row.ld(), halm_col.at<GPU>(0, 0, s), halm_col.ld(), &zone,
                          h__.at<GPU>(), h__.ld(), omp_get_max_threads());
    }

    acc::copyout(h__.at<CPU>(), h__.ld(), h__.at<GPU>(), h__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col());
    acc::copyout(o__.at<CPU>(), o__.ld(), o__.at<GPU>(), o__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col());
    
    double tval = t1.stop();
    if (kp__->comm().rank() == 0) {
        DUMP("effective zgemm performance: %12.6f GFlops",
             2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval);
    }

    /* add interstitial contributon */
    set_fv_h_o_it(kp__, effective_potential__, h__, o__);

    /* setup lo-lo block */
    set_fv_h_o_lo_lo(kp__, h__, o__);

    h__.deallocate_on_device();
    o__.deallocate_on_device();
}