void Potential::generate_local_potential()
{
    PROFILE_WITH_TIMER("sirius::Potential::generate_local_potential");

    mdarray<double, 2> vloc_radial_integrals(unit_cell_.num_atom_types(), ctx_.gvec().num_shells());

    /* split G-shells between MPI ranks */
    splindex<block> spl_gshells(ctx_.gvec().num_shells(), comm_.size(), comm_.rank());

    #pragma omp parallel
    {
        /* splines for all atom types */
        std::vector< Spline<double> > sa(unit_cell_.num_atom_types());
        
        for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++)
            sa[iat] = Spline<double>(unit_cell_.atom_type(iat).radial_grid());
    
        #pragma omp for
        for (int igsloc = 0; igsloc < spl_gshells.local_size(); igsloc++)
        {
            int igs = spl_gshells[igsloc];

            for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++)
            {
                auto& atom_type = unit_cell_.atom_type(iat);

                if (igs == 0)
                {
                    for (int ir = 0; ir < atom_type.num_mt_points(); ir++) 
                    {
                        double x = atom_type.radial_grid(ir);
                        sa[iat][ir] = (x * atom_type.uspp().vloc[ir] + atom_type.zn()) * x;
                    }
                    vloc_radial_integrals(iat, igs) = sa[iat].interpolate().integrate(0);
                }
                else
                {
                    double g = ctx_.gvec().shell_len(igs);
                    double g2 = std::pow(g, 2);
                    for (int ir = 0; ir < atom_type.num_mt_points(); ir++) 
                    {
                        double x = atom_type.radial_grid(ir);
                        sa[iat][ir] = (x * atom_type.uspp().vloc[ir] + atom_type.zn() * gsl_sf_erf(x)) * std::sin(g * x);
                    }
                    vloc_radial_integrals(iat, igs) = (sa[iat].interpolate().integrate(0) / g - atom_type.zn() * std::exp(-g2 / 4) / g2);
                }
            }
        }
    }

    int ld = unit_cell_.num_atom_types();
    comm_.allgather(vloc_radial_integrals.at<CPU>(), ld * spl_gshells.global_offset(), ld * spl_gshells.local_size());

    auto v = unit_cell_.make_periodic_function(vloc_radial_integrals, ctx_.gvec());
    ctx_.fft().prepare(ctx_.gvec().partition());
    ctx_.fft().transform<1>(ctx_.gvec().partition(), &v[ctx_.gvec().partition().gvec_offset_fft()]);
    ctx_.fft().output(&local_potential_->f_rg(0));
    ctx_.fft().dismiss();
}
void Density::generate(K_set& ks__)
{
    PROFILE_WITH_TIMER("sirius::Density::generate");

    generate_valence(ks__);

    if (ctx_.full_potential())
    {
        generate_core_charge_density();

        /* add core contribution */
        for (int ialoc = 0; ialoc < (int)unit_cell_.spl_num_atoms().local_size(); ialoc++)
        {
            int ia = unit_cell_.spl_num_atoms(ialoc);
            for (int ir = 0; ir < unit_cell_.atom(ia).num_mt_points(); ir++)
                rho_->f_mt<local>(0, ir, ialoc) += unit_cell_.atom(ia).symmetry_class().core_charge_density(ir) / y00;
        }

        /* synchronize muffin-tin part */
        rho_->sync_mt();
        for (int j = 0; j < ctx_.num_mag_dims(); j++) magnetization_[j]->sync_mt();
    }
    
    double nel = 0;
    if (ctx_.full_potential())
    {
        std::vector<double> nel_mt;
        double nel_it;
        nel = rho_->integrate(nel_mt, nel_it);
    }
    else
    {
        nel = rho_->f_pw(0).real() * unit_cell_.omega();
    }

    if (std::abs(nel - unit_cell_.num_electrons()) > 1e-5)
    {
        std::stringstream s;
        s << "wrong charge density after k-point summation" << std::endl
          << "obtained value : " << nel << std::endl 
          << "target value : " << unit_cell_.num_electrons() << std::endl
          << "difference : " << fabs(nel - unit_cell_.num_electrons()) << std::endl;
        if (ctx_.full_potential())
        {
            s << "total core leakage : " << core_leakage();
            for (int ic = 0; ic < unit_cell_.num_atom_symmetry_classes(); ic++) 
                s << std::endl << "  atom class : " << ic << ", core leakage : " << core_leakage(ic);
        }
        WARNING(s);
    }

    #ifdef __PRINT_OBJECT_HASH
    DUMP("hash(rhomt): %16llX", rho_->f_mt().hash());
    DUMP("hash(rhoit): %16llX", rho_->f_it().hash());
    #endif

    //if (debug_level > 1) check_density_continuity_at_mt();
}
inline void Density::add_k_point_contribution_rg(K_point* kp__)
{
    PROFILE_WITH_TIMER("sirius::Density::add_k_point_contribution_rg");

    int nfv = ctx_.num_fv_states();
    double omega = unit_cell_.omega();

    mdarray<double, 2> density_rg(ctx_.fft().local_size(), ctx_.num_mag_dims() + 1);
    density_rg.zero();

    #ifdef __GPU
    if (ctx_.fft().hybrid()) {
        density_rg.allocate(memory_t::device);
        density_rg.zero_on_device();
    }
    #endif

    ctx_.fft().prepare(kp__->gkvec().partition());

    /* non-magnetic or collinear case */
    if (ctx_.num_mag_dims() != 3) {
        for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) {
            if (!kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().global_index_size()) {
                continue;
            }

            #pragma omp for schedule(dynamic, 1)
            for (int i = 0; i < kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().local_size(); i++) {
                int j = kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col()[i];
                double w = kp__->band_occupancy(j + ispn * nfv) * kp__->weight() / omega;

                /* transform to real space; in case of GPU wave-function stays in GPU memory */
                if (ctx_.fft().gpu_only()) {
                    ctx_.fft().transform<1>(kp__->gkvec().partition(),
                                            kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<GPU>(0, i));
                } else {
                    ctx_.fft().transform<1>(kp__->gkvec().partition(),
                                            kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i));
                }

                if (ctx_.fft().hybrid()) {
                    #ifdef __GPU
                    update_density_rg_1_gpu(ctx_.fft().local_size(), ctx_.fft().buffer<GPU>(), w, density_rg.at<GPU>(0, ispn));
                    #else
                    TERMINATE_NO_GPU
                    #endif
                } else {
                    #pragma omp parallel for
                    for (int ir = 0; ir < ctx_.fft().local_size(); ir++) {
                        auto z = ctx_.fft().buffer(ir);
                        density_rg(ir, ispn) += w * (std::pow(z.real(), 2) + std::pow(z.imag(), 2));
                    }
                }
            }
        }
inline void Band::set_fv_h_o<CPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__,
                                                                                        Periodic_function<double>* effective_potential__,
                                                                                        dmatrix<double_complex>& h__,
                                                                                        dmatrix<double_complex>& o__) const
{
    PROFILE_WITH_TIMER("sirius::Band::set_fv_h_o");
    
    h__.zero();
    o__.zero();

    double_complex zone(1, 0);
    
    int num_atoms_in_block = 2 * omp_get_max_threads();
    int nblk = unit_cell_.num_atoms() / num_atoms_in_block +
               std::min(1, unit_cell_.num_atoms() % num_atoms_in_block);
    DUMP("nblk: %i", nblk);

    int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size();
    DUMP("max_mt_aw: %i", max_mt_aw);

    mdarray<double_complex, 2> alm_row(kp__->num_gkvec_row(), max_mt_aw);
    mdarray<double_complex, 2> alm_col(kp__->num_gkvec_col(), max_mt_aw);
    mdarray<double_complex, 2> halm_col(kp__->num_gkvec_col(), max_mt_aw);

    runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm");
    for (int iblk = 0; iblk < nblk; iblk++)
    {
        int num_mt_aw = 0;
        std::vector<int> offsets(num_atoms_in_block);
        for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++)
        {
            auto& atom = unit_cell_.atom(ia);
            auto& type = atom.type();
            offsets[ia - iblk * num_atoms_in_block] = num_mt_aw;
            num_mt_aw += type.mt_aw_basis_size();
        }
        
        #ifdef __PRINT_OBJECT_CHECKSUM
        alm_row.zero();
        alm_col.zero();
        halm_col.zero();
        #endif

        #pragma omp parallel
        {
            int tid = omp_get_thread_num();
            for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++)
            {
                if (ia % omp_get_num_threads() == tid)
                {
                    int ialoc = ia - iblk * num_atoms_in_block;
                    auto& atom = unit_cell_.atom(ia);
                    auto& type = atom.type();

                    mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_row(), type.mt_aw_basis_size());
                    mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size());
                    mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size());

                    kp__->alm_coeffs_row()->generate(ia, alm_row_tmp);
                    for (int xi = 0; xi < type.mt_aw_basis_size(); xi++)
                    {
                        for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi));
                    }
                    kp__->alm_coeffs_col()->generate(ia, alm_col_tmp);
                    apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp);

                    /* setup apw-lo and lo-apw blocks */
                    set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__);
                }
            }
        }
        #ifdef __PRINT_OBJECT_CHECKSUM
        double_complex z1 = alm_row.checksum();
        double_complex z2 = alm_col.checksum();
        double_complex z3 = halm_col.checksum();
        DUMP("checksum(alm_row): %18.10f %18.10f", std::real(z1), std::imag(z1));
        DUMP("checksum(alm_col): %18.10f %18.10f", std::real(z2), std::imag(z2));
        DUMP("checksum(halm_col): %18.10f %18.10f", std::real(z3), std::imag(z3));
        #endif
        linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone,
                          alm_row.at<CPU>(), alm_row.ld(), alm_col.at<CPU>(), alm_col.ld(), zone, 
                          o__.at<CPU>(), o__.ld());

        linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, 
                          alm_row.at<CPU>(), alm_row.ld(), halm_col.at<CPU>(), halm_col.ld(), zone,
                          h__.at<CPU>(), h__.ld());
    }
    double tval = t1.stop();
    if (kp__->comm().rank() == 0)
    {
        DUMP("effective zgemm performance: %12.6f GFlops",
             2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval);
    }

    /* add interstitial contributon */
    set_fv_h_o_it(kp__, effective_potential__, h__, o__);

    /* setup lo-lo block */
    set_fv_h_o_lo_lo(kp__, h__, o__);
}
void Density::generate_valence(K_set& ks__)
{
    PROFILE_WITH_TIMER("sirius::Density::generate_valence");

    double wt = 0.0;
    double ot = 0.0;
    for (int ik = 0; ik < ks__.num_kpoints(); ik++)
    {
        wt += ks__[ik]->weight();
        for (int j = 0; j < ctx_.num_bands(); j++) ot += ks__[ik]->weight() * ks__[ik]->band_occupancy(j);
    }

    if (std::abs(wt - 1.0) > 1e-12) TERMINATE("K_point weights don't sum to one");

    if (std::abs(ot - unit_cell_.num_valence_electrons()) > 1e-8)
    {
        std::stringstream s;
        s << "wrong occupancies" << std::endl
          << "  computed : " << ot << std::endl
          << "  required : " << unit_cell_.num_valence_electrons() << std::endl
          << "  difference : " << std::abs(ot - unit_cell_.num_valence_electrons());
        WARNING(s);
    }

    /* swap wave functions */
    for (int ikloc = 0; ikloc < ks__.spl_num_kpoints().local_size(); ikloc++)
    {
        int ik = ks__.spl_num_kpoints(ikloc);
        auto kp = ks__[ik];

        for (int ispn = 0; ispn < ctx_.num_spins(); ispn++)
        {
            if (ctx_.full_potential())
            {
                kp->spinor_wave_functions<true>(ispn).swap_forward(0, kp->num_occupied_bands(ispn));
            }
            else
            {
                kp->spinor_wave_functions<false>(ispn).swap_forward(0, kp->num_occupied_bands(ispn), kp->gkvec_fft_distr());
            }
        }
    }

    /* zero density and magnetization */
    zero();

    ctx_.fft().prepare();

    /* interstitial part is independent of basis type */
    generate_valence_density_it(ks__);

    /* for muffin-tin part */
    switch (ctx_.esm_type())
    {
        case full_potential_lapwlo:
        {
            generate_valence_density_mt(ks__);
            break;
        }
        case full_potential_pwlo:
        {
            STOP();
        }
        default:
        {
            break;
        }
    }
    
    #if (__VERIFICATION > 0)
    for (int ir = 0; ir < ctx_.fft(0)->local_size(); ir++)
    {
        if (rho_->f_it(ir) < 0) TERMINATE("density is wrong");
    }
    #endif

    //== double nel = 0;
    //== for (int ir = 0; ir < ctx_.fft().local_size(); ir++)
    //== {
    //==     nel += rho_->f_rg(ir);
    //== }
    //== ctx_.mpi_grid().communicator(1 << _dim_row_).allreduce(&nel, 1);
    //== nel = nel * unit_cell_.omega() / ctx_.fft().size();
    //== printf("number of electrons: %f\n", nel);
    
    /* get rho(G) and mag(G) */
    rho_->fft_transform(-1);
    for (int j = 0; j < ctx_.num_mag_dims(); j++) magnetization_[j]->fft_transform(-1);

    //== printf("number of electrons: %f\n", rho_->f_pw(0).real() * unit_cell_.omega());
    //== STOP();

    ctx_.fft().dismiss();

    if (ctx_.esm_type() == ultrasoft_pseudopotential) augment(ks__);
}
inline void K_point::generate_fv_states()
{
    PROFILE_WITH_TIMER("sirius::K_point::generate_fv_states");
    
    if (!ctx_.full_potential()) {
        return;
    }

    mdarray<double_complex, 2> pw_coeffs;
    mdarray<double_complex, 2> mt_coeffs;
    
    int nbnd_loc;
    /* in both cases eigen-vectors are redistributed to the same "full column" storage */
    if (ctx_.iterative_solver_input_section().type_ == "exact") {
        fv_eigen_vectors_->remap_forward(0, ctx_.num_fv_states());
        /* local number of bands */
        nbnd_loc = fv_eigen_vectors_->spl_num_col().local_size();
        
        if (nbnd_loc) {
            pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(), gklo_basis_size(), nbnd_loc);
            mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(num_gkvec(), 0), gklo_basis_size(), nbnd_loc);
        }

    } else {
        fv_eigen_vectors_slab_->remap_to_full_column_distr(ctx_.num_fv_states());
        assert(fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size() ==
               fv_eigen_vectors_slab_->mt_coeffs().spl_num_col().local_size());
        /* local number of bands */
        nbnd_loc = fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size();
        if (nbnd_loc) {
            pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->pw_coeffs().extra().at<CPU>(), num_gkvec(), nbnd_loc);
            mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->mt_coeffs().extra().at<CPU>(), unit_cell_.mt_lo_basis_size(), nbnd_loc);
        }
    }

    #ifdef __GPU
    if (ctx_.processing_unit() == GPU) {
        pw_coeffs.allocate(memory_t::device);
        pw_coeffs.copy_to_device();
    }
    #endif

    fv_states().prepare_full_column_distr(ctx_.num_fv_states());

    assert(nbnd_loc == fv_states().pw_coeffs().spl_num_col().local_size());
    assert(nbnd_loc == fv_states().mt_coeffs().spl_num_col().local_size());

    #pragma omp parallel
    {
        /* get thread id */
        #ifdef __GPU
        int tid = omp_get_thread_num();
        #endif
        mdarray<double_complex, 2> alm(num_gkvec(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned);
        mdarray<double_complex, 2> tmp;

        #ifdef __GPU
        if (ctx_.processing_unit() == GPU) {
            alm.allocate(memory_t::device);
            tmp = mdarray<double_complex, 2>(unit_cell_.max_mt_aw_basis_size(), nbnd_loc, memory_t::device);
        }
        #endif
        
        #pragma omp for
        for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) {
            /* number of alm coefficients for atom */
            int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size();
            /* offset in wave-function */
            int offset_wf = unit_cell_.atom(ia).offset_mt_coeffs();
            /* generate matching coefficients for all G-vectors */
            alm_coeffs_->generate(ia, alm);
            
            /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */
            if (ctx_.processing_unit() == CPU) {
                /* multiply eigen-vectors and matching coefficients */
                linalg<CPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(),
                                  alm.at<CPU>(), alm.ld(),
                                  pw_coeffs.at<CPU>(), pw_coeffs.ld(),
                                  fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld());
            }
            #ifdef __GPU
            if (ctx_.processing_unit() == GPU) {
                /* multiply eigen-vectors and matching coefficients */
                alm.async_copy_to_device(tid);
                linalg<GPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(),
                                  alm.at<GPU>(), alm.ld(),
                                  pw_coeffs.at<GPU>(), pw_coeffs.ld(),
                                  tmp.at<GPU>(), tmp.ld(),
                                  tid);
                acc::copyout(fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld(),
                             tmp.at<GPU>(), tmp.ld(),
                             mt_aw_size, nbnd_loc, tid);
                acc::sync_stream(tid);
            }
            #endif

            for (int i = 0; i < nbnd_loc; i++) {
                /* lo block */
                std::memcpy(fv_states().mt_coeffs().extra().at<CPU>(offset_wf + mt_aw_size, i),
                            mt_coeffs.at<CPU>(unit_cell_.atom(ia).offset_lo(), i),
                            unit_cell_.atom(ia).mt_lo_basis_size() * sizeof(double_complex));
            }
        }
        #pragma omp for
        for (int i = 0; i < nbnd_loc; i++) {
            /* G+k block */
            std::memcpy(fv_states().pw_coeffs().extra().at<CPU>(0, i),
                        pw_coeffs.at<CPU>(0, i), num_gkvec() * sizeof(double_complex));
        }
    }

    fv_states().remap_to_prime_distr(ctx_.num_fv_states());
}