inline void Density::add_k_point_contribution_rg(K_point* kp__)
{
    PROFILE_WITH_TIMER("sirius::Density::add_k_point_contribution_rg");

    int nfv = ctx_.num_fv_states();
    double omega = unit_cell_.omega();

    mdarray<double, 2> density_rg(ctx_.fft().local_size(), ctx_.num_mag_dims() + 1);
    density_rg.zero();

    #ifdef __GPU
    if (ctx_.fft().hybrid()) {
        density_rg.allocate(memory_t::device);
        density_rg.zero_on_device();
    }
    #endif

    ctx_.fft().prepare(kp__->gkvec().partition());

    /* non-magnetic or collinear case */
    if (ctx_.num_mag_dims() != 3) {
        for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) {
            if (!kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().global_index_size()) {
                continue;
            }

            #pragma omp for schedule(dynamic, 1)
            for (int i = 0; i < kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().local_size(); i++) {
                int j = kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col()[i];
                double w = kp__->band_occupancy(j + ispn * nfv) * kp__->weight() / omega;

                /* transform to real space; in case of GPU wave-function stays in GPU memory */
                if (ctx_.fft().gpu_only()) {
                    ctx_.fft().transform<1>(kp__->gkvec().partition(),
                                            kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<GPU>(0, i));
                } else {
                    ctx_.fft().transform<1>(kp__->gkvec().partition(),
                                            kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i));
                }

                if (ctx_.fft().hybrid()) {
                    #ifdef __GPU
                    update_density_rg_1_gpu(ctx_.fft().local_size(), ctx_.fft().buffer<GPU>(), w, density_rg.at<GPU>(0, ispn));
                    #else
                    TERMINATE_NO_GPU
                    #endif
                } else {
                    #pragma omp parallel for
                    for (int ir = 0; ir < ctx_.fft().local_size(); ir++) {
                        auto z = ctx_.fft().buffer(ir);
                        density_rg(ir, ispn) += w * (std::pow(z.real(), 2) + std::pow(z.imag(), 2));
                    }
                }
            }
        }
inline void Density::add_k_point_contribution_rg(K_point* kp__)
{
    PROFILE("sirius::Density::add_k_point_contribution_rg");

    int nfv = ctx_.num_fv_states();
    double omega = unit_cell_.omega();

    auto& fft = ctx_.fft_coarse();
    
    /* get preallocated memory */
    double* ptr = static_cast<double*>(ctx_.memory_buffer(fft.local_size() * (ctx_.num_mag_dims() + 1) * sizeof(double)));

    mdarray<double, 2> density_rg(ptr, fft.local_size(), ctx_.num_mag_dims() + 1, "density_rg");
    density_rg.zero();

    if (fft.pu() == GPU) {
        density_rg.allocate(memory_t::device);
        density_rg.zero<memory_t::device>();
    }

    fft.prepare(kp__->gkvec().partition());

    /* non-magnetic or collinear case */
    if (ctx_.num_mag_dims() != 3) {
        /* loop over pure spinor components */
        for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) {
            /* trivial case */
            if (!kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().global_index_size()) {
                continue;
            }

            for (int i = 0; i < kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().local_size(); i++) {
                int j = kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col()[i];
                double w = kp__->band_occupancy(j + ispn * nfv) * kp__->weight() / omega;

                ///* transform to real space; in case of GPU wave-function stays in GPU memory */
                fft.transform<1>(kp__->gkvec().partition(),
                                 kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i));
                //switch (fft.pu()) {
                //    case CPU: {
                //        fft.transform<1>(kp__->gkvec().partition(),
                //                         kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i));
                //        break;
                //    }
                //    case GPU: {
                //        fft.transform<1, GPU>(kp__->gkvec().partition(),
                //                              kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<GPU>(0, i));
                //        break;
                //    }
                //}
                
                /* add to density */
                switch (fft.pu()) {
                    case CPU: {
                        #pragma omp parallel for schedule(static)
                        for (int ir = 0; ir < fft.local_size(); ir++) {
                            auto z = fft.buffer(ir);
                            density_rg(ir, ispn) += w * (std::pow(z.real(), 2) + std::pow(z.imag(), 2));
                        }
                        break;
                    }
                    case GPU: {
                        #ifdef __GPU
                        update_density_rg_1_gpu(fft.local_size(), fft.buffer().at<GPU>(), w, density_rg.at<GPU>(0, ispn));
                        #else
                        TERMINATE_NO_GPU
                        #endif
                        break;
                    }
                }
            }
        }
    } else { /* non-collinear case */