/// Add the contribution of one k-point's plane-wave (spinor) wave-functions to the
/// real-space charge density (and, for the collinear case, magnetization components).
/// Wave-functions are transformed to real space via FFT and |psi(r)|^2 is accumulated
/// with weight = band occupancy * k-point weight / unit-cell volume.
/// NOTE(review): this definition is truncated in the visible chunk — the
/// `if (ctx_.num_mag_dims() != 3)` branch and the function body are not closed here;
/// the non-collinear branch presumably follows outside this view.
inline void Density::add_k_point_contribution_rg(K_point* kp__) {
    PROFILE_WITH_TIMER("sirius::Density::add_k_point_contribution_rg");

    /* number of first-variational states; used to offset band index per spin */
    int nfv = ctx_.num_fv_states();
    /* unit-cell volume; normalizes |psi(r)|^2 */
    double omega = unit_cell_.omega();

    /* accumulation buffer: one column per density/magnetization component */
    mdarray<double, 2> density_rg(ctx_.fft().local_size(), ctx_.num_mag_dims() + 1);
    density_rg.zero();
#ifdef __GPU
    if (ctx_.fft().hybrid()) {
        /* mirror the accumulation buffer on the device for the hybrid CPU/GPU FFT path */
        density_rg.allocate(memory_t::device);
        density_rg.zero_on_device();
    }
#endif
    ctx_.fft().prepare(kp__->gkvec().partition());
    /* non-magnetic or collinear case */
    if (ctx_.num_mag_dims() != 3) {
        for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) {
            /* skip a spin component with no distributed wave-function columns at all */
            if (!kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().global_index_size()) {
                continue;
            }
            /* NOTE(review): a bare `omp for` is a no-op (serial) unless an enclosing
               `omp parallel` region exists at a wider scope — confirm one exists, or
               this was meant to be `parallel for`. */
            #pragma omp for schedule(dynamic, 1)
            for (int i = 0; i < kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().local_size(); i++) {
                /* global band index of the i-th locally stored column */
                int j = kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col()[i];
                /* accumulation weight: occupancy * k-weight / cell volume */
                double w = kp__->band_occupancy(j + ispn * nfv) * kp__->weight() / omega;
                /* transform to real space; in case of GPU wave-function stays in GPU memory */
                if (ctx_.fft().gpu_only()) {
                    ctx_.fft().transform<1>(kp__->gkvec().partition(),
                                            kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<GPU>(0, i));
                } else {
                    ctx_.fft().transform<1>(kp__->gkvec().partition(),
                                            kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i));
                }
                if (ctx_.fft().hybrid()) {
#ifdef __GPU
                    /* device-side accumulation of w * |psi(r)|^2 into column ispn */
                    update_density_rg_1_gpu(ctx_.fft().local_size(), ctx_.fft().buffer<GPU>(), w, density_rg.at<GPU>(0, ispn));
#else
                    TERMINATE_NO_GPU
#endif
                } else {
                    #pragma omp parallel for
                    for (int ir = 0; ir < ctx_.fft().local_size(); ir++) {
                        /* accumulate w * |psi(r)|^2 on the host */
                        auto z = ctx_.fft().buffer(ir);
                        density_rg(ir, ispn) += w * (std::pow(z.real(), 2) + std::pow(z.imag(), 2));
                    }
                }
            }
        }
/// Add the contribution of one k-point's plane-wave (spinor) wave-functions to the
/// real-space density — newer variant: uses the coarse FFT grid and a preallocated
/// context memory buffer instead of a freshly allocated mdarray.
/// NOTE(review): this definition continues past the visible chunk — the
/// non-collinear (`num_mag_dims() == 3`) branch and the function close are not
/// shown here.
inline void Density::add_k_point_contribution_rg(K_point* kp__) {
    PROFILE("sirius::Density::add_k_point_contribution_rg");

    /* number of first-variational states; used to offset band index per spin */
    int nfv = ctx_.num_fv_states();
    /* unit-cell volume; normalizes |psi(r)|^2 */
    double omega = unit_cell_.omega();

    auto& fft = ctx_.fft_coarse();

    /* get preallocated memory */
    double* ptr = static_cast<double*>(ctx_.memory_buffer(fft.local_size() * (ctx_.num_mag_dims() + 1) * sizeof(double)));
    /* accumulation buffer wrapping the preallocated memory: one column per component */
    mdarray<double, 2> density_rg(ptr, fft.local_size(), ctx_.num_mag_dims() + 1, "density_rg");
    density_rg.zero();

    if (fft.pu() == GPU) {
        /* mirror the accumulation buffer on the device */
        density_rg.allocate(memory_t::device);
        density_rg.zero<memory_t::device>();
    }

    fft.prepare(kp__->gkvec().partition());

    /* non-magnetic or collinear case */
    if (ctx_.num_mag_dims() != 3) {
        /* loop over pure spinor components */
        for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) {
            /* trivial case */
            if (!kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().global_index_size()) {
                continue;
            }
            for (int i = 0; i < kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col().local_size(); i++) {
                /* global band index of the i-th locally stored column */
                int j = kp__->spinor_wave_functions(ispn).pw_coeffs().spl_num_col()[i];
                /* accumulation weight: occupancy * k-weight / cell volume */
                double w = kp__->band_occupancy(j + ispn * nfv) * kp__->weight() / omega;
                ///* transform to real space; in case of GPU wave-function stays in GPU memory */
                /* NOTE(review): CPU extra() pointer is passed unconditionally here,
                   while the commented-out code below dispatched on fft.pu() — confirm
                   the GPU path is intentionally handled inside transform<1>. */
                fft.transform<1>(kp__->gkvec().partition(),
                                 kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i));
                //switch (fft.pu()) {
                //    case CPU: {
                //        fft.transform<1>(kp__->gkvec().partition(),
                //                         kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<CPU>(0, i));
                //        break;
                //    }
                //    case GPU: {
                //        fft.transform<1, GPU>(kp__->gkvec().partition(),
                //                              kp__->spinor_wave_functions(ispn).pw_coeffs().extra().template at<GPU>(0, i));
                //        break;
                //    }
                //}
                /* add to density */
                switch (fft.pu()) {
                    case CPU: {
                        #pragma omp parallel for schedule(static)
                        for (int ir = 0; ir < fft.local_size(); ir++) {
                            /* accumulate w * |psi(r)|^2 on the host */
                            auto z = fft.buffer(ir);
                            density_rg(ir, ispn) += w * (std::pow(z.real(), 2) + std::pow(z.imag(), 2));
                        }
                        break;
                    }
                    case GPU: {
#ifdef __GPU
                        /* device-side accumulation of w * |psi(r)|^2 into column ispn */
                        update_density_rg_1_gpu(fft.local_size(), fft.buffer().at<GPU>(), w, density_rg.at<GPU>(0, ispn));
#else
                        TERMINATE_NO_GPU
#endif
                        break;
                    }
                }
            }
        }
    } else { /* non-collinear case */