예제 #1
0
파일: abscissa.cpp 프로젝트: Tanex/lin_dev
std::string* allt(int* idel, std::string ideligen) {
  if (*idel > 3 || ideligen == "ifall")
    return new std::string("igen");
  int identifiering = *idel + 1;
  std::string igenom("ihopkoppling");
  int ihop = aning(&identifiering, igenom);
  std::string* ikapp = new std::string("ilning");
  int* illa = ann(identifiering, ikapp);
  std::string* ilska = new std::string("implementering");
  int* imperfekt = advent(identifiering, ilska);
  std::string* inalles = new std::string("inblick");
  int* inblandning = aftonsol(identifiering, inalles);
  std::string* inbromsning = new std::string("indelning");
  std::string indatagenerering = alltigenom(identifiering, inbromsning);
  std::string indentering("indikering");
  int indexering = aldrig(&identifiering, indentering);
  std::string infix("informationsbit");
  std::string* inflygning = alm(&identifiering, infix);
  std::string ing("ingendera");
  int ingalunda = andel(&identifiering, ing);
  std::string* ingenstans = new std::string("ingetdera");
  int* ingenting = allmoge(identifiering, ingenstans);
  std::string* ingnidning = new std::string("initialisering");
  return ingnidning;
} // allt
예제 #2
0
파일: abscissa.cpp 프로젝트: Tanex/lin_dev
/// Runs a fixed sequence of helper calls and returns a constant string.
///
/// @param massa           Current counter value.
/// @param massbegravning  Pointer to a tag string; dereferenced and compared
///                        against "massvis" (must therefore be non-null
///                        whenever massa <= 4).
/// @return "materia" when massa > 4 or *massbegravning equals "massvis",
///         otherwise "minskning".
///
/// NOTE(review): the heap strings passed to the helpers and the pointers the
/// helpers return are never deleted here; whether the helpers take ownership
/// is not visible from this block -- confirm.
std::string almanacka(int massa, std::string* massbegravning) {
  // Guard clauses (equivalent to one short-circuit OR, evaluated in this order).
  if (massa > 4) {
    return "materia";
  }
  if (*massbegravning == "massvis") {
    return "materia";
  }

  // Every helper below receives this incremented copy, by value or by address.
  int nextLevel = massa + 1;

  std::string firstText = "matsal";
  int firstResult = anslutningspropp(&nextLevel, firstText);

  std::string* secondText = new std::string("medan");
  int* secondResult = alltmera(nextLevel, secondText);

  std::string thirdText = "medurs";
  std::string* thirdResult = annonsering(&nextLevel, thirdText);

  std::string* fourthText = new std::string("mellanlagring");
  std::string fourthResult = anm(nextLevel, fourthText);

  std::string fifthText = "mestadels";
  std::string* fifthResult = annonsering(&nextLevel, fifthText);

  std::string sixthText = "mig";
  int sixthResult = alltnog(&nextLevel, sixthText);

  std::string seventhText = "minde";
  std::string* seventhResult = alm(&nextLevel, seventhText);

  std::string* eighthText = new std::string("minnas");
  std::string eighthResult = amortering(nextLevel, eighthText);

  std::string* ninthText = new std::string("minoritetsregering");
  std::string ninthResult = anm(nextLevel, ninthText);

  std::string finalText = "minskning";
  return finalText;
} // almanacka
예제 #3
0
파일: abscissa.cpp 프로젝트: Tanex/lin_dev
/// Runs a fixed sequence of helper calls and hands back a heap-allocated string.
///
/// @param kassa           Pointer to the current counter; it is only read here.
/// @param katalogisering  Tag compared against "kedja".
/// @return A `new`-allocated std::string: "kinesiska" when *kassa > 4 or
///         katalogisering equals "kedja", otherwise "klyscha". The pointer is
///         not released inside this function.
///
/// NOTE(review): the heap strings passed to the helpers and the pointers the
/// helpers return are never deleted here; whether the helpers take ownership
/// is not visible from this block -- confirm.
std::string* alltihop(int* kassa, std::string katalogisering) {
  // Guard clauses (equivalent to one short-circuit OR, evaluated in this order).
  if (*kassa > 4) {
    return new std::string("kinesiska");
  }
  if (katalogisering == "kedja") {
    return new std::string("kinesiska");
  }

  // Every helper below receives this incremented copy, by value or by address.
  int nextLevel = *kassa + 1;

  std::string* firstText = new std::string("klack");
  int* firstResult = anpassning(nextLevel, firstText);

  std::string* secondText = new std::string("klappning");
  int* secondResult = anslagstavla(nextLevel, secondText);

  std::string* thirdText = new std::string("klick");
  int* thirdResult = alldeles(nextLevel, thirdText);

  std::string fourthText = "klokera";
  std::string* fourthResult = alm(&nextLevel, fourthText);

  std::string* fifthText = new std::string("klump");
  int* fifthResult = algebra(nextLevel, fifthText);

  std::string* sixthText = new std::string("klut");
  int* sixthResult = anspelning(nextLevel, sixthText);

  return new std::string("klyscha");
} // alltihop
// Fit allele frequencies by running a conjugate-direction minimizer.
//
// is_allele_used[N_BASE] flags the bases that take part in the fit; the
// number of flagged bases (n_allele) must be non-zero. allele_freq is both
// input and output: its first n_allele entries seed the search, the minimizer
// updates the array in place, and the result is finally passed through
// arg_to_prob(). loghood receives the minimizer's final value with its sign
// flipped.
//
// NOTE(review): the sign flip suggests the minfunc returns a negative
// log-likelihood -- confirm against position_allele_distro_loghood_minfunc.
//
static
void
get_max_lhood_allele_freq(const snp_pos_info& pi,
                          double* allele_freq,
                          bool* is_allele_used,
                          double& loghood) {

    // tuning constants handed to the minimizer:
    static const double line_tol = 1e-7;
    static const double start_ratio = 0.05;
    static const double min_start_dist = 1e-6;
    static const double end_tol = 1e-7;
    static const unsigned max_iter = 200;

    // storage is sized for the largest possible direction matrix:
    static const unsigned N_BASE2 = N_BASE*N_BASE;
    double conj_dir[N_BASE2];

    // count the bases taking part in the fit:
    unsigned n_allele = 0;
    for (unsigned base = 0; base < N_BASE; ++base) {
        n_allele += (is_allele_used[base] ? 1 : 0);
    }

    assert(n_allele);

    // initial directions: an n_allele x n_allele diagonal matrix whose
    // entries scale with the starting frequency but never drop below
    // min_start_dist.
    const unsigned n_allele2 = n_allele*n_allele;
    std::fill(conj_dir, conj_dir+n_allele2, 0.);
    for (unsigned k = 0; k < n_allele; ++k) {
        const double scaled_step = std::fabs(allele_freq[k]*start_ratio);
        conj_dir[k*n_allele + k] = std::max(scaled_step, min_start_dist);
    }

    double start_tol = end_tol;
    unsigned iter;     // written by the minimizer, not used afterwards
    double final_dlh;  // written by the minimizer, not used afterwards
    position_allele_distro_loghood_minfunc alm(pi,is_allele_used);
    codemin::minimize_conj_direction(allele_freq,conj_dir,alm,start_tol,end_tol,line_tol,
                                     loghood,iter,final_dlh,max_iter);

    // convert the minimizer's arguments back in place, then flip the sign:
    alm.arg_to_prob(allele_freq,allele_freq);
    loghood = -loghood;
}
예제 #5
0
파일: abscissa.cpp 프로젝트: Tanex/lin_dev
/// Runs a fixed sequence of helper calls and hands back a heap-allocated string.
///
/// @param lina  Pointer to the current counter; it is only read here.
/// @param lind  Tag compared against "lindring".
/// @return A `new`-allocated std::string: "linjevis" when *lina > 6 or lind
///         equals "lindring", otherwise "m". The pointer is not released
///         inside this function.
///
/// NOTE(review): the heap strings passed to the helpers and the pointers the
/// helpers return are never deleted here; whether the helpers take ownership
/// is not visible from this block -- confirm.
std::string* alltsammans(int* lina, std::string lind) {
  // Guard clauses (equivalent to one short-circuit OR, evaluated in this order).
  if (*lina > 6) {
    return new std::string("linjevis");
  }
  if (lind == "lindring") {
    return new std::string("linjevis");
  }

  // Every helper below receives this incremented copy, by value or by address.
  int nextLevel = *lina + 1;

  std::string* firstText = new std::string("lista");
  std::string firstResult = antagligen(nextLevel, firstText);

  std::string* secondText = new std::string("liten");
  int* secondResult = allokering(nextLevel, secondText);

  std::string thirdText = "livsstil";
  std::string* thirdResult = anordning(&nextLevel, thirdText);

  std::string fourthText = "lokalisering";
  std::string* fourthResult = allah(&nextLevel, fourthText);

  std::string* fifthText = new std::string("lucka");
  int* fifthResult = aftonsol(nextLevel, fifthText);

  std::string sixthText = "lur";
  std::string* sixthResult = alm(&nextLevel, sixthText);

  std::string* seventhText = new std::string("lycka");
  int* seventhResult = adressering(nextLevel, seventhText);

  std::string* eighthText = new std::string("lysning");
  int* eighthResult = ande(nextLevel, eighthText);

  return new std::string("m");
} // alltsammans
예제 #6
0
void SessionHelper::watchAlmemory(const std::vector<std::string> &patternList, bool showTime)
{
  ALMemoryHelper alm(_session);

  alm.watch(patternList, showTime);
}
예제 #7
0
void SessionHelper::postOnAlmemory(const std::string &pattern, const std::string &arg, bool json)
{
  ALMemoryHelper alm(_session);

  alm.post(pattern, arg, json);
}
// Two-allele non-reference test for a single position.
//
// Returns immediately when pi.ref_base is 'N'. Otherwise:
//   1. sums the call qscores per base and picks the two highest-scoring
//      bases; nrc.nonref_id becomes the runner-up when the reference is the
//      top base, and the top base otherwise.
//   2. fills a log-likelihood and a log-prior for every NR2TEST state, adds
//      them, and passes the sums through normalize_ln_distro (which also
//      receives nrc.max_gt).
//   3. derives nrc.snp_qphred, nrc.max_gt_qphred and nrc.is_snp from that
//      result.
//
// is_mle_freq is hard-coded to false, so the function always returns before
// the MLE section, which is additionally compiled out with #if 0 (and refers
// to names such as sc, epi and alpha that are not declared in this function).
void
position_nonref_2allele_test(const snp_pos_info& pi,
                             const blt_options& opt,
                             const bool /*is_always_test*/,
                             nonref_test_call& nrc) {

    // compile-time switch: with false, the early return below always fires
    static const bool is_mle_freq(false);

    if (pi.ref_base=='N') return;

    // add early escape test here?

    // 1. Determine the two 'primary' alleles -- Simple test just adds
    // up qscores to determine which alleles are primary.
    //
    nrc.nonref_id=(BASE_ID::ANY);
    //unsigned nonref2_id(BASE_ID::ANY); // just ignore this value for now....
    {
        // per-base sum of qscores; calls with base_id ANY are skipped
        double qtot[N_BASE];
        for (unsigned i(0); i<N_BASE; ++i) qtot[i] = 0;

        const unsigned n_calls(pi.calls.size());
        for (unsigned i(0); i<n_calls; ++i) {
            if (pi.calls[i].base_id==BASE_ID::ANY) continue;
            qtot[pi.calls[i].base_id] += pi.calls[i].get_qscore();
        }

        // get max and max2:
        // (strict '>' comparisons: on ties the lower base id keeps its rank)
        unsigned max_id=0;
        unsigned max2_id=1;
        for (unsigned b(1); b<N_BASE; ++b) {
            if (qtot[b] > qtot[max_id]) {
                max2_id = max_id;
                max_id = b;
            } else if (qtot[b] > qtot[max2_id]) {
                max2_id = b;
            }
        }

        // the non-reference allele is the best-scoring base that is not the
        // reference
        const unsigned ref_id=base_to_id(pi.ref_base);
        if       (ref_id==max_id) {
            nrc.nonref_id=max2_id;

#if 0
        } else if (ref_id==max2_id) {
            nrc.nonref_id=max_id;
#endif
        } else {
            nrc.nonref_id=max_id;
            //nonref2_id=max2_id;
        }
    }

    // 2. log-likelihood of each NR2TEST state:
    blt_float_t lhood[NR2TEST::SIZE];

    lhood[NR2TEST::REF] = calc_pos_nonref_freq_loghood(pi,0.);

    // sample the non-reference frequency log-likelihood over [0,1] into sf,
    // then integrate it over two different ranges:
    sparse_function sf;
    nonref_allele_freq_loghood_sparse_func nlf(pi,nrc.nonref_id,sf);
    sample_uniform_range(0.,1.,nlf);
    //sample_uniform_range(min_nonref_freq,1.,nlf);

    lhood[NR2TEST::NONREF_MF] = integrate_ln_sparsefunc(sf, opt.min_nonref_freq, 1,1,1);
    lhood[NR2TEST::NONREF_MF_NOISE] = integrate_ln_sparsefunc(sf, 0, opt.nonref_site_error_decay_freq,1,0);

    // NONREF_OTHER gets a log-likelihood of -infinity
    static const blt_float_t neginf(-std::numeric_limits<blt_float_t>::infinity());
    lhood[NR2TEST::NONREF_OTHER] = neginf;

    //std::cerr << "WAGART: logh ref/nonef: " << lhood[0] << " " << lhood[1] << "\n";

    // TODO: ctor compute this:

    // this goes in here just in case someone cranks both parameters up near 1:
    //
    const double nonref_variant_rate_used = opt.nonref_variant_rate*(1-opt.nonref_site_error_rate);

    // log-priors: the used variant rate is split 1/3 : 2/3 between NONREF_MF
    // and NONREF_OTHER
    blt_float_t prior[NR2TEST::SIZE];
    prior[NR2TEST::REF] = log1p_switch(-(nonref_variant_rate_used+opt.nonref_site_error_rate));
    prior[NR2TEST::NONREF_MF] = std::log(nonref_variant_rate_used/3);
    prior[NR2TEST::NONREF_MF_NOISE] = std::log(opt.nonref_site_error_rate);
    prior[NR2TEST::NONREF_OTHER] = std::log(2*nonref_variant_rate_used/3);

    // log-likelihood + log-prior per state, then normalize
    // NOTE(review): presumably pprob holds plain probabilities and nrc.max_gt
    // the most probable state after normalize_ln_distro -- confirm.
    double pprob[NR2TEST::SIZE];
    for (unsigned i(0); i<NR2TEST::SIZE; ++i) {
        pprob[i] = lhood[i] + prior[i];
    }
    normalize_ln_distro(pprob,pprob+NR2TEST::SIZE,nrc.max_gt);

    // 3. the REF and NONREF_MF_NOISE states together form the "no snp" mass
    nrc.snp_qphred=error_prob_to_qphred(pprob[NR2TEST::REF]+pprob[NR2TEST::NONREF_MF_NOISE]);
    nrc.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob,pprob+NR2TEST::SIZE,nrc.max_gt));

    nrc.is_snp=(nrc.snp_qphred != 0);

    // always taken while is_mle_freq is false
    if (! (is_mle_freq && nrc.is_snp)) return;

#if 0
    const double null_loghood(calc_pos_nonref_freq_loghood(pi,0.));

    // heuristic to escape early:
    static const double p_delta(0.001);
    const double delta_loghood(calc_pos_nonref_freq_loghood(pi,p_delta));
    if (null_loghood > delta_loghood) return;

    double x_nonref_freq;
    double x_loghood;

    position_nonref_freq_loghood_minfunc mf(epi);

    static const double x1(0.5);
    static const double x2(0.4);
    codemin::minimize_1d(x1,x2,mf.val(x1),mf,x_nonref_freq,x_loghood);

    x_nonref_freq = mf.arg_to_prob(x_nonref_freq);

    const double log_lrt(-2.*(x_loghood+null_loghood));

    // because the null has the parameter fixed to a boundary value, the
    // asymptotic distribution is a 50:50 mixture of chi-squared(0) and
    // chi-squared(1) -- the same effect as multiplying alpha of
    // chi-squared(1) by 2, or dividing the null prob by 2 (as we do below):
    boost::math::chi_squared dist(1);
    const double null_prob((1.-boost::math::cdf(dist,log_lrt))/2.);

    sc.is_snp=(null_prob<alpha);
    sc.null_loghood=null_loghood;
    sc.min_test_loghood=-x_loghood;
    sc.snp_prob=1.-null_prob;

    // if it's a snp then get additional information on non-reference
    // allele frequencies.
    //
    if (not sc.is_snp) return;

    static const double line_tol(1e-7);
    static const double start_ratio(0.05);
    static const double min_start_dist(1e-6);
    static const double end_tol(1e-7);
    static const unsigned max_iter(200);

    const unsigned ref_base_id(base_to_id(pi.ref_base));

    const double ref_freq(1.-x_nonref_freq);
    const double nonref_freq((x_nonref_freq)/3.);
    for (unsigned i(0); i<N_BASE; ++i) {
        if (i==ref_base_id) sc.allele_freq[i] = ref_freq;
        else               sc.allele_freq[i] = nonref_freq;
    }

    static const unsigned N_BASE2(N_BASE*N_BASE);
    double conj_dir[N_BASE2];
    std::fill(conj_dir,conj_dir+N_BASE2,0.);
    for (unsigned i(0); i<N_BASE; ++i) {
        const double start_dist( std::max(std::fabs(sc.allele_freq[i]*start_ratio),min_start_dist) );
        conj_dir[i*(N_BASE+1)] = start_dist;
    }

    double start_tol(end_tol);
    unsigned iter;
    double x_all_loghood;
    double final_dlh;
    position_allele_distro_loghood_minfunc alm(epi);
    codemin::minimize_conj_direction(sc.allele_freq,conj_dir,alm,start_tol,end_tol,line_tol,
                                     x_all_loghood,iter,final_dlh,max_iter);
    alm.arg_to_prob(sc.allele_freq,sc.allele_freq);

    sc.min_loghood=-x_all_loghood;
#endif
}
예제 #9
0
/// Computes the window function power spectrum of a Healpix map.
///
/// Command line: <Map FITS file> <W^2_l .dat file> [<Window area file>]
///
/// Steps:
///   1. load the map from the FITS file;
///   2. if BINARIZE==1, set every pixel below 1.0 to 0.0 and all others to 1.0;
///   3. if a third argument is given, write the effective area
///      (mean pixel value times the full-sky area in deg^2) to that file;
///   4. transform the map to harmonic coefficients up to lmax = 2.5*Nside;
///   5. write a table of l and Sum_m |a_lm|^2 to the output file.
///
/// @return 1 when the number of arguments is wrong, 0 otherwise. Failing to
///         open an output file only produces a warning.
int main (int argc, char *argv[]) {
  const double FullSky = 4.0*M_PI*(180.0/M_PI)*(180.0/M_PI); // In degrees.
  bool GetArea=0;
  std::string infile, outfile, areafile;
  int l, m, lmax;
  Healpix_Map<MAP_PRECISION> Map, Test; // Test is only used by the disabled code at the end.
  double **winClTable, area;
  std::ofstream outstream;
  printf("\n");

  /*
  // Create quadrilateral window function:
  if (argc!=3) {
    printf("Input is: <Map FITS output file> <Nside>\n");
    printf("Exiting...\n\n");
    return 1;
  }
  outfile.assign(argv[1]);
  m = atoi(argv[2]);
  CreateWindow(outfile, m);
  return 0;
  */


  // Check if number of input parameters is correct:
  if (argc!=3 && argc!=4) {
    printf("Input is: <Map FITS file> <W^2_l .dat file> <Window area file (optional)>\n");
    printf("Exiting...\n\n");
    return 1;
  }

  // Get input:
  infile.assign(argv[1]);
  outfile.assign(argv[2]);
  if (argc==4) {
    GetArea=1;
    areafile.assign(argv[3]);
  }

  // Load map:
  Announce("Loading Healpix map from "+infile+"... ");
  read_Healpix_map_from_fits(infile, Map);
  Announce();
  lmax = (5*Map.Nside())/2; // LMAX hard-coded to 2.5 Nside, limit mentioned by Franz Elsner, 08/2015.


  // If requested, set pixels values to either 0 or 1:
  if (BINARIZE==1) {
    Announce("Setting pixels values to 1 or 0... ");
    // NOTE(review): the pixel count is computed in int and would overflow
    // for Nside >= 16384 -- confirm the supported range.
    l = 12*Map.Nside()*Map.Nside();
#pragma omp parallel for
    for (m=0; m<l; m++) {
      if (Map[m]<1.0) Map[m]=0.0;
      else Map[m]=1.0;
    }
    Announce();
  }

  // If requested, compute the effective window area by adding pixel values:
  if (GetArea==1) {
    Announce("Computing window effective area... ");
    l    = 12*Map.Nside()*Map.Nside();
    area = 0.0;
#pragma omp parallel for reduction(+:area)
    for (m=0; m<l; m++) area += Map[m];
    // Mean pixel value times the full-sky area:
    area = area/((double)l)*FullSky;
    outstream.open(areafile.c_str());
    // Report the file that actually failed to open (the area file, not the
    // Cl output file):
    if (!outstream.is_open()) warning("Cannot write to file "+areafile);
    else {
      outstream << "# Window "<<infile<<" effective area (deg^2):\n";
      outstream << area << std::endl;
    }
    outstream.close();
    Announce();
  }

  // Allocate and clean memory for alm's:
  Announce("Allocating memory for harmonic coefficients... ");
  Alm<xcomplex <ALM_PRECISION> > alm(lmax,lmax);
#pragma omp parallel for schedule(dynamic) private(m)
  for(l=0; l<=lmax; l++) {
    for (m=0; m<=l; m++) alm(l,m).Set(0,0);
  }
  Announce();

  // Prepare Healpix weights:
  arr<double> weight(2*Map.Nside());
  PrepRingWeights(1, weight, Map.Nside());

  // Transform to alm:
  Announce("Getting harmonic coefficients... ");
  map2alm_iter(Map, alm, NITER, weight);
  Announce();

  // Compute Sum_m|alm|^2
  // (only m>=0 is stored, so every m>0 term is counted twice)
  Announce("Summing squares over m... ");
  winClTable=matrix<double>(0,lmax, 0,1);
#pragma omp parallel for schedule(dynamic) private(m)
  for(l=0; l<=lmax; l++) {
    winClTable[l][0] = (double)l;
    winClTable[l][1] = alm(l,0).norm();
    for(m=1; m<=l; m++) winClTable[l][1] += 2.0*alm(l,m).norm();
  }
  Announce();

  // Output to ASCII file:
  Announce("Writing Cl window function to "+outfile+"... ");
  outstream.open(outfile.c_str());
  if (!outstream.is_open()) warning("Cannot write to file "+outfile);
  else PrintTable(winClTable, lmax+1, 2, &outstream);
  outstream.close();
  Announce();

  // Exit program:
  printf("\n");
  return 0;

  /*
  // Test recovery of input map:
  Announce("Go back... ");
  Test.SetNside(Map.Nside(), RING);
  alm2map(alm, Test);
  Announce();
  Announce("Writing to fits file "+outfile+"... ");
  write_Healpix_map_to_fits("!"+outfile, Test, planckType<MAP_PRECISION>()); // Filename prefixed by ! to overwrite.
  Announce();
  */
}
/// Fills fv_states() from the current fv eigen-vectors.
///
/// Does nothing unless ctx_.full_potential() is true. Otherwise:
///  - the eigen-vectors are remapped so that each rank holds nbnd_loc whole
///    columns, and two non-owning views are set up over that storage:
///    pw_coeffs and mt_coeffs;
///  - for every atom, the matching coefficients are generated and multiplied
///    with pw_coeffs (on CPU or GPU); the product is stored in the atom's
///    rows of fv_states().mt_coeffs(), followed by that atom's rows copied
///    from mt_coeffs;
///  - pw_coeffs is copied into fv_states().pw_coeffs();
///  - fv_states() is remapped back with remap_to_prime_distr().
inline void K_point::generate_fv_states()
{
    PROFILE_WITH_TIMER("sirius::K_point::generate_fv_states");
    
    if (!ctx_.full_potential()) {
        return;
    }

    /* views over the remapped eigen-vector storage; they stay default-constructed when nbnd_loc == 0 */
    mdarray<double_complex, 2> pw_coeffs;
    mdarray<double_complex, 2> mt_coeffs;
    
    int nbnd_loc;
    /* in both cases eigen-vectors are redistributed to the same "full column" storage */
    if (ctx_.iterative_solver_input_section().type_ == "exact") {
        fv_eigen_vectors_->remap_forward(0, ctx_.num_fv_states());
        /* local number of bands */
        nbnd_loc = fv_eigen_vectors_->spl_num_col().local_size();
        
        if (nbnd_loc) {
            /* both views share one array with gklo_basis_size() rows; the mt view starts at row num_gkvec() */
            pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(), gklo_basis_size(), nbnd_loc);
            mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(num_gkvec(), 0), gklo_basis_size(), nbnd_loc);
        }

    } else {
        fv_eigen_vectors_slab_->remap_to_full_column_distr(ctx_.num_fv_states());
        assert(fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size() ==
               fv_eigen_vectors_slab_->mt_coeffs().spl_num_col().local_size());
        /* local number of bands */
        nbnd_loc = fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size();
        if (nbnd_loc) {
            /* here the pw and mt parts live in two separate arrays */
            pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->pw_coeffs().extra().at<CPU>(), num_gkvec(), nbnd_loc);
            mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->mt_coeffs().extra().at<CPU>(), unit_cell_.mt_lo_basis_size(), nbnd_loc);
        }
    }

    #ifdef __GPU
    /* the GPU gemm below reads pw_coeffs from device memory */
    if (ctx_.processing_unit() == GPU) {
        pw_coeffs.allocate(memory_t::device);
        pw_coeffs.copy_to_device();
    }
    #endif

    fv_states().prepare_full_column_distr(ctx_.num_fv_states());

    /* source and destination must hold the same number of local columns */
    assert(nbnd_loc == fv_states().pw_coeffs().spl_num_col().local_size());
    assert(nbnd_loc == fv_states().mt_coeffs().spl_num_col().local_size());

    #pragma omp parallel
    {
        /* get thread id */
        #ifdef __GPU
        int tid = omp_get_thread_num();
        #endif
        /* declared inside the parallel region: every thread gets its own alm and tmp buffers */
        mdarray<double_complex, 2> alm(num_gkvec(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned);
        mdarray<double_complex, 2> tmp;

        #ifdef __GPU
        if (ctx_.processing_unit() == GPU) {
            alm.allocate(memory_t::device);
            tmp = mdarray<double_complex, 2>(unit_cell_.max_mt_aw_basis_size(), nbnd_loc, memory_t::device);
        }
        #endif
        
        #pragma omp for
        for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) {
            /* number of alm coefficients for atom */
            int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size();
            /* offset in wave-function */
            int offset_wf = unit_cell_.atom(ia).offset_mt_coeffs();
            /* generate matching coefficients for all G-vectors */
            alm_coeffs_->generate(ia, alm);
            
            /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */
            if (ctx_.processing_unit() == CPU) {
                /* multiply eigen-vectors and matching coefficients */
                linalg<CPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(),
                                  alm.at<CPU>(), alm.ld(),
                                  pw_coeffs.at<CPU>(), pw_coeffs.ld(),
                                  fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld());
            }
            #ifdef __GPU
            if (ctx_.processing_unit() == GPU) {
                /* multiply eigen-vectors and matching coefficients */
                /* all device work of this thread is issued with its thread id as the last argument and synchronized at the end */
                alm.async_copy_to_device(tid);
                linalg<GPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(),
                                  alm.at<GPU>(), alm.ld(),
                                  pw_coeffs.at<GPU>(), pw_coeffs.ld(),
                                  tmp.at<GPU>(), tmp.ld(),
                                  tid);
                acc::copyout(fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld(),
                             tmp.at<GPU>(), tmp.ld(),
                             mt_aw_size, nbnd_loc, tid);
                acc::sync_stream(tid);
            }
            #endif

            for (int i = 0; i < nbnd_loc; i++) {
                /* lo block */
                /* copied unchanged from mt_coeffs to the rows directly after the atom's aw block */
                std::memcpy(fv_states().mt_coeffs().extra().at<CPU>(offset_wf + mt_aw_size, i),
                            mt_coeffs.at<CPU>(unit_cell_.atom(ia).offset_lo(), i),
                            unit_cell_.atom(ia).mt_lo_basis_size() * sizeof(double_complex));
            }
        }
        #pragma omp for
        for (int i = 0; i < nbnd_loc; i++) {
            /* G+k block */
            /* the first num_gkvec() rows of every column are copied unchanged */
            std::memcpy(fv_states().pw_coeffs().extra().at<CPU>(0, i),
                        pw_coeffs.at<CPU>(0, i), num_gkvec() * sizeof(double_complex));
        }
    }

    fv_states().remap_to_prime_distr(ctx_.num_fv_states());
}
예제 #11
0
inline void K_point::generate_fv_states()
{
    PROFILE("sirius::K_point::generate_fv_states");
    
    if (!ctx_.full_potential()) {
        return;
    }

    #ifdef __GPU
    if (ctx_.processing_unit() == GPU) {
        fv_eigen_vectors_slab().pw_coeffs().allocate_on_device();
        fv_eigen_vectors_slab().pw_coeffs().copy_to_device(0, ctx_.num_fv_states());
    }
    #endif

    mdarray<double_complex, 2> alm(num_gkvec_loc(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned);
    mdarray<double_complex, 2> tmp(unit_cell_.max_mt_aw_basis_size(), ctx_.num_fv_states());

    #ifdef __GPU
    if (ctx_.processing_unit() == GPU) {
        alm.allocate(memory_t::device);
        tmp.allocate(memory_t::device);
    }
    #endif
    
    for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) {
        auto location = fv_eigen_vectors_slab().spl_num_atoms().location(ia);
        /* number of alm coefficients for atom */
        int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size();
        int mt_lo_size = unit_cell_.atom(ia).mt_lo_basis_size();
        /* generate matching coefficients for all G-vectors */
        alm_coeffs_loc_->generate(ia, alm);

        double_complex* tmp_ptr_gpu = (ctx_.processing_unit() == GPU) ? tmp.at<GPU>() : nullptr;
        mdarray<double_complex, 2> tmp1(tmp.at<CPU>(), tmp_ptr_gpu, mt_aw_size, ctx_.num_fv_states());

        /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */
        if (ctx_.processing_unit() == CPU) {
            linalg<CPU>::gemm(1, 0, mt_aw_size, ctx_.num_fv_states(), num_gkvec_loc(),
                              alm.at<CPU>(), alm.ld(),
                              fv_eigen_vectors_slab().pw_coeffs().prime().at<CPU>(), fv_eigen_vectors_slab().pw_coeffs().prime().ld(),
                              tmp1.at<CPU>(), tmp1.ld());
        }
        #ifdef __GPU
        if (ctx_.processing_unit() == GPU) {
            alm.copy_to_device(mt_aw_size * num_gkvec_loc());
            linalg<GPU>::gemm(1, 0, mt_aw_size, ctx_.num_fv_states(), num_gkvec_loc(),
                              alm.at<GPU>(), alm.ld(),
                              fv_eigen_vectors_slab().pw_coeffs().prime().at<GPU>(), fv_eigen_vectors_slab().pw_coeffs().prime().ld(),
                              tmp1.at<GPU>(), tmp1.ld());
            tmp1.copy_to_host();
        }
        #endif

        comm_.reduce(tmp1.at<CPU>(), static_cast<int>(tmp1.size()), location.rank);

        #ifdef __PRINT_OBJECT_CHECKSUM
        auto z1 = tmp1.checksum();
        DUMP("checksum(tmp1): %18.10f %18.10f", std::real(z1), std::imag(z1));
        #endif

        if (location.rank == comm_.rank()) {
            int offset1 = fv_states().offset_mt_coeffs(location.local_index);
            int offset2 = fv_eigen_vectors_slab().offset_mt_coeffs(location.local_index);
            for (int i = 0; i < ctx_.num_fv_states(); i++) {
                /* aw block */
                std::memcpy(fv_states().mt_coeffs().prime().at<CPU>(offset1, i),
                            tmp1.at<CPU>(0, i),
                            mt_aw_size * sizeof(double_complex));
                /* lo block */
                if (mt_lo_size) {
                    std::memcpy(fv_states().mt_coeffs().prime().at<CPU>(offset1 + mt_aw_size, i),
                                fv_eigen_vectors_slab().mt_coeffs().prime().at<CPU>(offset2, i),
                                mt_lo_size * sizeof(double_complex));
                }
            }
        }
    }

    #pragma omp parallel for
    for (int i = 0; i < ctx_.num_fv_states(); i++) {
        /* G+k block */
        std::memcpy(fv_states().pw_coeffs().prime().at<CPU>(0, i),
                    fv_eigen_vectors_slab().pw_coeffs().prime().at<CPU>(0, i),
                    num_gkvec_loc() * sizeof(double_complex));
    }

    #ifdef __GPU
    if (ctx_.processing_unit() == GPU) {
        fv_eigen_vectors_slab().pw_coeffs().deallocate_on_device();
    }
    #endif
}