// [[Rcpp::export]] Rcpp::IntegerMatrix quantileNorm(Rcpp::IntegerMatrix mat, Rcpp::IntegerVector ref, int nthreads=1, int seed=13){ if (mat.nrow() != ref.length()) Rcpp::stop("incompatible arrays..."); if (!std::is_sorted(ref.begin(), ref.end())) Rcpp::stop("ref must be sorted"); int ncol = mat.ncol(); int nrow = mat.nrow(); //allocate new matrix Rcpp::IntegerMatrix res(nrow, ncol); Mat<int> oldmat = asMat(mat); Mat<int> newmat = asMat(res); Vec<int> ref2 = asVec(ref); //allocate a seed for each column std::seed_seq sseq{seed}; std::vector<std::uint32_t> seeds(ncol); sseq.generate(seeds.begin(), seeds.end()); #pragma omp parallel num_threads(nthreads) { std::vector<std::pair<int, int> > storage(nrow);//pairs <value, index> #pragma omp for for (int col = 0; col < ncol; ++col){ std::mt19937 gen(seeds[col]); qtlnorm(oldmat.getCol(col), ref2, newmat.getCol(col), storage, gen); } } res.attr("dimnames") = mat.attr("dimnames"); return res; }
inline void colSummary_loop(Mat<int> mat, Vec<int> ref, Tfun fun, int nthreads){ int ncol = mat.ncol; #pragma omp parallel for num_threads(nthreads) for (int col = 0; col < ncol; ++col){ ref[col] = fun(mat.getCol(col)); } }
// [[Rcpp::export]] Rcpp::IntegerVector getRef(Rcpp::IntegerMatrix mat, std::string type, int nthreads=1){ //allocate another matrix with transpose dimensions as mat int ncol = mat.ncol(); int nrow = mat.nrow(); if (ncol*nrow == 0) Rcpp::stop("empty input is invalid"); Rcpp::IntegerMatrix mem_smat(ncol, nrow); Mat<int> smat = asMat(mem_smat); Mat<int> omat = asMat(mat); //sort every column #pragma omp parallel for num_threads(nthreads) for (int col = 0; col < ncol; ++col){ Vec<int> ovec = omat.getCol(col); MatRow<int> svec = smat.getRow(col); sortCounts(ovec, svec); } return colSummary(mem_smat, type, nthreads); }
static inline double fb_core(Mat<double> initPs, Mat<double> trans, Mat<double> lliks, Vec<int> seqlens, Mat<double> posteriors, Mat<double> new_trans, Mat<double> new_initPs, int nthreads){ nthreads = std::max(1, nthreads); int nrow = lliks.nrow; int ncol = lliks.ncol; int nchunk = seqlens.len; //temporary objects std::vector<int> chunk_startsSTD(seqlens.len, 0); Vec<int> chunk_starts = asVec<int>(chunk_startsSTD); //get the start of each chunk for (int i = 0, acc = 0; i < nchunk; ++i){chunk_starts[i] = acc; acc += seqlens[i];} long double tot_llik = 0; //figure out how to assign the chromosomes to the threads //covert seqlens to double std::vector<double> jobSize(nchunk); for (int i = 0; i < nchunk; ++i) jobSize[i] = seqlens[i]; //get the assignments std::vector<int> breaks = scheduleJobs(asVec(jobSize), nthreads); #pragma omp parallel num_threads(nthreads) { //each thread gets one copy of these temporaries std::vector<double> thread_new_transSTD(nrow*nrow, 0); Mat<double> thread_new_trans = asMat(thread_new_transSTD, nrow); long double thread_llik = 0; FBtmp thread_tmp(nrow); /* transform the log likelihoods to probabilities (exponentiate). * A column-specific factor is multiplied to obtain a better numerical * stability. This tends to be the bottle-neck of the whole * algorithm, but it is indispensable, and it scales well with the * number of cores.*/ #pragma omp for schedule(static) reduction(+:tot_llik) for (int c = 0; c < ncol; ++c){ double* llikcol = lliks.colptr(c); /* get maximum llik in the column */ double max_llik = llikcol[0]; for (int r = 1; r < nrow; ++r){ if (llikcol[r] > max_llik){ max_llik = llikcol[r]; } } /* subtract maximum and exponentiate */ tot_llik += max_llik; for (int r = 0; r < nrow; ++r, ++llikcol){ *llikcol = exp(*llikcol - max_llik); } } /* Do forward and backward loop for each chunk (defined by seqlens) * Chunks might have very different lengths (that's why they have been scheduled). */ #pragma omp for schedule(static) nowait for (int thread = 0; thread < nthreads; ++thread){ for (int o = breaks[thread]; o < breaks[thread+1]; ++o){ //o is the index that identifies the sequence int chunk_start = chunk_starts[o]; int chunk_end = chunk_start + seqlens[o]; fb_iter(lliks.subsetCol(chunk_start, chunk_end), posteriors.subsetCol(chunk_start, chunk_end), initPs.getCol(o), new_initPs.getCol(o), trans, thread_new_trans, thread_llik, thread_tmp); } } //protected access to the shared variables #pragma omp critical { tot_llik += thread_llik; for (int p = 0, q = nrow*nrow; p < q; ++p){ new_trans[p] += thread_new_trans[p]; } } } /* normalizing new_trans matrix */ // The parallelization overhead might take longer than // this loop.... for (int row = 0; row < nrow; ++row){ double sum = 0; for (int col = 0; col < nrow; ++col){sum += new_trans(row, col);} for (int col = 0; col < nrow; ++col){new_trans(row, col)/=sum;} } return (double) tot_llik; }