//[[Rcpp::export]] Rcpp::RawVector toBitVec(Rcpp::LogicalVector indx) { unsigned nBit = indx.size(); unsigned nByte = ceil(float(nBit)/8); Rcpp::RawVector bytes(nByte);//default are all 0s bytes.attr("bitlen") = nBit; unsigned byteIndex, bitIndex ; for(unsigned i = 0 ; i < nBit; i++) { byteIndex = i / 8; bitIndex = i % 8; if(indx(i) == 1) bytes[byteIndex] = bytes[byteIndex] | 1 << bitIndex; } return bytes; }
// [[Rcpp::export]] Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) { // Calculate popgen summaries for the sample. // var_info should contain columns named 'CHROM', 'POS', 'mask' and possibly others. Rcpp::LogicalVector mask = var_info["mask"]; Rcpp::IntegerVector nsample(mask.size()); Rcpp::StringVector allele_counts(mask.size()); Rcpp::NumericVector Hes(mask.size()); Rcpp::NumericVector Nes(mask.size()); int i = 0; int j = 0; int k = 0; for(i=0; i < gt.nrow(); i++){ // Iterate over variants (rows) if(mask[i] == TRUE){ std::vector<int> myalleles (1,0); for(j=0; j < gt.ncol(); j++){ // Iterate over samples (columns) if(gt(i, j) != NA_STRING){ nsample[i]++; // Increment sample count. // Count alleles per sample. std::vector < int > intv = gtsplit(as<std::string>(gt(i, j))); for(k=0; k<intv.size(); k++){ while(myalleles.size() - 1 < intv[k]){ // We have more alleles than exist in the vector myalleles. myalleles.push_back(0); } myalleles[intv[k]]++; } } } // Concatenate allele counts into a comma delimited string. int n; char buffer [50]; n = sprintf (buffer, "%d", myalleles[0]); for(j=1; j < myalleles.size(); j++){ n=sprintf (buffer, "%s,%d", buffer, myalleles[j]); } allele_counts[i] = buffer; // Sum all alleles. int nalleles = myalleles[0]; for(j=1; j < myalleles.size(); j++){ nalleles = nalleles + myalleles[j]; } // Stats. double He = 1; He = He - pow(double(myalleles[0])/double(nalleles), myalleles.size()); for(j=1; j < myalleles.size(); j++){ He = He - pow(double(myalleles[j])/double(nalleles), myalleles.size()); } Hes[i] = He; Nes[i] = 1/(1-He); } else { // Missing variant (row=NA) nsample[i] = NA_INTEGER; } } return Rcpp::DataFrame::create(var_info, _["n"]=nsample, _["Allele_counts"]=allele_counts, _["He"]=Hes, _["Ne"]=Nes ); }
//' @export // [[Rcpp::export(name=".gt_to_popsum")]] Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) { // Calculate popgen summaries for the sample. // var_info should contain columns named 'CHROM', 'POS', 'mask' and possibly others. Rcpp::LogicalVector mask = var_info["mask"]; Rcpp::IntegerVector nsample(mask.size()); Rcpp::StringVector allele_counts(mask.size()); Rcpp::NumericVector Hes(mask.size()); Rcpp::NumericVector Nes(mask.size()); int i = 0; int j = 0; // unsigned int j = 0; unsigned int k = 0; for(i=0; i < gt.nrow(); i++){ // Iterate over variants (rows) if(mask[i] == TRUE){ std::vector<int> myalleles (1,0); for(j=0; j < gt.ncol(); j++){ // Iterate over samples (columns) if(gt(i, j) != NA_STRING){ nsample[i]++; // Increment sample count. // Rcout << "gt: " << gt(i, j) << "\n"; // Count alleles per sample. int unphased_as_na = 0; // 0 == FALSE std::vector < std::string > gt_vector; std::string gt2 = as<std::string>(gt(i,j)); vcfRCommon::gtsplit( gt2, gt_vector, unphased_as_na ); // Rcout << "gt_vector.size: " << gt_vector.size() << "\n"; for(k=0; k<gt_vector.size(); k++){ int myAllele = std::stoi(gt_vector[k]); // Rcout << " " << myAllele; // // If this genotype had an allele we did not previously observe // we'll have to grow the vector. while(myalleles.size() - 1 < myAllele){ myalleles.push_back(0); } myalleles[myAllele]++; } // Rcout << "\n\n"; } } // Concatenate allele counts into a comma delimited string. char buffer [50]; // int n; // n=sprintf(buffer, "%d", myalleles[0]); sprintf(buffer, "%d", myalleles[0]); for(j=1; (unsigned)j < myalleles.size(); j++){ // n=sprintf (buffer, "%s,%d", buffer, myalleles[j]); sprintf (buffer, "%s,%d", buffer, myalleles[j]); } allele_counts[i] = buffer; // Sum all alleles. int nalleles = myalleles[0]; for(j=1; (unsigned)j < myalleles.size(); j++){ nalleles = nalleles + myalleles[j]; } // Stats. double He = 1; He = He - pow(double(myalleles[0])/double(nalleles), myalleles.size()); for(j=1; (unsigned)j < myalleles.size(); j++){ He = He - pow(double(myalleles[j])/double(nalleles), myalleles.size()); } Hes[i] = He; Nes[i] = 1/(1-He); } else { // Missing variant (row=NA) nsample[i] = NA_INTEGER; } } return Rcpp::DataFrame::create(var_info, _["n"]=nsample, _["Allele_counts"]=allele_counts, _["He"]=Hes, _["Ne"]=Nes ); }