// [[Rcpp::export]] Rcpp::List subsetCounts(Rcpp::IntegerVector counts, Rcpp::IntegerVector start, Rcpp::IntegerVector width, Rcpp::LogicalVector strand){ if (start.length() != width.length() || start.length() != strand.length()) Rcpp::stop("provided vectors have different lengths..."); int nr = start.length(); int len = counts.length(); int tot = 0; int* S = start.begin(); int* W = width.begin(); for (int i = 0; i < nr; ++i){ int s = S[i] - 1; int w = W[i]; if (s < 0) Rcpp::stop("negative start positions are invalid"); if (s + w > len) Rcpp::stop("range exceeds the lengths of the counts vector"); tot += w; } Rcpp::IntegerVector res(tot); Rcpp::IntegerVector nstart(nr); Rcpp::IntegerVector nend(nr); int* R = res.begin(); int* C = counts.begin(); int* ST = strand.begin(); int* NS = nstart.begin(); int* NE = nend.begin(); int currpos = 0; for (int i = 0; i < nr; ++i){ NS[i] = currpos + 1; int w = W[i]; if (ST[i]) std::copy(C + S[i]-1, C + S[i]-1 + w, R + currpos); else std::reverse_copy(C + S[i]-1, C + S[i]-1 + w, R + currpos); currpos += w; NE[i] = currpos; } return List::create(_("counts")=res, _("starts")=nstart, _("ends")=nend); }
// Return the 1-based positions of the elements of `row` that test as true.
//
// The vector is scanned twice: once to count the selected elements so the
// result can be allocated at its final size, and once to fill it in.
// NOTE(review): any non-zero element is selected; confirm whether callers can
// pass NA values and how those should be treated.
Rcpp::IntegerVector get_which(Rcpp::LogicalVector row) {
    // First pass: how many elements are selected?
    int n_selected = 0;
    for (int pos = 0; pos < row.length(); pos++) {
        if (row(pos)) {
            n_selected++;
        }
    }

    // Second pass: record where they are.
    Rcpp::IntegerVector ret(n_selected);
    int out = 0;
    for (int pos = 0; pos < row.length(); pos++) {
        if (!row(pos)) {
            continue;
        }
        ret(out) = pos + 1;  // R is 1-based
        out++;
    }
    return ret;
}
// [[Rcpp::export]] Rcpp::LogicalVector doc_validate(XPtrDoc doc, XPtrDoc schema) { xmlLineNumbersDefault(1); Rcpp::CharacterVector vec; xmlSchemaParserCtxtPtr cptr = xmlSchemaNewDocParserCtxt(schema.checked_get()); xmlSchemaSetParserStructuredErrors(cptr, handleSchemaError, &vec); xmlSchemaPtr sptr = xmlSchemaParse(cptr); xmlSchemaValidCtxtPtr vptr = xmlSchemaNewValidCtxt(sptr); xmlSchemaSetValidStructuredErrors(vptr, handleSchemaError, &vec); Rcpp::LogicalVector out; out.push_back(0 == xmlSchemaValidateDoc(vptr, doc.checked_get())); xmlSchemaFreeParserCtxt(cptr); xmlSchemaFreeValidCtxt(vptr); xmlSchemaFree(sptr); out.attr("errors") = vec; return out; }
//[[Rcpp::export]] Rcpp::RawVector toBitVec(Rcpp::LogicalVector indx) { unsigned nBit = indx.size(); unsigned nByte = ceil(float(nBit)/8); Rcpp::RawVector bytes(nByte);//default are all 0s bytes.attr("bitlen") = nBit; unsigned byteIndex, bitIndex ; for(unsigned i = 0 ; i < nBit; i++) { byteIndex = i / 8; bitIndex = i % 8; if(indx(i) == 1) bytes[byteIndex] = bytes[byteIndex] | 1 << bitIndex; } return bytes; }
// [[Rcpp::export]] Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) { // Calculate popgen summaries for the sample. // var_info should contain columns named 'CHROM', 'POS', 'mask' and possibly others. Rcpp::LogicalVector mask = var_info["mask"]; Rcpp::IntegerVector nsample(mask.size()); Rcpp::StringVector allele_counts(mask.size()); Rcpp::NumericVector Hes(mask.size()); Rcpp::NumericVector Nes(mask.size()); int i = 0; int j = 0; int k = 0; for(i=0; i < gt.nrow(); i++){ // Iterate over variants (rows) if(mask[i] == TRUE){ std::vector<int> myalleles (1,0); for(j=0; j < gt.ncol(); j++){ // Iterate over samples (columns) if(gt(i, j) != NA_STRING){ nsample[i]++; // Increment sample count. // Count alleles per sample. std::vector < int > intv = gtsplit(as<std::string>(gt(i, j))); for(k=0; k<intv.size(); k++){ while(myalleles.size() - 1 < intv[k]){ // We have more alleles than exist in the vector myalleles. myalleles.push_back(0); } myalleles[intv[k]]++; } } } // Concatenate allele counts into a comma delimited string. int n; char buffer [50]; n = sprintf (buffer, "%d", myalleles[0]); for(j=1; j < myalleles.size(); j++){ n=sprintf (buffer, "%s,%d", buffer, myalleles[j]); } allele_counts[i] = buffer; // Sum all alleles. int nalleles = myalleles[0]; for(j=1; j < myalleles.size(); j++){ nalleles = nalleles + myalleles[j]; } // Stats. double He = 1; He = He - pow(double(myalleles[0])/double(nalleles), myalleles.size()); for(j=1; j < myalleles.size(); j++){ He = He - pow(double(myalleles[j])/double(nalleles), myalleles.size()); } Hes[i] = He; Nes[i] = 1/(1-He); } else { // Missing variant (row=NA) nsample[i] = NA_INTEGER; } } return Rcpp::DataFrame::create(var_info, _["n"]=nsample, _["Allele_counts"]=allele_counts, _["He"]=Hes, _["Ne"]=Nes ); }
//' @export // [[Rcpp::export(name=".gt_to_popsum")]] Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) { // Calculate popgen summaries for the sample. // var_info should contain columns named 'CHROM', 'POS', 'mask' and possibly others. Rcpp::LogicalVector mask = var_info["mask"]; Rcpp::IntegerVector nsample(mask.size()); Rcpp::StringVector allele_counts(mask.size()); Rcpp::NumericVector Hes(mask.size()); Rcpp::NumericVector Nes(mask.size()); int i = 0; int j = 0; // unsigned int j = 0; unsigned int k = 0; for(i=0; i < gt.nrow(); i++){ // Iterate over variants (rows) if(mask[i] == TRUE){ std::vector<int> myalleles (1,0); for(j=0; j < gt.ncol(); j++){ // Iterate over samples (columns) if(gt(i, j) != NA_STRING){ nsample[i]++; // Increment sample count. // Rcout << "gt: " << gt(i, j) << "\n"; // Count alleles per sample. int unphased_as_na = 0; // 0 == FALSE std::vector < std::string > gt_vector; std::string gt2 = as<std::string>(gt(i,j)); vcfRCommon::gtsplit( gt2, gt_vector, unphased_as_na ); // Rcout << "gt_vector.size: " << gt_vector.size() << "\n"; for(k=0; k<gt_vector.size(); k++){ int myAllele = std::stoi(gt_vector[k]); // Rcout << " " << myAllele; // // If this genotype had an allele we did not previously observe // we'll have to grow the vector. while(myalleles.size() - 1 < myAllele){ myalleles.push_back(0); } myalleles[myAllele]++; } // Rcout << "\n\n"; } } // Concatenate allele counts into a comma delimited string. char buffer [50]; // int n; // n=sprintf(buffer, "%d", myalleles[0]); sprintf(buffer, "%d", myalleles[0]); for(j=1; (unsigned)j < myalleles.size(); j++){ // n=sprintf (buffer, "%s,%d", buffer, myalleles[j]); sprintf (buffer, "%s,%d", buffer, myalleles[j]); } allele_counts[i] = buffer; // Sum all alleles. int nalleles = myalleles[0]; for(j=1; (unsigned)j < myalleles.size(); j++){ nalleles = nalleles + myalleles[j]; } // Stats. 
double He = 1; He = He - pow(double(myalleles[0])/double(nalleles), myalleles.size()); for(j=1; (unsigned)j < myalleles.size(); j++){ He = He - pow(double(myalleles[j])/double(nalleles), myalleles.size()); } Hes[i] = He; Nes[i] = 1/(1-He); } else { // Missing variant (row=NA) nsample[i] = NA_INTEGER; } } return Rcpp::DataFrame::create(var_info, _["n"]=nsample, _["Allele_counts"]=allele_counts, _["He"]=Hes, _["Ne"]=Nes ); }