// [[Rcpp::export]] void write_vcf_body( Rcpp::CharacterMatrix fix, Rcpp::CharacterMatrix gt, std::string filename, int mask=0 ) { // http://stackoverflow.com/a/5649224 // int verbose = 0; // int verbose = 1; if( verbose == 1 ){ Rcpp::Rcout << "Made it into the function!\n"; } int i = 0; // Rows int j = 0; // Columns std::string tmpstring; // Assemble each line before writing // Initialize filehandle. gzFile fi; // Initialize file. // Note that gzfile does not tolerate initializing an empty file. // Use ofstream instead. if ( ! std::ifstream( filename ) ){ if( verbose == 1 ){ Rcpp::Rcout << "File does not exist." << std::endl; } std::ofstream myfile; myfile.open (filename, std::ios::out | std::ios::binary); myfile.close(); // This should make valgrind hang. // Or not??? // fi = gzopen( filename.c_str(), "ab" ); // gzclose(fi); } // In order for APPEND=TRUE to work the header // should not be printed here. if( verbose == 1 ){ Rcpp::Rcout << "Matrix fix has " << fix.nrow() << " rows (variants).\n"; } // Manage body if( fix.nrow() >= 1 ){ if( verbose == 1 ){ Rcpp::Rcout << "Processing the body (variants).\n"; } // There is at least one variant. fi = gzopen( filename.c_str(), "ab" ); if (! fi) { Rcpp::Rcerr << "gzopen of " << filename << " failed: " << strerror (errno) << ".\n"; } for(i = 0; i < fix.nrow(); i++){ Rcpp::checkUserInterrupt(); if(mask == 1 && fix(i,6) != "PASS" ){ // Don't print variant. } else { // Print variant. j = 0; tmpstring = fix(i,j); for(j = 1; j < fix.ncol(); j++){ if(fix(i,j) == NA_STRING){ tmpstring = tmpstring + "\t" + "."; } else { tmpstring = tmpstring + "\t" + fix(i,j); } } // gt portion for(j = 0; j < gt.ncol(); j++){ if(gt(i, j) == NA_STRING){ tmpstring = tmpstring + "\t" + "./."; } else { tmpstring = tmpstring + "\t" + gt(i, j); } } gzwrite(fi, tmpstring.c_str(), tmpstring.size()); gzwrite(fi,"\n",strlen("\n")); } } if( verbose == 1 ){ Rcpp::Rcout << "Finished processing the body (variants).\n"; } gzclose(fi); } else { if( verbose == 1 ){ Rcpp::Rcout << "No rows (variants).\n"; } } // return void; }
// [[Rcpp::export]] Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) { // Calculate popgen summaries for the sample. // var_info should contain columns named 'CHROM', 'POS', 'mask' and possibly others. Rcpp::LogicalVector mask = var_info["mask"]; Rcpp::IntegerVector nsample(mask.size()); Rcpp::StringVector allele_counts(mask.size()); Rcpp::NumericVector Hes(mask.size()); Rcpp::NumericVector Nes(mask.size()); int i = 0; int j = 0; int k = 0; for(i=0; i < gt.nrow(); i++){ // Iterate over variants (rows) if(mask[i] == TRUE){ std::vector<int> myalleles (1,0); for(j=0; j < gt.ncol(); j++){ // Iterate over samples (columns) if(gt(i, j) != NA_STRING){ nsample[i]++; // Increment sample count. // Count alleles per sample. std::vector < int > intv = gtsplit(as<std::string>(gt(i, j))); for(k=0; k<intv.size(); k++){ while(myalleles.size() - 1 < intv[k]){ // We have more alleles than exist in the vector myalleles. myalleles.push_back(0); } myalleles[intv[k]]++; } } } // Concatenate allele counts into a comma delimited string. int n; char buffer [50]; n = sprintf (buffer, "%d", myalleles[0]); for(j=1; j < myalleles.size(); j++){ n=sprintf (buffer, "%s,%d", buffer, myalleles[j]); } allele_counts[i] = buffer; // Sum all alleles. int nalleles = myalleles[0]; for(j=1; j < myalleles.size(); j++){ nalleles = nalleles + myalleles[j]; } // Stats. double He = 1; He = He - pow(double(myalleles[0])/double(nalleles), myalleles.size()); for(j=1; j < myalleles.size(); j++){ He = He - pow(double(myalleles[j])/double(nalleles), myalleles.size()); } Hes[i] = He; Nes[i] = 1/(1-He); } else { // Missing variant (row=NA) nsample[i] = NA_INTEGER; } } return Rcpp::DataFrame::create(var_info, _["n"]=nsample, _["Allele_counts"]=allele_counts, _["He"]=Hes, _["Ne"]=Nes ); }
//' @export // [[Rcpp::export(name=".gt_to_popsum")]] Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) { // Calculate popgen summaries for the sample. // var_info should contain columns named 'CHROM', 'POS', 'mask' and possibly others. Rcpp::LogicalVector mask = var_info["mask"]; Rcpp::IntegerVector nsample(mask.size()); Rcpp::StringVector allele_counts(mask.size()); Rcpp::NumericVector Hes(mask.size()); Rcpp::NumericVector Nes(mask.size()); int i = 0; int j = 0; // unsigned int j = 0; unsigned int k = 0; for(i=0; i < gt.nrow(); i++){ // Iterate over variants (rows) if(mask[i] == TRUE){ std::vector<int> myalleles (1,0); for(j=0; j < gt.ncol(); j++){ // Iterate over samples (columns) if(gt(i, j) != NA_STRING){ nsample[i]++; // Increment sample count. // Rcout << "gt: " << gt(i, j) << "\n"; // Count alleles per sample. int unphased_as_na = 0; // 0 == FALSE std::vector < std::string > gt_vector; std::string gt2 = as<std::string>(gt(i,j)); vcfRCommon::gtsplit( gt2, gt_vector, unphased_as_na ); // Rcout << "gt_vector.size: " << gt_vector.size() << "\n"; for(k=0; k<gt_vector.size(); k++){ int myAllele = std::stoi(gt_vector[k]); // Rcout << " " << myAllele; // // If this genotype had an allele we did not previously observe // we'll have to grow the vector. while(myalleles.size() - 1 < myAllele){ myalleles.push_back(0); } myalleles[myAllele]++; } // Rcout << "\n\n"; } } // Concatenate allele counts into a comma delimited string. char buffer [50]; // int n; // n=sprintf(buffer, "%d", myalleles[0]); sprintf(buffer, "%d", myalleles[0]); for(j=1; (unsigned)j < myalleles.size(); j++){ // n=sprintf (buffer, "%s,%d", buffer, myalleles[j]); sprintf (buffer, "%s,%d", buffer, myalleles[j]); } allele_counts[i] = buffer; // Sum all alleles. int nalleles = myalleles[0]; for(j=1; (unsigned)j < myalleles.size(); j++){ nalleles = nalleles + myalleles[j]; } // Stats. double He = 1; He = He - pow(double(myalleles[0])/double(nalleles), myalleles.size()); for(j=1; (unsigned)j < myalleles.size(); j++){ He = He - pow(double(myalleles[j])/double(nalleles), myalleles.size()); } Hes[i] = He; Nes[i] = 1/(1-He); } else { // Missing variant (row=NA) nsample[i] = NA_INTEGER; } } return Rcpp::DataFrame::create(var_info, _["n"]=nsample, _["Allele_counts"]=allele_counts, _["He"]=Hes, _["Ne"]=Nes ); }