예제 #1
0
// [[Rcpp::export]]
void write_vcf_body( Rcpp::CharacterMatrix fix,
                     Rcpp::CharacterMatrix gt,
                     std::string filename,
                     int mask=0 ) {
  // Append the body (variant records) of a VCF file to a gzip stream.
  //
  // Every row of 'fix' becomes one tab-delimited line, followed by the
  // entries of the same row of 'gt' and a newline.  Missing values (NA) are
  // written as "." for columns 1 and up of 'fix' and as "./." for 'gt'.
  // No header is written here so that appending to an existing file works.
  //
  // Parameters:
  //   fix      - character matrix, one row per variant.
  //   gt       - character matrix indexed by the same rows as 'fix'.
  //   filename - file to append to (opened with mode "ab").  If it cannot be
  //              opened for reading, an empty file is created first.
  //   mask     - when 1, rows whose column index 6 is not "PASS" are skipped
  //              (presumably the VCF FILTER column -- confirm with callers).
  //
  // Errors:
  //   Calls Rcpp::stop() when the matrices are too small for the indices
  //   used below.  A failing gzopen()/gzwrite() is reported on Rcpp::Rcerr
  //   and ends the writing; nothing is thrown for those.
  //
  // http://stackoverflow.com/a/5649224

  // Closes the gzip handle on every exit path, including the exception that
  // Rcpp::checkUserInterrupt() raises when the user interrupts the loop.
  struct GzGuard {
    gzFile handle;
    explicit GzGuard( gzFile h ) : handle( h ) {}
    ~GzGuard() { if( handle ){ gzclose( handle ); } }
  };

  const int verbose = 0;  // Set to 1 to trace progress on Rcpp::Rcout.

  if( verbose == 1 ){
    Rcpp::Rcout << "Made it into the function!\n";
  }

  // Initialize file.
  // Note that gzfile does not tolerate initializing an empty file.
  // Use ofstream instead.
  if ( ! std::ifstream( filename ) ){
    if( verbose == 1 ){
      Rcpp::Rcout << "File does not exist." << std::endl;
    }

    std::ofstream myfile;
    myfile.open (filename, std::ios::out | std::ios::binary);
    myfile.close();
  }

  // In order for APPEND=TRUE to work the header
  // should not be printed here.

  if( verbose == 1 ){
    Rcpp::Rcout << "Matrix fix has " << fix.nrow() << " rows (variants).\n";
  }

  const int n_variants = fix.nrow();
  if( n_variants < 1 ){
    if( verbose == 1 ){
      Rcpp::Rcout << "No rows (variants).\n";
    }
    return;
  }

  if( verbose == 1 ){
    Rcpp::Rcout << "Processing the body (variants).\n";
  }

  const int n_fix_cols = fix.ncol();
  const int n_gt_cols  = gt.ncol();

  // The matrix accessors below are not bounds checked, so validate the
  // dimensions before any file handle is opened.
  if( n_fix_cols < 1 ){
    Rcpp::stop("Matrix fix has no columns.");
  }
  if( mask == 1 && n_fix_cols < 7 ){
    Rcpp::stop("mask = 1 requires matrix fix to have at least 7 columns.");
  }
  if( n_gt_cols > 0 && gt.nrow() < n_variants ){
    Rcpp::stop("Matrix gt has fewer rows than matrix fix.");
  }

  // There is at least one variant.
  gzFile fi = gzopen( filename.c_str(), "ab" );
  if (! fi) {
    Rcpp::Rcerr << "gzopen of " << filename << " failed: " << strerror (errno) << ".\n";
    // Nothing can be written without a handle.
    return;
  }
  GzGuard guard( fi );

  std::string line;  // Assemble each line before writing; reused across rows.
  for(int i = 0; i < n_variants; i++){
    Rcpp::checkUserInterrupt();

    if(mask == 1 && fix(i,6) != "PASS" ){
      // Don't print variant.
      continue;
    }

    // Print variant.  The first column is copied as is.
    line = fix(i,0);
    for(int j = 1; j < n_fix_cols; j++){
      line += '\t';
      if(fix(i,j) == NA_STRING){
        line += '.';
      } else {
        line += fix(i,j);
      }
    }

    // gt portion
    for(int j = 0; j < n_gt_cols; j++){
      line += '\t';
      if(gt(i, j) == NA_STRING){
        line += "./.";
      } else {
        line += gt(i, j);
      }
    }
    line += '\n';

    // 'line' is never empty here, so a result of 0 signals a write error.
    const int written = gzwrite(fi, line.c_str(), static_cast<unsigned>(line.size()));
    if( written <= 0 ){
      int zerr = 0;
      Rcpp::Rcerr << "gzwrite to " << filename << " failed: " << gzerror(fi, &zerr) << ".\n";
      break;
    }
  }

  if( verbose == 1 ){
    Rcpp::Rcout << "Finished processing the body (variants).\n";
  }
  // The guard closes 'fi' here.
}
예제 #2
0
// [[Rcpp::export]]
Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) {
  // Calculate popgen summaries for the sample.
  //
  // Parameters:
  //   var_info - data frame; its 'mask' column is read as a logical vector.
  //              (It should also contain 'CHROM' and 'POS', which are not
  //              read here.)
  //   gt       - character matrix of genotypes, variants in rows and samples
  //              in columns.  NA entries are ignored.
  //
  // Returns a data frame created from var_info plus the columns
  //   n             - number of non-NA genotypes in the row,
  //   Allele_counts - comma delimited count per allele index,
  //   He, Ne        - statistics derived from the allele counts.
  // Rows whose mask is not TRUE get n = NA and keep the initial values of the
  // other columns.
  //
  // Calls Rcpp::stop() when gt has more rows than var_info has mask entries.
  Rcpp::LogicalVector   mask = var_info["mask"];
  Rcpp::IntegerVector   nsample(mask.size());
  Rcpp::StringVector    allele_counts(mask.size());
  Rcpp::NumericVector   Hes(mask.size());
  Rcpp::NumericVector   Nes(mask.size());

  // Every result vector is indexed by the row of gt, so gt must not be longer.
  if( gt.nrow() > mask.size() ){
    Rcpp::stop("gt has more rows than var_info has mask entries.");
  }

  for(int i=0; i < gt.nrow(); i++){ // Iterate over variants (rows)
    if(mask[i] == TRUE){
      // myalleles[a] counts how often allele index a was observed in this row.
      std::vector<int> myalleles (1,0);
      for(int j=0; j < gt.ncol(); j++){ // Iterate over samples (columns)
        if(gt(i, j) != NA_STRING){
          nsample[i]++;  // Increment sample count.

          // Count alleles per sample.
          std::vector < int > intv = gtsplit(as<std::string>(gt(i, j)));
          for(std::size_t k=0; k<intv.size(); k++){
            const int allele = intv[k];
            if( allele < 0 ){
              // A negative index cannot address a counter.  Compared against
              // an unsigned size it would look huge and grow the vector
              // without bound, so it is skipped instead.
              continue;
            }
            if( myalleles.size() <= static_cast<std::size_t>(allele) ){
              // We have more alleles than exist in the vector myalleles.
              myalleles.resize(static_cast<std::size_t>(allele) + 1, 0);
            }
            myalleles[allele]++;
          }
        }
      }

      // Concatenate allele counts into a comma delimited string and sum all
      // alleles.  A std::string is used because a fixed char buffer can
      // overflow, and sprintf() must not read from the buffer it writes to.
      std::string counts = std::to_string(myalleles[0]);
      int nalleles = myalleles[0];
      for(std::size_t a=1; a < myalleles.size(); a++){
        counts += ',';
        counts += std::to_string(myalleles[a]);
        nalleles = nalleles + myalleles[a];
      }
      allele_counts[i] = counts;

      // Stats.
      // NOTE(review): the exponent is the number of allele counters, not 2;
      // confirm that this is the intended definition of He.
      const double exponent = static_cast<double>(myalleles.size());
      double He = 1;
      for(std::size_t a=0; a < myalleles.size(); a++){
        He = He - pow(double(myalleles[a])/double(nalleles), exponent);
      }
      Hes[i] = He;
      Nes[i] = 1/(1-He);
    } else { // Missing variant (row=NA)
      nsample[i] = NA_INTEGER;
    }
  }

  return Rcpp::DataFrame::create(var_info, 
      _["n"]=nsample, 
      _["Allele_counts"]=allele_counts,
      _["He"]=Hes,
      _["Ne"]=Nes
  );
}
예제 #3
0
//' @export
// [[Rcpp::export(name=".gt_to_popsum")]]
Rcpp::DataFrame gt_to_popsum(Rcpp::DataFrame var_info, Rcpp::CharacterMatrix gt) {
  // Calculate popgen summaries for the sample.
  //
  // Parameters:
  //   var_info - data frame; its 'mask' column is read as a logical vector.
  //              (It should also contain 'CHROM' and 'POS', which are not
  //              read here.)
  //   gt       - character matrix of genotypes, variants in rows and samples
  //              in columns.  NA entries are ignored.
  //
  // Returns a data frame created from var_info plus the columns
  //   n             - number of non-NA genotypes in the row,
  //   Allele_counts - comma delimited count per allele index,
  //   He, Ne        - statistics derived from the allele counts.
  // Rows whose mask is not TRUE get n = NA and keep the initial values of the
  // other columns.
  //
  // Calls Rcpp::stop() when gt has more rows than var_info has mask entries.
  // std::stoi() throws when an allele string is not numeric.
  Rcpp::LogicalVector   mask = var_info["mask"];
  Rcpp::IntegerVector   nsample(mask.size());
  Rcpp::StringVector    allele_counts(mask.size());
  Rcpp::NumericVector   Hes(mask.size());
  Rcpp::NumericVector   Nes(mask.size());

  // Every result vector is indexed by the row of gt, so gt must not be longer.
  if( gt.nrow() > mask.size() ){
    Rcpp::stop("gt has more rows than var_info has mask entries.");
  }

  for(int i=0; i < gt.nrow(); i++){ // Iterate over variants (rows)
    if(mask[i] == TRUE){
      // myalleles[a] counts how often allele index a was observed in this row.
      std::vector<int> myalleles (1,0);
      for(int j=0; j < gt.ncol(); j++){ // Iterate over samples (columns)
        if(gt(i, j) != NA_STRING){
          nsample[i]++;  // Increment sample count.

          // Count alleles per sample.
          int unphased_as_na = 0; // 0 == FALSE
          std::vector < std::string > gt_vector;
          std::string gt2 = as<std::string>(gt(i,j));
          vcfRCommon::gtsplit( gt2, gt_vector, unphased_as_na );

          for(std::size_t k=0; k<gt_vector.size(); k++){
            const int myAllele = std::stoi(gt_vector[k]);
            if( myAllele < 0 ){
              // A negative index cannot address a counter.  Compared against
              // an unsigned size it would look huge and grow the vector
              // without bound, so it is skipped instead.
              continue;
            }
            // If this genotype had an allele we did not previously observe
            // we'll have to grow the vector.
            if( myalleles.size() <= static_cast<std::size_t>(myAllele) ){
              myalleles.resize(static_cast<std::size_t>(myAllele) + 1, 0);
            }
            myalleles[myAllele]++;
          }
        }
      }

      // Concatenate allele counts into a comma delimited string and sum all
      // alleles.  A std::string is used because a fixed char buffer can
      // overflow, and sprintf() must not read from the buffer it writes to.
      std::string counts = std::to_string(myalleles[0]);
      int nalleles = myalleles[0];
      for(std::size_t a=1; a < myalleles.size(); a++){
        counts += ',';
        counts += std::to_string(myalleles[a]);
        nalleles = nalleles + myalleles[a];
      }
      allele_counts[i] = counts;

      // Stats.
      // NOTE(review): the exponent is the number of allele counters, not 2;
      // confirm that this is the intended definition of He.
      const double exponent = static_cast<double>(myalleles.size());
      double He = 1;
      for(std::size_t a=0; a < myalleles.size(); a++){
        He = He - pow(double(myalleles[a])/double(nalleles), exponent);
      }
      Hes[i] = He;
      Nes[i] = 1/(1-He);
    } else { // Missing variant (row=NA)
      nsample[i] = NA_INTEGER;
    }
  }

  return Rcpp::DataFrame::create(var_info, 
      _["n"]=nsample, 
      _["Allele_counts"]=allele_counts,
      _["He"]=Hes,
      _["Ne"]=Nes
  );
}