Beispiel #1
0
//' @title Parse data by a bed file
//' @rdname bedify
//' @name bedify
//' 
//' @description Seperate a data matrix into list elements based on coordinates from bed format data.
//' 
//' @param myBed matrix of bed format data
//' @param myData StringMatrix or IntegerMatrix to be sorted
//' @param fill_missing include records for when there is no data (0, 1).  By default these records are omitted.
//' @param verbose should verbose output be generated (0, 1)
//' 
//' @details
//' 
//' \strong{Bed format} data contain at least three columns.
//' The first column indicates the chromosome (i.e., supercontig, scaffold, contig, etc.).
//' The second cotains the starting positions.
//' The third the ending positions.
//' Optional columns are in columns four through nine.
//' For example, the fourth column may contain the names of features.
//' All subsequent columns are ignored here.
//' In an attempt to optimize performance the data are expected to be formatted as a character matrix.
//' The starting and end positions are converted to numerics internally.
//' 
//' The \strong{matrix format} used here is based on vcf type data.
//' Typically these data have a chromosome as the first column.
//' Each chromosome has its own coordinate system which begins at one.
//' This means that using multiple chromosomes will necessitate some fix to the coordinate systems.
//' Here I take the perspective that you should simply work on one chromosome at a time, so the chromosome information is ignored.
//' The first column is the chromosome, which I ignore.
//' The second column is the position, which is used for sorting.
//' Subsequent columns are not treated but are brought along with the subset.
//' 
//' 
//' When the matrix is of numeric form the first column, which contains the chromosome identifier (CHROM), must also be numeric.
//' This is because matrix elements must all be of the same type.
//' 
//' 
//' \href{https://genome.ucsc.edu/FAQ/FAQformat.html#format1}{Bed format} at UCSC
//' 
//' 
//' @examples
//' 
//' bed <- structure(c("chr_290", "chr_4176", "chr_126921", "chr_126921", 
//' "chr_125157", "chr_125157", "chr_125157", "chr_125157", "chr_126888", 
//' "chr_126888", "47", "400", "4344", "1", "3712", "6025", "2269", 
//' "1779", "7930", "4637", "80", "500", "4967", "9066", "6566", 
//' "6450", "2933", "2226", "11939", "7913", "gene_1", "gene_2", 
//' "gene_3", "gene_4", "gene_5", "gene_6", "gene_7", "gene_8", "gene_9", 
//' "gene_10"), .Dim = c(10L, 4L), .Dimnames = list(NULL, c("chrom", 
//' "chromStart", "chromEnd", "name")))
//' 
//' 
//' vcf.matrix <- structure(c("chr_290", "chr_290", "chr_4176", "chr_4176", "chr_50514", 
//' "chr_64513", "chr_107521", "chr_121987", "chr_122006", "chr_122006", 
//' "78", "96", "406", "425", "863", "2853", "77", "103", "243", 
//' "636", "0/1:5,4:9:99:117,0,153", "0/0:9,0:9:99:0,27,255", "0/1:10,11:21:99:255,0,255", 
//' "0/1:10,11:21:99:255,0,255", "0/1:14,14:28:99:255,0,255", "0/1:29,13:42:99:255,0,255", 
//' "0/1:26,11:37:99:255,0,255", "0/1:21,14:35:99:255,0,255", "0/0:12,1:13:67:0,4,255", 
//' "0/1:55,8:63:99:99,0,255", "0/1:10,8:18:99:234,0,255", "0/0:17,0:17:99:0,51,255", 
//' "0/1:16,13:29:99:255,0,255", "0/1:16,13:29:99:255,0,255", "0/1:26,19:45:99:255,0,255", 
//' "0/1:50,19:69:99:255,0,255", "0/1:62,17:79:99:255,0,255", "0/1:95,22:117:99:255,0,255", 
//' "0/1:32,5:37:99:68,0,255", "0/1:69,21:90:99:255,0,255"), .Dim = c(10L, 
//' 4L), .Dimnames = list(NULL, c("CHROM", "POS", "sample_1", "sample_2"
//' )))
//' 
//' 
//' class(bed)
//' is.character(bed)
//' class(vcf.matrix)
//' is.character(vcf.matrix)
//' 
//' var.list <- bedify(bed, vcf.matrix)
//' table(unlist(lapply(var.list, nrow)))
//' 
//' @export
// [[Rcpp::export]]
Rcpp::List bedify( Rcpp::StringMatrix myBed,
                   Rcpp::StringMatrix myData, 
                   int fill_missing = 0, 
                   int verbose = 0 ) {

  // Start a timer
  time_t result = time(nullptr);

  // Initialize return List
  Rcpp::List myList(myBed.nrow());

  // Col names for each return matrix
  Rcpp::StringVector myColNames = Rcpp::colnames(myData);
  
  Rcpp::StringVector myListNames( myBed.nrow() );
  for(int i = 0; i < myBed.nrow(); i++){
    myListNames(i) = myBed( i, 3 );
  }
  myList.attr( "names" ) = myListNames;


  // Convert POS to ints
  std::vector< int > POS = get_pos(myData);

  
  // Begin to parse the data.
  //
  // Scroll over bed rows (features).
  for( int i=0; i<myBed.nrow(); i++ ){
    Rcpp::checkUserInterrupt();

    if( verbose == 1){
      Rcpp::Rcout << "Searching for annotation " << i + 1 << ": " << myBed(i,3);
      Rcpp::Rcout << " on CHROM " << myBed(i,0) << ":" << myBed(i,1) << "," << myBed(i,2) ;
      Rcpp::Rcout << " at " << time(nullptr) - result << " seconds.\n";
    }
    
    // Send one annotation (bed row) to proc_feature.
    myList(i) = proc_feature( myBed(i,Rcpp::_), myData, fill_missing );
//    myList(i) = proc_feature( myBed(i,Rcpp::_), POS, myData, fill_missing );
    
    Rcpp::colnames(myList(i)) = myColNames;

  }

//  myList.attr("names") = myNames;

  return myList;
}
Beispiel #2
0
std::vector< int > get_pos( Rcpp::StringMatrix myData ){
  std::vector< int > POS(myData.nrow());
  
  for(int i=0; i<POS.size(); i++){
    std::string temp = Rcpp::as< std::string >(myData(i,1));
    POS[i] = atoi(temp.c_str());
//    POS[i] = stoi(temp);
  }
  
  return(POS);
}
Beispiel #3
0
Rcpp::StringMatrix proc_feature( Rcpp::StringVector myBed,
//                                 std::vector< int > POS,
                                 Rcpp::StringMatrix myData,
                                 int fill_missing
                                 ){
  
  // myBed is a StringVector:
  //  myBed(0) = chrom
  //  myBed(1) = chromStart
  //  myBed(2) = chromEnd
  //  myBed(3) = name
  
  // myData is a StringMatrix
  // column 0 = CHROM
  // column 1 = POS
  
  // Convert POS to ints
  std::vector< int > POS = get_pos(myData);
  
  // Create an empty matrix to return in exceptions.
  Rcpp::StringMatrix MT_matrix(0, myData.ncol());

  // Rcpp::StringVector myBed includes start and stop integer coordinates.
  // Convert Rcpp::StringVector elements to int
  std::string temp = Rcpp::as< std::string >( myBed(1) );
  int start = atoi(temp.c_str());
  temp = Rcpp::as< std::string >( myBed(2) );
  int end = atoi(temp.c_str());

  // Manage if feature is on reverse strand
  if(end < start){
    int tmp = start;
    start = end;
    end = tmp;
  }

  // Increment i so that chromosome in myData
  // matches the chromosome in the single BED record.
  int i = 0; // Data row counter
  while ( myData(i,0) != myBed(0) && i < myData.nrow() ){
    i++;
  }
  
  // If we didn't find the chromosome return an empty matrix.
  if( i == myData.nrow() & fill_missing != 1 ){
    return MT_matrix;
  }
//  Rcpp::Rcout << "  Found CHROM " << myBed(0) << " in myData CHROM:" << myData(i,0) << " POS:" << myData(i,1) << "\n";

  // We should now have i at the correct chromosome in myData.
  // POS is the integer recast of POS in myData.
  // We can now increment to the correct position in the chromosome
  // by incrementing to the start of teh annotation.
  //
  // Increment to POS.
  while( myData(i,0) == myBed(0) && POS[i] < start && i < myData.nrow() ){
    i++;
  }
  // If we didn't find the POS return an empty matrix.
  if( i == myData.nrow() & fill_missing != 1 ){
    return MT_matrix;
  }
//  Rcpp::Rcout << "  Found CHROM " << myBed(0) << " in myData POS:" << POS[i] << "\n";
  
  
  // Increment to the end of the feature
  int j=i;
  while( myData(j,0) == myBed(0) && POS[j] <= end && j < myData.nrow() ){
    j++;
  }
//  Rcpp::Rcout << "  Found end of feature at: " << POS[j-1] << "\n";


  // We now have the information to declare a return matrix
  // and populate it.
  if( fill_missing != 1 ){
    // Do not fill missubg data.
    Rcpp::StringMatrix myMatrix( j-i , myData.ncol());
    Rcpp::colnames(myMatrix) = Rcpp::colnames(myData);
    // Populate the return matrix
    for(int k = 0; k < myMatrix.nrow(); k++){
      Rcpp::checkUserInterrupt();
      myMatrix(k, Rcpp::_) = myData(k+i, Rcpp::_);
    }
    return myMatrix;
  } else {
    // Fill missing data.
    Rcpp::StringMatrix myMatrix( end - start + 1 , myData.ncol());
    Rcpp::colnames(myMatrix) = Rcpp::colnames(myData);

    // Populate the return matrix
    if( i >= myData.nrow()){
      // No data
      for(int k = 0; k < myMatrix.nrow(); k++){
        Rcpp::checkUserInterrupt();
        myMatrix(k,0) = myBed(0);
//        myMatrix(k,1) = std::to_string(start + k);
        std::ostringstream stm;
        stm << start + k;
        myMatrix(k,1) = stm.str();
        
        myMatrix(k,2) = NA_STRING;
//        for(int m=2; m<myMatrix.ncol(); m++){
//          myMatrix(k,m) = NA_STRING;
//        }
      }
    } else {
      // Data and possibly missing data    
      int l = 0;
      for(int k = 0; k < myMatrix.nrow(); k++){
        Rcpp::checkUserInterrupt();
        
        if( i + l < myData.nrow() ){
          // We have not overrun the file yet
          temp = Rcpp::as< std::string >( myData( i+l , 1 ) );
          int myPOS = atoi(temp.c_str());
//          int myPOS = stoi(temp);

          if( myPOS == start + k ){
//            myMatrix(k, Rcpp::_) = myData(k+i, Rcpp::_);
            myMatrix(k, Rcpp::_) = myData( i + l, Rcpp::_);

            l++;
          } else {
            myMatrix(k,0) = myBed(0);
//            myMatrix(k,1) = std::to_string(myPOS);
            std::ostringstream stm;
            stm << myPOS;
            myMatrix(k,1) = stm.str();
            
            myMatrix(k,2) = NA_STRING;
            //myMatrix(k,1) = myBed(1) + k;
            //for(int m=2; m<myMatrix.ncol(); m++){
            //  myMatrix(k,m) = NA_STRING;
            //}
          }
        } else {
          // We've overrun the rows in the file.
          myMatrix(k,0) = myBed(0);
//          myMatrix(k,1) = std::to_string( start + k );
          std::ostringstream stm;
          stm << start + k;
          myMatrix(k,1) = stm.str();

          myMatrix(k,2) = NA_STRING;
          //myMatrix(k,1) = myBed(1) + k;
          //for(int m=2; m<myMatrix.ncol(); m++){
          //  myMatrix(k,m) = NA_STRING;
          //}
        }
      }
    }
    return myMatrix;
  }

  Rcpp::Rcerr << "You should never get here, something bad has happened!\n";
}
Beispiel #4
0
//' @rdname is_het
//' @name is_het
//' 
//' 
//' 
//' @export
// [[Rcpp::export]]
Rcpp::LogicalMatrix is_het(Rcpp::StringMatrix x,
                           Rcpp::LogicalVector na_is_false = true
){

  // NA matrix to return in case of unexpected results.
//  Rcpp::LogicalMatrix nam( 1, 1 );
//  nam(0,0) = NA_LOGICAL;
  
  // Initialize return data matrix.
  Rcpp::LogicalMatrix hets( x.nrow(), x.ncol() );
  hets.attr("dimnames") = x.attr("dimnames");
  
  int i;
  int j;
  int k;  
  for( i=0; i<x.nrow(); i++){
    for( j=0; j<x.ncol(); j++){

      // Parse genotype string into alleles.
      std::string my_string;
      if( x(i,j) == NA_STRING ){
        my_string = ".";
      } else {
        my_string = x(i,j);
      }


      std::vector < std::string > allele_vec;
//      vcfRCommon::strsplit(my_string, allele_vec, my_split);
      int unphased_as_na = 0; // 0 == FALSE
      vcfRCommon::gtsplit( my_string, allele_vec, unphased_as_na );

//      Rcpp::Rcout << "gtsplit returned: " << allele_vec[0];
//      for( k=1; k<allele_vec.size(); k++){
//        Rcpp::Rcout << "," << allele_vec[k];
//      }
//      Rcpp::Rcout << "\n";


      // Initialize new vector of alleles with first element of allele_vec.
      std::vector < std::string > allele_vec2;

      // Scroll through vector looking for alleles.
      for(k=0; k<allele_vec.size(); k++){
        if( allele_vec[k] == "." ){
          // Found missing value.
          // Delete and bail out.
          while( allele_vec2.size() > 0 ){
            allele_vec2.erase( allele_vec2.begin() );
          }
          k = allele_vec.size();
        } else if( allele_vec2.size() == 0 ){
          // Initialize.
          allele_vec2.push_back( allele_vec[k] );
        } else if( allele_vec2[0] != allele_vec[k] ){
          allele_vec2.push_back( allele_vec[k] );
        }
      }
      
//      Rcpp::Rcout << "allele_vec2.size(): " << allele_vec2.size();
//      Rcpp::Rcout << "\n";
//      Rcpp::Rcout << "\n";
      
      // Score return value.
      if( allele_vec2.size() == 0){
        if( na_is_false[0] == true ){
          hets(i,j) = false;
        } else if( na_is_false[0] == false ){
          hets(i,j) = NA_LOGICAL;
        }
      } else if( allele_vec2.size() == 1){
        hets(i,j) = false;
      } else if( allele_vec2.size() > 1){
        hets(i,j) = true;
      }

    }
  }
  
  return( hets );
}