コード例 #1
0
ファイル: pcr_duplicates.cpp プロジェクト: mrG7/HipSTR
// Look up the library that an alignment's read group belongs to.
//
// aln           : alignment whose "RG" tag identifies its read group
// rg_to_library : mapping from read group id to library name
//
// Returns the library name stored for the alignment's read group.
// Terminates via printErrorAndDie() if the RG tag cannot be read or if the
// read group has no entry in rg_to_library.
std::string get_library(BamTools::BamAlignment& aln, std::map<std::string, std::string>& rg_to_library){
  // GetTag() reports failure through its return value, so check it directly.
  // Otherwise an unreadable tag would leave rg empty and surface later as a
  // misleading "No library found" error for an empty read group.
  std::string rg;
  if (!aln.GetTag("RG", rg))
    printErrorAndDie("Failed to retrieve BAM alignment's RG tag");

  auto iter = rg_to_library.find(rg);
  if (iter == rg_to_library.end())
    printErrorAndDie("No library found for read group " + rg + " in BAM file headers");
  return iter->second;
}
コード例 #2
0
ファイル: Pedigree.cpp プロジェクト: mrG7/HipSTR
// Build the pedigree graph from a whitespace-delimited .ped file.
//
// Every line must provide at least four fields, in this order:
//   family  child  father  mother
// Any additional fields on the line are ignored. A parent id of "0" is
// treated as "no parent"; a child id of "0" is rejected.
//
// Returns the result of topological_sort() on the nodes that were created.
// Terminates via printErrorAndDie() if the file cannot be opened, a line
// has fewer than four fields, or a child id is "0".
bool PEDIGREE_GRAPH::build(std::string filename) {
  std::ifstream input(filename.c_str());
  if (!input.is_open())
    printErrorAndDie("Failed to open pedigree file " + filename);

  std::map<std::string, PEDIGREE_NODE*> samples; // Sample id -> node
  std::vector<PEDIGREE_NODE*> nodes;             // Nodes in order of first appearance
  std::string line;
  // TODO: Skip the header line. Calling std::getline(input, line) here
  // reportedly triggers an unexpected exception, so the first line is
  // currently parsed like any other record.
  while (std::getline(input, line)){
    std::istringstream iss(line);
    std::string family, child, father, mother;
    if(! (iss >> family >> child >> father >> mother))
      printErrorAndDie("Improperly formated .ped pedigree file " + filename);

    if (child.compare("0") == 0)
      printErrorAndDie("Invalid individual id " + child);

    // Create new nodes for any previously unseen samples that have
    // an identifier other than 0. Each node must be registered under
    // its OWN identifier so that the lookups below find it.
    if (samples.find(child) == samples.end()){
      PEDIGREE_NODE* new_node = new PEDIGREE_NODE(child);
      samples[child] = new_node;
      nodes.push_back(new_node);
    }
    if (mother.compare("0") != 0 && samples.find(mother) == samples.end()){
      PEDIGREE_NODE* new_node = new PEDIGREE_NODE(mother);
      samples[mother] = new_node;
      nodes.push_back(new_node);
    }
    if (father.compare("0") != 0 && samples.find(father) == samples.end()){
      PEDIGREE_NODE* new_node = new PEDIGREE_NODE(father);
      samples[father] = new_node;
      nodes.push_back(new_node);
    }

    // Store relationships in node instance
    PEDIGREE_NODE* child_node  = samples.find(child)->second;
    PEDIGREE_NODE* mother_node = (mother.compare("0") == 0 ? NULL : samples.find(mother)->second);
    PEDIGREE_NODE* father_node = (father.compare("0") == 0 ? NULL : samples.find(father)->second);
    child_node->set_mother(mother_node);
    child_node->set_father(father_node);
    if (mother_node != NULL) mother_node->add_child(child_node);
    if (father_node != NULL) father_node->add_child(child_node);
  }
  input.close();

  // Sort nodes in pedigree graph topologically
  return topological_sort(nodes);
}
コード例 #3
0
ファイル: base_caller.cpp プロジェクト: tfwillems/BaseCaller
// Process the reads in a BAM file and write the resulting counts to `out`.
//
// max_read_length   : sizes the count buffers (25 and 5 entries per position)
// bam_file          : path of the BAM file to open
// fasta_file        : reference FASTA to load, or "N/A" to skip loading one
// fasta_dir         : forwarded to readFasta() and process_reads()
// skip_soft_clipped : forwarded to process_reads()
// out               : receives the forward count, the backward count (one per
//                     line) and then the output of print_confusion_matrix()
//
// Terminates via printErrorAndDie() if the BAM file cannot be opened.
void compute_confusion_matrix(int32_t max_read_length, std::string bam_file, std::string fasta_file, std::string fasta_dir, bool skip_soft_clipped, std::ostream& out){
  BamTools::BamReader bam_reader;
  if (!bam_reader.Open(bam_file))
    printErrorAndDie("Failed to open BAM file");

  // With no FASTA supplied the reference id is -2 and the sequence stays
  // empty; otherwise the sequence is loaded up front and the id is 0
  std::string ref_seq;
  int32_t ref_id = -2;
  if (fasta_file.compare("N/A") != 0){
    readFasta(fasta_file, fasta_dir, ref_seq);
    ref_id = 0;
  }

  // Zero-initialized count buffers filled in by process_reads()
  int32_t* matrix_counts = new int32_t [25*max_read_length]();
  int32_t* total_counts  = new int32_t [5*max_read_length]();
  int32_t num_forward  = 0;
  int32_t num_backward = 0;
  process_reads(bam_reader, max_read_length, ref_id, ref_seq, fasta_dir, skip_soft_clipped,
                matrix_counts, total_counts, num_forward, num_backward);

  out << num_forward << "\n";
  out << num_backward << std::endl;
  print_confusion_matrix(matrix_counts, total_counts, max_read_length, out);

  delete [] matrix_counts;
  delete [] total_counts;
}
コード例 #4
0
ファイル: Pedigree.cpp プロジェクト: mrG7/HipSTR
// Replace the contents of sample_set with the lines of input_file.
//
// Each line of the file is inserted verbatim as one set element; any
// previous contents of sample_set are discarded before the file is opened.
// Terminates via printErrorAndDie() if the file cannot be opened.
void read_sample_list(std::string input_file, std::set<std::string>& sample_set){
  sample_set.clear();

  std::ifstream input(input_file);
  if (!input.is_open())
    printErrorAndDie("Unable to open sample list file " + input_file);

  for (std::string sample; std::getline(input, sample); )
    sample_set.insert(sample);
}
コード例 #5
0
ファイル: read_vcf_priors_test.cpp プロジェクト: mrG7/HipSTR
int main(int argc, char* argv[]){
  if (argc != 3)
    printErrorAndDie("Script requires exactly 2 arguments");
  std::string region_file = std::string(argv[1]);
  std::string vcf_file    = std::string(argv[2]);

  // Read list of regions
  std::vector<Region> regions;  
  readRegions(region_file, regions, 1000, "", std::cerr);

  vcflib::VariantCallFile ref_vcf;
  if(!ref_vcf.open(vcf_file))
    printErrorAndDie("Failed to open VCF");

  // Populate map with samples in VCF header
  std::map<std::string, int> sample_indices;
  for (unsigned int i = 0; i < ref_vcf.sampleNames.size(); i++)
    sample_indices[ref_vcf.sampleNames[i]] = i;

  std::vector<std::string> alleles;
  std::vector<bool> got_priors;
  int32_t pos;
  for (unsigned int i = 0; i < regions.size(); i++){
    bool success;
    double* priors = extract_vcf_alleles_and_log_priors(&ref_vcf, &(regions[i]), sample_indices, alleles, got_priors, pos, success, std::cerr);

    if (success){
      std::cerr << "Position=" << pos << std::endl;
      std::cerr << "Alleles:" << std::endl;
      for (unsigned int j = 0; j < alleles.size(); j++)
	std::cerr << alleles[j] << std::endl;
    }
    else {
      std::cerr << "Failed to read alleles and priors for region " << regions[i].str() << std::endl;
    }
      
    alleles.clear();
    got_priors.clear();
    delete [] priors;
  }

}
コード例 #6
0
ファイル: snp_bam_processor.cpp プロジェクト: mrG7/HipSTR
// Read the haplotype assignment stored in an alignment's HAPLOTYPE_TAG.
//
// Returns -1 if the alignment carries no such tag, otherwise the tag's
// value. Terminates via printErrorAndDie() if the tag is present but cannot
// be read as an 8-bit unsigned integer.
//
// NOTE: the 1-or-2 range check is an assert, so it is compiled out when
// NDEBUG is defined.
int SNPBamProcessor::get_haplotype(BamTools::BamAlignment& aln){
  if (!aln.HasTag(HAPLOTYPE_TAG))
    return -1;

  uint8_t haplotype = 0;
  if (!aln.GetTag(HAPLOTYPE_TAG, haplotype))
    printErrorAndDie("Failed to extract haplotype tag");

  assert(haplotype == 1 || haplotype == 2);
  return static_cast<int>(haplotype);
}
コード例 #7
0
ファイル: base_quality.cpp プロジェクト: mrG7/HipSTR
// Combine several base quality strings into a single string by averaging
// the per-base error probabilities.
//
// qualities : non-empty collection of pointers to quality strings, all of
//             the same length
//
// Returns a string of that same length in which character i is
// closest_char() applied to the log of the mean error probability of base i
// across all inputs. Terminates via printErrorAndDie() if the input strings
// differ in length.
std::string BaseQuality::average_base_qualities(std::vector<const std::string*> qualities){
  assert(qualities.size() > 0);
  const size_t num_bases = qualities[0]->size();

  // Check that all base quality strings are of the same length
  for (size_t i = 0; i < qualities.size(); i++){
    if (qualities[i]->size() != num_bases)
      printErrorAndDie("All base quality strings must be of the same length when averaging probabilities");
  }

  // Average raw error probabilities for each base and convert
  // to the closest quality score.
  // std::string's fill constructor takes (count, character); passing them
  // in the opposite order would build a 78-character string ('N' == 78)
  // regardless of the read length.
  std::string avg_qualities(num_bases, 'N');
  std::vector<double> log_probs(qualities.size());
  for (size_t i = 0; i < num_bases; i++){
    for (size_t j = 0; j < qualities.size(); j++)
      log_probs[j] = log_prob_error(qualities[j]->at(i));
    double log_mean_prob = log_sum_exp(log_probs) - log(static_cast<double>(qualities.size()));
    avg_qualities[i]     = closest_char(log_mean_prob);
  }
  return avg_qualities;
}
コード例 #8
0
ファイル: Pedigree.cpp プロジェクト: mrG7/HipSTR
// Order the supplied pedigree nodes so that every individual appears after
// its parents, storing the result in nodes_.
//
// no_ancestors_, no_descendants_ and nodes_ are cleared first. Returns true
// if every node could be placed, and false if some individuals are left
// with unprocessed parents (i.e. the relationships do not form a DAG).
// Terminates via printErrorAndDie() if a child is encountered that has no
// outstanding parent count.
bool PEDIGREE_GRAPH::topological_sort(std::vector<PEDIGREE_NODE*>& nodes){
  no_ancestors_.clear();
  no_descendants_.clear();
  nodes_.clear();

  // Number of not-yet-emitted parents for each node that has any parent;
  // nodes without parents can be emitted immediately
  std::map<PEDIGREE_NODE*, int> pending_parents;
  std::vector<PEDIGREE_NODE*>   ready;
  for (PEDIGREE_NODE* node : nodes){
    const int num_parents = node->has_mother() + node->has_father();
    if (num_parents != 0)
      pending_parents[node] = num_parents;
    else
      ready.push_back(node);
  }

  while (!ready.empty()){
    PEDIGREE_NODE* parent = ready.back();
    ready.pop_back();
    nodes_.push_back(parent);

    // Emitting a parent satisfies one dependency of each of its children
    std::vector<PEDIGREE_NODE*>& children = parent->get_children();
    for (PEDIGREE_NODE* child : children){
      auto pending = pending_parents.find(child);
      if (pending == pending_parents.end()){
        parent->print(std::cerr);
        child->print(std::cerr);
        printErrorAndDie("Logical error in topological_sort() for parent " + parent->get_name() + " and child " + child->get_name());
      }
      else if (--(pending->second) == 0){
        // All parents emitted: the child is now ready itself
        ready.push_back(child);
        pending_parents.erase(pending);
      }
    }
  }

  // Only a DAG if no unprocessed individuals are left
  return pending_parents.empty();
}
コード例 #9
0
ファイル: HaplotypeGenerator.cpp プロジェクト: mrG7/HipSTR
// Extract the bases of an alignment that fall within the region [start, end).
//
// aln   : alignment to read from (start/stop coordinates, CIGAR list and
//         alignment string are used)
// start : first position of the region (inclusive)
// end   : position at which the region stops (exclusive)
// seq   : output; set only when the function returns true
//
// Returns false, leaving seq untouched, unless the alignment starts strictly
// before `start` and its stop coordinate is strictly greater than `end`.
// Otherwise walks the CIGAR list and returns true with seq set to
// uppercase() of the collected bases (or "" if none were collected).
// Insertions located at `start`, inside the region, or at `end` are included
// in full; insertions located before `start` are skipped.
// Terminates via printErrorAndDie() on a CIGAR type other than I, =, X or D
// inside the region, or if the CIGAR list is exhausted before `end` is reached.
bool extract_sequence(Alignment& aln, int32_t start, int32_t end, std::string& seq){    
  // Require the alignment to extend strictly beyond both region boundaries
  if (aln.get_start() >= start) return false;
  if (aln.get_stop()  <= end)   return false;

  int align_index = 0; // Index into alignment string
  int char_index  = 0; // Number of bases of the current CIGAR element already consumed
  int32_t pos     = aln.get_start(); // Coordinate of the next unconsumed base
  auto cigar_iter = aln.get_cigar_list().begin();
  
  // Walk the CIGAR elements, collecting the bases that fall inside the region
  std::stringstream reg_seq;
  while (cigar_iter != aln.get_cigar_list().end()){
    if (char_index == cigar_iter->get_num()){
      // Current CIGAR element fully consumed; advance to the next one
      cigar_iter++;
      char_index = 0;
    }
    else if (pos > end){
      // Moved past the end of the region; report what was collected
      if (reg_seq.str() == "")
	seq = "";
      else
	seq = uppercase(reg_seq.str());
      return true;
    }
    else if (pos == end){
      if (cigar_iter->get_type() == 'I'){
	// Insertion sitting exactly at the region's end: include all of it
	// and keep scanning in case further elements follow at this position
	reg_seq << aln.get_alignment().substr(align_index, cigar_iter->get_num());
	align_index += cigar_iter->get_num();
	char_index = 0;
	cigar_iter++;
      }
      else {
	// Any other element at the region's end means extraction is complete
	if (reg_seq.str() == "")
	  seq = "";
	else
	  seq = uppercase(reg_seq.str());
	return true;
      }
    }
    else if (pos >= start){
      // Inside the region: consume up to the end of the region or the end
      // of the current CIGAR element, whichever comes first
      int32_t num_bases = std::min(end-pos, cigar_iter->get_num()-char_index);
      switch(cigar_iter->get_type()){	 
      case 'I':
	// Insertion within region: take the whole element; pos does not advance
	num_bases = cigar_iter->get_num();
	reg_seq << aln.get_alignment().substr(align_index, num_bases);
	break;
      case '=': case 'X':
	// Match/mismatch: collect the bases and advance the position
	reg_seq << aln.get_alignment().substr(align_index, num_bases);
	pos += num_bases;
	break;
      case 'D':
	// Deletion: advance the position without collecting any bases
	pos += num_bases;
	break;
      default:
	// NOTE(review): the message names extractRegionSequences() rather than
	// this function -- presumably a leftover from an earlier name
	printErrorAndDie("Invalid CIGAR char in extractRegionSequences()");
	break;
      }
      // NOTE(review): align_index also advances for 'D', so the alignment
      // string presumably holds one character per deleted base -- confirm
      // against the Alignment class
      align_index += num_bases;
      char_index  += num_bases;
    }
    else {
      // Before the region: skip bases without collecting them
      int32_t num_bases;
      if (cigar_iter->get_type() == 'I')
	// Insertions consume alignment characters but do not advance pos
	num_bases = cigar_iter->get_num()-char_index;
      else {
	// Skip no further than the start of the region
	num_bases = std::min(start-pos, cigar_iter->get_num()-char_index);
	pos      += num_bases;
      }
      align_index  += num_bases;
      char_index   += num_bases;
    }
  }
  // CIGAR list exhausted without reaching the end of the region
  printErrorAndDie("Logical error in extract_sequence");
  return false;
}