/*
 * Looks up the library associated with an alignment's read group.
 *
 * aln           : alignment whose "RG" tag is read
 * rg_to_library : mapping from read group identifier to library name
 *
 * Returns the library mapped to the alignment's read group. Calls
 * printErrorAndDie() if the RG tag's type cannot be retrieved or if the
 * read group has no entry in rg_to_library.
 */
std::string get_library(BamTools::BamAlignment& aln, std::map<std::string, std::string>& rg_to_library){
  const std::string tag_name = "RG";
  char type_code             = 'Z';

  // A failed type lookup is treated as a missing RG tag
  if (!aln.GetTagType(tag_name, type_code))
    printErrorAndDie("Failed to retrieve BAM alignment's RG tag");

  std::string read_group;
  aln.GetTag(tag_name, read_group);

  const auto entry = rg_to_library.find(read_group);
  if (entry == rg_to_library.end())
    printErrorAndDie("No library found for read group " + read_group + " in BAM file headers");
  return entry->second;
}
bool PEDIGREE_GRAPH::build(std::string filename) { std::ifstream input(filename.c_str()); if (!input.is_open()) printErrorAndDie("Failed to open pedigree file " + filename); std::map<std::string, PEDIGREE_NODE*> samples; std::vector<PEDIGREE_NODE*> nodes; std::string line; //std::getline(input, line)); // TO DO: Fix weird expection that occurs if I try to skip header while (std::getline(input, line)){ std::istringstream iss(line); std::string family, child, father, mother; if(! (iss >> family >> child >> father >> mother)) printErrorAndDie("Improperly formated .ped pedigree file " + filename); if (child.compare("0") == 0) printErrorAndDie("Invalid individual id " + child); // Create new nodes for any previously unseen samples that have // an identifier other than 0 if (samples.find(child) == samples.end()){ PEDIGREE_NODE* new_node = new PEDIGREE_NODE(child); nodes.push_back(new_node); samples[child] = new_node; } if (mother.compare("0") != 0 && samples.find(mother) == samples.end()){ PEDIGREE_NODE* new_node = new PEDIGREE_NODE(mother); samples[father] = new_node; nodes.push_back(new_node); } if (father.compare("0") != 0 && samples.find(father) == samples.end()){ PEDIGREE_NODE* new_node = new PEDIGREE_NODE(father); samples[mother] = new_node; nodes.push_back(new_node); } // Store relationships in node instance PEDIGREE_NODE* child_node = samples.find(child)->second; PEDIGREE_NODE* mother_node = (mother.compare("0") == 0 ? NULL : samples.find(mother)->second); PEDIGREE_NODE* father_node = (father.compare("0") == 0 ? NULL : samples.find(father)->second); child_node->set_mother(mother_node); child_node->set_father(father_node); if (mother_node != NULL) mother_node->add_child(child_node); if (father_node != NULL) father_node->add_child(child_node); } input.close(); // Sort nodes in pedigree graph topologically return topological_sort(nodes); }
void compute_confusion_matrix(int32_t max_read_length, std::string bam_file, std::string fasta_file, std::string fasta_dir, bool skip_soft_clipped, std::ostream& out){ BamTools::BamReader bam_reader; if (!bam_reader.Open(bam_file)) printErrorAndDie("Failed to open BAM file"); std::string ref_seq; int32_t ref_id; if (fasta_file.compare("N/A") == 0) ref_id = -2; else { readFasta(fasta_file, fasta_dir, ref_seq); ref_id = 0; } int32_t* matrix_counts = new int32_t [25*max_read_length](); int32_t* total_counts = new int32_t [5*max_read_length](); int32_t forward = 0, backward = 0; process_reads(bam_reader, max_read_length, ref_id, ref_seq, fasta_dir, skip_soft_clipped, matrix_counts, total_counts, forward, backward); out << forward << "\n" << backward << std::endl; print_confusion_matrix(matrix_counts, total_counts, max_read_length, out); delete [] matrix_counts; delete [] total_counts; }
/*
 * Replaces the contents of sample_set with the lines of input_file,
 * one set entry per line (lines are inserted verbatim).
 *
 * Calls printErrorAndDie() if the file cannot be opened.
 */
void read_sample_list(std::string input_file, std::set<std::string>& sample_set){
  sample_set.clear();

  std::ifstream input(input_file);
  if (!input.is_open())
    printErrorAndDie("Unable to open sample list file " + input_file);

  for (std::string entry; std::getline(input, entry); )
    sample_set.insert(entry);
}
int main(int argc, char* argv[]){ if (argc != 3) printErrorAndDie("Script requires exactly 2 arguments"); std::string region_file = std::string(argv[1]); std::string vcf_file = std::string(argv[2]); // Read list of regions std::vector<Region> regions; readRegions(region_file, regions, 1000, "", std::cerr); vcflib::VariantCallFile ref_vcf; if(!ref_vcf.open(vcf_file)) printErrorAndDie("Failed to open VCF"); // Populate map with samples in VCF header std::map<std::string, int> sample_indices; for (unsigned int i = 0; i < ref_vcf.sampleNames.size(); i++) sample_indices[ref_vcf.sampleNames[i]] = i; std::vector<std::string> alleles; std::vector<bool> got_priors; int32_t pos; for (unsigned int i = 0; i < regions.size(); i++){ bool success; double* priors = extract_vcf_alleles_and_log_priors(&ref_vcf, &(regions[i]), sample_indices, alleles, got_priors, pos, success, std::cerr); if (success){ std::cerr << "Position=" << pos << std::endl; std::cerr << "Alleles:" << std::endl; for (unsigned int j = 0; j < alleles.size(); j++) std::cerr << alleles[j] << std::endl; } else { std::cerr << "Failed to read alleles and priors for region " << regions[i].str() << std::endl; } alleles.clear(); got_priors.clear(); delete [] priors; } }
/*
 * Returns the haplotype stored in the alignment's HAPLOTYPE_TAG, or -1 if
 * the alignment does not carry that tag.
 *
 * Calls printErrorAndDie() if the tag is present but cannot be read as a
 * uint8_t, and asserts that the extracted value is either 1 or 2.
 */
int SNPBamProcessor::get_haplotype(BamTools::BamAlignment& aln){
  if (aln.HasTag(HAPLOTYPE_TAG)){
    uint8_t hap_value;
    const bool extracted = aln.GetTag(HAPLOTYPE_TAG, hap_value);
    if (!extracted){
      // The tag's type is queried but not reported
      char tag_type;
      aln.GetTagType(HAPLOTYPE_TAG, tag_type);
      printErrorAndDie("Failed to extract haplotype tag");
    }
    assert(hap_value == 1 || hap_value == 2);
    return static_cast<int>(hap_value);
  }
  return -1;
}
std::string BaseQuality::average_base_qualities(std::vector<const std::string*> qualities){ assert(qualities.size() > 0); // Check that all base quality strings are of the same length for (unsigned int i = 0; i < qualities.size(); i++){ if (qualities[i]->size() != qualities[0]->size()) printErrorAndDie("All base quality strings must be of the same length when averaging probabilities"); } // Average raw error probabilities for each base and convert // to the closest quality score std::string avg_qualities('N', qualities[0]->size()); std::vector<double> log_probs(qualities.size()); for (unsigned int i = 0; i < qualities[0]->size(); i++){ for (unsigned int j = 0; j < qualities.size(); j++) log_probs[j] = log_prob_error(qualities[j]->at(i)); double log_mean_prob = log_sum_exp(log_probs) - log(qualities.size()); avg_qualities[i] = closest_char(log_mean_prob); } return avg_qualities; }
bool PEDIGREE_GRAPH::topological_sort(std::vector<PEDIGREE_NODE*>& nodes){ no_ancestors_.clear(); no_descendants_.clear(); nodes_.clear(); std::map<PEDIGREE_NODE*, int> parent_counts; std::vector<PEDIGREE_NODE*> sources; for (int i = 0; i < nodes.size(); i++){ int count = nodes[i]->has_mother() + nodes[i]->has_father(); if (count == 0) sources.push_back(nodes[i]); else parent_counts[nodes[i]] = count; } while (sources.size() != 0){ PEDIGREE_NODE* source = sources.back(); std::vector<PEDIGREE_NODE*>& children = source->get_children(); nodes_.push_back(source); sources.pop_back(); for (auto child_iter = children.begin(); child_iter != children.end(); child_iter++){ auto count_iter = parent_counts.find(*child_iter); if (count_iter == parent_counts.end()){ source->print(std::cerr); (*child_iter)->print(std::cerr); printErrorAndDie("Logical error in topological_sort() for parent " + source->get_name() + " and child " + (*child_iter)->get_name()); } else if (count_iter->second == 1){ sources.push_back(*child_iter); parent_counts.erase(count_iter); } else count_iter->second -= 1; } } return parent_counts.size() == 0; // Only a DAG if no unprocessed individuals are left }
bool extract_sequence(Alignment& aln, int32_t start, int32_t end, std::string& seq){ if (aln.get_start() >= start) return false; if (aln.get_stop() <= end) return false; int align_index = 0; // Index into alignment string int char_index = 0; // Index of current base in current CIGAR element int32_t pos = aln.get_start(); auto cigar_iter = aln.get_cigar_list().begin(); // Extract region sequence if fully spanned by alignment std::stringstream reg_seq; while (cigar_iter != aln.get_cigar_list().end()){ if (char_index == cigar_iter->get_num()){ cigar_iter++; char_index = 0; } else if (pos > end){ if (reg_seq.str() == "") seq = ""; else seq = uppercase(reg_seq.str()); return true; } else if (pos == end){ if (cigar_iter->get_type() == 'I'){ reg_seq << aln.get_alignment().substr(align_index, cigar_iter->get_num()); align_index += cigar_iter->get_num(); char_index = 0; cigar_iter++; } else { if (reg_seq.str() == "") seq = ""; else seq = uppercase(reg_seq.str()); return true; } } else if (pos >= start){ int32_t num_bases = std::min(end-pos, cigar_iter->get_num()-char_index); switch(cigar_iter->get_type()){ case 'I': // Insertion within region num_bases = cigar_iter->get_num(); reg_seq << aln.get_alignment().substr(align_index, num_bases); break; case '=': case 'X': reg_seq << aln.get_alignment().substr(align_index, num_bases); pos += num_bases; break; case 'D': pos += num_bases; break; default: printErrorAndDie("Invalid CIGAR char in extractRegionSequences()"); break; } align_index += num_bases; char_index += num_bases; } else { int32_t num_bases; if (cigar_iter->get_type() == 'I') num_bases = cigar_iter->get_num()-char_index; else { num_bases = std::min(start-pos, cigar_iter->get_num()-char_index); pos += num_bases; } align_index += num_bases; char_index += num_bases; } } printErrorAndDie("Logical error in extract_sequence"); return false; }