string alignment_to_sam(const Alignment& alignment, const string& refseq, const int32_t refpos, const string& cigar, const string& mateseq, const int32_t matepos, const int32_t tlen) { stringstream sam; sam << (!alignment.name().empty() ? alignment.name() : "*") << "\t" << sam_flag(alignment) << "\t" << (refseq.empty() ? "*" : refseq) << "\t" << refpos + 1 << "\t" //<< (alignment.path().mapping_size() ? refpos + 1 : 0) << "\t" // positions are 1-based in SAM, 0 means unmapped << alignment.mapping_quality() << "\t" << (alignment.has_path() && alignment.path().mapping_size() ? cigar : "*") << "\t" << (mateseq == refseq ? "=" : mateseq) << "\t" << matepos + 1 << "\t" << tlen << "\t" << (!alignment.sequence().empty() ? alignment.sequence() : "*") << "\t"; // hack much? if (!alignment.quality().empty()) { const string& quality = alignment.quality(); for (int i = 0; i < quality.size(); ++i) { sam << quality_short_to_char(quality[i]); } } else { sam << "*"; //sam << string(alignment.sequence().size(), 'I'); } //<< (alignment.has_quality() ? string_quality_short_to_char(alignment.quality()) : string(alignment.sequence().size(), 'I')); if (!alignment.read_group().empty()) sam << "\tRG:Z:" << alignment.read_group(); sam << "\n"; return sam.str(); }
void QualAdjAligner::align(Alignment& alignment, Graph& g, bool print_score_matrices) { gssw_graph* graph = create_gssw_graph(g, 0, nullptr); const string& sequence = alignment.sequence(); const string& quality = alignment.quality(); if (quality.length() != sequence.length()) { cerr << "error:[Aligner] sequence and quality strings different lengths, cannot perform base quality adjusted alignmenterror:[Aligner] sequence and quality strings different lengths, cannot perform base quality adjusted alignment" << endl; } gssw_graph_fill_qual_adj(graph, sequence.c_str(), quality.c_str(), nt_table, adjusted_score_matrix, scaled_gap_open, scaled_gap_extension, 15, 2); gssw_graph_mapping* gm = gssw_graph_trace_back_qual_adj (graph, sequence.c_str(), quality.c_str(), sequence.size(), nt_table, adjusted_score_matrix, scaled_gap_open, scaled_gap_extension); gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices); #ifdef debug gssw_print_graph_mapping(gm, stderr); #endif gssw_graph_mapping_destroy(gm); gssw_graph_destroy(graph); }
Alignment Filter::avg_qual_filter(Alignment& aln){ double total_qual = 0.0; // If the parameter for window size is zero, set the local equivalent // to the entire size of the qual scores. int win_len = window_length; //>= 1 ? window_length : aln.quality().size(); cerr << "UNTESTED" << endl; exit(1); std::function<double(int64_t, int64_t)> calc_avg_qual = [](int64_t total_qual, int64_t length){ return ((double) total_qual / (double) length); }; /** * Helper function. * Sums qual scores from start : start + window_length * if start + window_len > qualstr.size(), return a negative float * otherwise return the average quality within the window * */ std::function<double(string, int, int)> window_qual = [&](string qualstr, int start, int window_length){ if (start + window_length > qualstr.size()){ return -1.0; } total_qual = 0.0; for (int i = start; i < start + window_length; i++){ total_qual += (int) qualstr[i]; } return calc_avg_qual(total_qual, window_length); }; //TODO: handle reversing alignments string quals = aln.quality(); for (int i = 0; i < quals.size() - win_len; i++){ double w_qual = window_qual(quals, i, win_len); if ( w_qual < 0.0){ break; } if (w_qual < min_avg_qual){ return inverse ? aln : Alignment(); } } return inverse ? Alignment() : aln; }
/** * CLI: vg filter -d 10 -q 40 -r -R * -r: track depth of both novel variants and those in the graph. * -R: remove edits that fail the filter (otherwise toss the whole alignment) */ Alignment Filter::qual_filter(Alignment& aln){ string quals = aln.quality(); int offset = qual_offset; for (int i = 0; i < quals.size(); i++){ if (((int) quals[i] - offset) < min_qual){ if (!remove_failing_edits){ return inverse ? aln : Alignment(); } else{ //TODO: Should we really edit out poor-quality bases?? // It's complex and awkward. return inverse ? aln : Alignment(); } } } return inverse ? Alignment() : aln; }
void Pileups::compute_from_edit(NodePileup& pileup, int64_t& node_offset, int64_t& read_offset, const Node& node, const Alignment& alignment, const Mapping& mapping, const Edit& edit) { string seq = edit.sequence(); bool is_reverse = mapping.position().is_reverse(); // ***** MATCH ***** if (edit.from_length() == edit.to_length()) { assert (edit.from_length() > 0); make_match(seq, edit.from_length(), is_reverse); assert(seq.length() == edit.from_length()); int64_t delta = 1; for (int64_t i = 0; i < edit.from_length(); ++i) { BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset); // reference_base if empty if (base_pileup->num_bases() == 0) { base_pileup->set_ref_base(node.sequence()[node_offset]); } else { assert(base_pileup->ref_base() == node.sequence()[node_offset]); } // add base to bases field (converting to ,. if match) char base = seq[i]; if (!edit.sequence().empty() && base_equal(seq[i], node.sequence()[node_offset], is_reverse)) { base = is_reverse ? ',' : '.'; } *base_pileup->mutable_bases() += base; // add quality if there if (!alignment.quality().empty()) { *base_pileup->mutable_qualities() += alignment.quality()[read_offset]; } // pileup size increases by 1 base_pileup->set_num_bases(base_pileup->num_bases() + 1); // move right along read, and left/right depending on strand on reference node_offset += delta; ++read_offset; } } // ***** INSERT ***** else if (edit.from_length() < edit.to_length()) { make_insert(seq, is_reverse); assert(edit.from_length() == 0); // we define insert (like sam) as insertion between current and next // position (on forward node coordinates). this means an insertion before // offset 0 is invalid! int64_t insert_offset = is_reverse ? node_offset : node_offset - 1; if (insert_offset >= 0) { BasePileup* base_pileup = get_create_base_pileup(pileup, insert_offset); // reference_base if empty if (base_pileup->num_bases() == 0) { base_pileup->set_ref_base(node.sequence()[insert_offset]); } else { assert(base_pileup->ref_base() == node.sequence()[insert_offset]); } // add insertion string to bases field // todo: should we reverse complement this if mapping is reversed ??? base_pileup->mutable_bases()->append(seq); if (!alignment.quality().empty()) { *base_pileup->mutable_qualities() += alignment.quality()[read_offset]; } // pileup size increases by 1 base_pileup->set_num_bases(base_pileup->num_bases() + 1); } else { // todo: need to either forget about these, or extend pileup format. // easy solution: change insert to come before position, and just add // optional pileup at n+1st base of node. would like to figure out // how samtools does it first... /* stringstream ss; ss << "Warning: pileup does not support insertions before 0th base in node." << " Offending edit: " << pb2json(edit) << endl; #pragma omp critical(cerr) cerr << ss.str(); */ } // move right along read (and stay put on reference) read_offset += edit.to_length(); } // ***** DELETE ***** else { assert(edit.to_length() == 0); assert(edit.sequence().empty()); int64_t del_start = !is_reverse ? node_offset : node_offset - edit.from_length() + 1; seq = node.sequence().substr(del_start, edit.from_length()); make_delete(seq, is_reverse); BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset); // reference_base if empty if (base_pileup->num_bases() == 0) { base_pileup->set_ref_base(node.sequence()[node_offset]); } else { assert(base_pileup->ref_base() == node.sequence()[node_offset]); } // add deletion string to bases field // todo: should we reverse complement this if mapping is reversed ??? base_pileup->mutable_bases()->append(seq); if (!alignment.quality().empty()) { *base_pileup->mutable_qualities() += alignment.quality()[read_offset]; } // pileup size increases by 1 base_pileup->set_num_bases(base_pileup->num_bases() + 1); int64_t delta = edit.from_length(); // stay put on read, move left/right depending on strand on reference node_offset += delta; } }
void alignment_quality_char_to_short(Alignment& alignment) { alignment.set_quality(string_quality_char_to_short(alignment.quality())); }
void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, Graph& g, int64_t pinned_node_id, bool pin_left, int32_t max_alt_alns, bool print_score_matrices) { // check input integrity if (pin_left && !pinned_node_id) { cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; exit(EXIT_FAILURE); } if (multi_alignments && !pinned_node_id) { cerr << "error:[Aligner] multiple traceback is not valid in local alignment, only pinned and global" << endl; exit(EXIT_FAILURE); } if (!(multi_alignments) && max_alt_alns != 1) { cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; exit(EXIT_FAILURE); } // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top // left we need to reverse all the sequences first and translate the alignment back later // create reversed objects if necessary Graph reversed_graph; string reversed_sequence; if (pin_left) { reversed_sequence.resize(alignment.sequence().length()); reverse_copy(alignment.sequence().begin(), alignment.sequence().end(), reversed_sequence.begin()); reverse_graph(g, reversed_graph); } // choose forward or reversed objects Graph* align_graph; string* align_sequence; if (pin_left) { align_graph = &reversed_graph; align_sequence = &reversed_sequence; } else { align_graph = &g; align_sequence = alignment.mutable_sequence(); } // convert into gssw graph and get the counterpart to pinned node (if pinning) gssw_node* pinned_node = nullptr; gssw_graph* graph = create_gssw_graph(*align_graph, pinned_node_id, &pinned_node); if (pinned_node_id & !pinned_node) { cerr << "error:[Aligner] pinned node for pinned alignment is not in graph" << endl; exit(EXIT_FAILURE); } // perform dynamic programming gssw_graph_fill(graph, (*align_sequence).c_str(), nt_table, score_matrix, gap_open, gap_extension, 15, 2); // traceback either from pinned position or optimal local alignment if (pinned_node) { // trace back pinned alignment gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph, pinned_node, max_alt_alns, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); if (pin_left) { // translate graph and mappings into original node space unreverse_graph(reversed_graph); for (int32_t i = 0; i < max_alt_alns; i++) { unreverse_graph_mapping(gms[i]); } } // convert optimal alignment and store it in the input Alignment object (in the multi alignment, // this will have been set to the first in the vector) if (gms[0]->score > 0) { // have a mapping, can just convert normally gssw_mapping_to_alignment(graph, gms[0], alignment, print_score_matrices); } else { // gssw will not identify mappings with 0 score, infer location based on pinning Mapping* mapping = alignment.mutable_path()->add_mapping(); mapping->set_rank(1); // locate at the end of the node Position* position = mapping->mutable_position(); position->set_node_id(pinned_node_id); position->set_offset(pin_left ? 0 : pinned_node->len); // soft clip Edit* edit = mapping->add_edit(); edit->set_to_length(alignment.sequence().length()); edit->set_sequence(alignment.sequence()); } if (multi_alignments) { // determine how many non-null alignments were returned int32_t num_non_null = max_alt_alns; for (int32_t i = 1; i < max_alt_alns; i++) { if (gms[i]->score <= 0) { num_non_null = i; break; } } // reserve to avoid illegal access errors that occur when the vector reallocates multi_alignments->reserve(num_non_null); // copy the primary alignment multi_alignments->emplace_back(alignment); // convert the alternate alignments and store them at the back of the vector (this will not // execute if we are doing single alignment) for (int32_t i = 1; i < num_non_null; i++) { gssw_graph_mapping* gm = gms[i]; // make new alignment object multi_alignments->emplace_back(); Alignment& next_alignment = multi_alignments->back(); // copy over sequence information from the primary alignment next_alignment.set_sequence(alignment.sequence()); next_alignment.set_quality(alignment.quality()); // get path of the alternate alignment gssw_mapping_to_alignment(graph, gm, next_alignment, print_score_matrices); } } for (int32_t i = 0; i < max_alt_alns; i++) { gssw_graph_mapping_destroy(gms[i]); } free(gms); } else { // trace back local alignment gssw_graph_mapping* gm = gssw_graph_trace_back (graph, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices); gssw_graph_mapping_destroy(gm); } //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr); gssw_graph_destroy(graph); }