void Pileups::compute_from_alignment(VG& graph, Alignment& alignment) { // if we start reversed if (alignment.has_path() && alignment.path().mapping(0).position().is_reverse()) { alignment = reverse_alignment(alignment, (function<int64_t(int64_t)>) ([&graph](int64_t id) { return graph.get_node(id)->sequence().size(); })); } const Path& path = alignment.path(); int64_t read_offset = 0; for (int i = 0; i < path.mapping_size(); ++i) { const Mapping& mapping = path.mapping(i); if (graph.has_node(mapping.position().node_id())) { const Node* node = graph.get_node(mapping.position().node_id()); NodePileup* pileup = get_create(node->id()); int64_t node_offset = mapping.position().offset(); for (int j = 0; j < mapping.edit_size(); ++j) { const Edit& edit = mapping.edit(j); // process all pileups in edit. // update the offsets as we go compute_from_edit(*pileup, node_offset, read_offset, *node, alignment, mapping, edit); } } } assert(alignment.sequence().empty() || alignment.path().mapping_size() == 0 || read_offset == alignment.sequence().length()); }
Alignment Sampler::alignment_with_error(size_t length, double base_error, double indel_error) { size_t maxiter = 100; Alignment aln; if (base_error > 0 || indel_error > 0) { // sample a longer-than necessary alignment, then trim size_t iter = 0; while (iter++ < maxiter) { aln = mutate( alignment(length + 2 * ((double) length * indel_error)), base_error, indel_error); if (aln.sequence().size() == length) { break; } else if (aln.sequence().size() > length) { aln = strip_from_end(aln, aln.sequence().size() - length); break; } } if (iter == maxiter) { cerr << "[vg::Sampler] Warning: could not generate alignment of sufficient length. " << "Graph may be too small, or indel rate too high." << endl; } } else { aln = alignment(length); } aln.set_identity(identity(aln.path())); return aln; }
Alignment Filter::soft_clip_filter(Alignment& aln){ //Find overhangs - portions of the read that // are inserted at the ends. if (aln.path().mapping_size() > 0){ Path path = aln.path(); Edit left_edit = path.mapping(0).edit(0); Edit right_edit = path.mapping(path.mapping_size() - 1).edit(path.mapping(path.mapping_size() - 1).edit_size() - 1); int left_overhang = left_edit.to_length() - left_edit.from_length(); int right_overhang = right_edit.to_length() - right_edit.from_length(); if (left_overhang > soft_clip_limit || right_overhang > soft_clip_limit){ return inverse ? Alignment() : aln; } else{ return inverse ? aln : Alignment(); } } else{ if (aln.sequence().length() > soft_clip_limit){ return inverse ? Alignment() : aln; } cerr << "WARNING: SHORT ALIGNMENT: " << aln.sequence().size() << "bp" << endl << "WITH NO MAPPINGS TO REFERENCE" << endl << "CONSIDER REMOVING IT FROM ANALYSIS" << endl; return inverse ? Alignment() : aln; } }
string alignment_to_sam(const Alignment& alignment, const string& refseq, const int32_t refpos, const string& cigar, const string& mateseq, const int32_t matepos, const int32_t tlen) { stringstream sam; sam << (!alignment.name().empty() ? alignment.name() : "*") << "\t" << sam_flag(alignment) << "\t" << (refseq.empty() ? "*" : refseq) << "\t" << refpos + 1 << "\t" //<< (alignment.path().mapping_size() ? refpos + 1 : 0) << "\t" // positions are 1-based in SAM, 0 means unmapped << alignment.mapping_quality() << "\t" << (alignment.has_path() && alignment.path().mapping_size() ? cigar : "*") << "\t" << (mateseq == refseq ? "=" : mateseq) << "\t" << matepos + 1 << "\t" << tlen << "\t" << (!alignment.sequence().empty() ? alignment.sequence() : "*") << "\t"; // hack much? if (!alignment.quality().empty()) { const string& quality = alignment.quality(); for (int i = 0; i < quality.size(); ++i) { sam << quality_short_to_char(quality[i]); } } else { sam << "*"; //sam << string(alignment.sequence().size(), 'I'); } //<< (alignment.has_quality() ? string_quality_short_to_char(alignment.quality()) : string(alignment.sequence().size(), 'I')); if (!alignment.read_group().empty()) sam << "\tRG:Z:" << alignment.read_group(); sam << "\n"; return sam.str(); }
Alignment merge_alignments(const Alignment& a1, const Alignment& a2, bool debug) { //cerr << "overlap is " << overlap << endl; // if either doesn't have a path, then treat it like a massive softclip if (debug) cerr << "merging alignments " << endl << pb2json(a1) << endl << pb2json(a2) << endl; // concatenate them Alignment a3; a3.set_sequence(a1.sequence() + a2.sequence()); *a3.mutable_path() = concat_paths(a1.path(), a2.path()); if (debug) cerr << "merged alignments, result is " << endl << pb2json(a3) << endl; return a3; }
void QualAdjAligner::align(Alignment& alignment, Graph& g, bool print_score_matrices) { gssw_graph* graph = create_gssw_graph(g, 0, nullptr); const string& sequence = alignment.sequence(); const string& quality = alignment.quality(); if (quality.length() != sequence.length()) { cerr << "error:[Aligner] sequence and quality strings different lengths, cannot perform base quality adjusted alignmenterror:[Aligner] sequence and quality strings different lengths, cannot perform base quality adjusted alignment" << endl; } gssw_graph_fill_qual_adj(graph, sequence.c_str(), quality.c_str(), nt_table, adjusted_score_matrix, scaled_gap_open, scaled_gap_extension, 15, 2); gssw_graph_mapping* gm = gssw_graph_trace_back_qual_adj (graph, sequence.c_str(), quality.c_str(), sequence.size(), nt_table, adjusted_score_matrix, scaled_gap_open, scaled_gap_extension); gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices); #ifdef debug gssw_print_graph_mapping(gm, stderr); #endif gssw_graph_mapping_destroy(gm); gssw_graph_destroy(graph); }
/** * Filter reads that are less than <PCTID> reference. * I.E. if a read matches the reference along 80% of its * length, and your cutoff is 90% PCTID, throw it out. */ Alignment Filter::percent_identity_filter(Alignment& aln){ double read_pctid = 0.0; //read pct_id = len(matching sequence / len(total sequence) int64_t aln_total_len = aln.sequence().size(); int64_t aln_match_len = 0; std::function<double(int64_t, int64_t)> calc_pct_id = [](int64_t rp, int64_t ttlp){ return ((double) rp / (double) ttlp); }; Path path = aln.path(); //TODO handle reversing mappings for (int i = 0; i < path.mapping_size(); i++){ Mapping mapping = path.mapping(i); for (int j = 0; j < mapping.edit_size(); j++){ Edit ee = mapping.edit(j); if (ee.from_length() == ee.to_length() && ee.sequence() == ""){ aln_match_len += ee.to_length(); } } } if (calc_pct_id(aln_match_len, aln_total_len) < min_percent_identity){ return inverse ? aln : Alignment(); } return inverse ? Alignment() : aln; }
Alignment strip_from_start(const Alignment& aln, size_t drop) { if (!drop) return aln; Alignment res; res.set_name(aln.name()); res.set_score(aln.score()); //cerr << "drop " << drop << " from start" << endl; res.set_sequence(aln.sequence().substr(drop)); if (!aln.has_path()) return res; *res.mutable_path() = cut_path(aln.path(), drop).second; assert(res.has_path()); if (alignment_to_length(res) != res.sequence().size()) { cerr << "failed!!! drop from start 轰" << endl; cerr << pb2json(res) << endl << endl; assert(false); } return res; }
Alignment strip_from_end(const Alignment& aln, size_t drop) { if (!drop) return aln; Alignment res; res.set_name(aln.name()); res.set_score(aln.score()); //cerr << "drop " << drop << " from end" << endl; size_t cut_at = aln.sequence().size()-drop; //cerr << "Cut at " << cut_at << endl; res.set_sequence(aln.sequence().substr(0, cut_at)); if (!aln.has_path()) return res; *res.mutable_path() = cut_path(aln.path(), cut_at).first; assert(res.has_path()); if (alignment_to_length(res) != res.sequence().size()) { cerr << "failed!!! drop from end 轰" << endl; cerr << pb2json(res) << endl << endl; assert(false); } return res; }
Alignment reverse_alignment(const Alignment& aln, const function<int64_t(int64_t)>& node_length) { // We're going to reverse the alignment and all its mappings. // TODO: should we/can we do this in place? Alignment reversed = aln; reversed.set_sequence(reverse_complement(aln.sequence())); if(aln.has_path()) { // Now invert the order of the mappings, and for each mapping, flip the // is_reverse flag. The edits within mappings also get put in reverse // order, get their positions corrected, and get their sequences get // reverse complemented. *reversed.mutable_path() = reverse_path(aln.path(), node_length); } return reversed; }
void GSSWAligner::align(Alignment& alignment) { const string& sequence = alignment.sequence(); gssw_graph_fill(graph, sequence.c_str(), nt_table, score_matrix, gap_open, gap_extension, 15, 2); gssw_graph_mapping* gm = gssw_graph_trace_back (graph, sequence.c_str(), sequence.size(), match, mismatch, gap_open, gap_extension); gssw_mapping_to_alignment(gm, alignment); //gssw_print_graph_mapping(gm); gssw_graph_mapping_destroy(gm); }
void Graph::insertSequenceAlignment(const Alignment& alignment, const string& seq, const string& label) { const string& aln_sequence = alignment.sequence(); const deque<int>& seq_idxs = alignment.seq_idxs(); const deque<int>& node_ids = alignment.node_ids(); int first_id = -1; int head_id = -1; int tail_id = -1; pair<uint32_t, uint32_t> prefix_end_ids; pair<uint32_t, uint32_t> suffix_end_ids; // because of local alignment prefix or sufix of sequence can be unaligned // so we add it directly to graph deque<uint32_t> valid_seq_idxs; for (auto idx : seq_idxs) { if (idx != -1) { valid_seq_idxs.emplace_back(idx); } } uint32_t aln_seq_start_idx = valid_seq_idxs.front(); uint32_t aln_seq_end_idx = valid_seq_idxs.back(); if (aln_seq_start_idx > 0) { prefix_end_ids = addUnmatchedSequence( aln_sequence.substr(0, aln_seq_start_idx), label, false); first_id = prefix_end_ids.first; head_id = prefix_end_ids.second; } if (aln_seq_end_idx < aln_sequence.length()) { suffix_end_ids = addUnmatchedSequence( aln_sequence.substr(aln_seq_end_idx + 1), label, false); tail_id = suffix_end_ids.first; } // aligned part of sequence uint32_t size = max(seq_idxs.size(), node_ids.size()); for (uint32_t i = 0; i < size; ++i) { auto& seq_idx = seq_idxs[i]; auto& match_id = node_ids[i]; if (seq_idx == -1) { continue; } int node_id = -1; char base = aln_sequence[seq_idx]; if (match_id == -1) { // if sequence base unmatched with graph node add new node addNode(base); node_id = next_id_ - 1; } else if (nodes_[match_id]->base() == base) { // if sequence base matched to a node with same base node_id = match_id; } else { // if sequence base matched to a node with different base // which is aligned to a node with same base int found_node_id = -1; for (auto id : nodes_[match_id]->getAlignedIds()) { if (nodes_[id]->base() == base) { found_node_id = id; break; } } if (found_node_id == -1) { // we didn't find aligned node with same base addNode(base); node_id = next_id_ - 1; // add all aligned to nodes to newly created node for (auto id : nodes_[match_id]->getAlignedIds()) { nodes_[node_id]->addAlignedNode(id); } nodes_[node_id]->addAlignedNode(match_id); // to nodes aligned to newly created node add this node // as aligned to for (auto id : nodes_[node_id]->getAlignedIds()) { nodes_[id]->addAlignedNode(node_id); } } else { // node id is found node id node_id = found_node_id; } } if (head_id != -1 && node_id != -1) { addEdge(head_id, node_id, label); } head_id = node_id; if (first_id == -1) { first_id = head_id; } } // connect aligned part with unaligned suffix if (head_id != -1 && tail_id != -1) { addEdge(head_id, tail_id, label); } // resort nodes order topological_sort(); sequences_.emplace_back(seq); labels_.emplace_back(label); start_ids_.emplace_back(first_id); }
/// Make seeds for Alignments based on their sequences. inline string make_shuffle_seed(const Alignment& aln) { return aln.sequence(); }
void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, Graph& g, int64_t pinned_node_id, bool pin_left, int32_t max_alt_alns, bool print_score_matrices) { // check input integrity if (pin_left && !pinned_node_id) { cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; exit(EXIT_FAILURE); } if (multi_alignments && !pinned_node_id) { cerr << "error:[Aligner] multiple traceback is not valid in local alignment, only pinned and global" << endl; exit(EXIT_FAILURE); } if (!(multi_alignments) && max_alt_alns != 1) { cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; exit(EXIT_FAILURE); } // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top // left we need to reverse all the sequences first and translate the alignment back later // create reversed objects if necessary Graph reversed_graph; string reversed_sequence; if (pin_left) { reversed_sequence.resize(alignment.sequence().length()); reverse_copy(alignment.sequence().begin(), alignment.sequence().end(), reversed_sequence.begin()); reverse_graph(g, reversed_graph); } // choose forward or reversed objects Graph* align_graph; string* align_sequence; if (pin_left) { align_graph = &reversed_graph; align_sequence = &reversed_sequence; } else { align_graph = &g; align_sequence = alignment.mutable_sequence(); } // convert into gssw graph and get the counterpart to pinned node (if pinning) gssw_node* pinned_node = nullptr; gssw_graph* graph = create_gssw_graph(*align_graph, pinned_node_id, &pinned_node); if (pinned_node_id & !pinned_node) { cerr << "error:[Aligner] pinned node for pinned alignment is not in graph" << endl; exit(EXIT_FAILURE); } // perform dynamic programming gssw_graph_fill(graph, (*align_sequence).c_str(), nt_table, score_matrix, gap_open, gap_extension, 15, 2); // traceback either from pinned position or optimal local alignment if (pinned_node) { // trace back pinned alignment gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph, pinned_node, max_alt_alns, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); if (pin_left) { // translate graph and mappings into original node space unreverse_graph(reversed_graph); for (int32_t i = 0; i < max_alt_alns; i++) { unreverse_graph_mapping(gms[i]); } } // convert optimal alignment and store it in the input Alignment object (in the multi alignment, // this will have been set to the first in the vector) if (gms[0]->score > 0) { // have a mapping, can just convert normally gssw_mapping_to_alignment(graph, gms[0], alignment, print_score_matrices); } else { // gssw will not identify mappings with 0 score, infer location based on pinning Mapping* mapping = alignment.mutable_path()->add_mapping(); mapping->set_rank(1); // locate at the end of the node Position* position = mapping->mutable_position(); position->set_node_id(pinned_node_id); position->set_offset(pin_left ? 0 : pinned_node->len); // soft clip Edit* edit = mapping->add_edit(); edit->set_to_length(alignment.sequence().length()); edit->set_sequence(alignment.sequence()); } if (multi_alignments) { // determine how many non-null alignments were returned int32_t num_non_null = max_alt_alns; for (int32_t i = 1; i < max_alt_alns; i++) { if (gms[i]->score <= 0) { num_non_null = i; break; } } // reserve to avoid illegal access errors that occur when the vector reallocates multi_alignments->reserve(num_non_null); // copy the primary alignment multi_alignments->emplace_back(alignment); // convert the alternate alignments and store them at the back of the vector (this will not // execute if we are doing single alignment) for (int32_t i = 1; i < num_non_null; i++) { gssw_graph_mapping* gm = gms[i]; // make new alignment object multi_alignments->emplace_back(); Alignment& next_alignment = multi_alignments->back(); // copy over sequence information from the primary alignment next_alignment.set_sequence(alignment.sequence()); next_alignment.set_quality(alignment.quality()); // get path of the alternate alignment gssw_mapping_to_alignment(graph, gm, next_alignment, print_score_matrices); } } for (int32_t i = 0; i < max_alt_alns; i++) { gssw_graph_mapping_destroy(gms[i]); } free(gms); } else { // trace back local alignment gssw_graph_mapping* gm = gssw_graph_trace_back (graph, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices); gssw_graph_mapping_destroy(gm); } //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr); gssw_graph_destroy(graph); }