int softclip_start(Alignment& alignment) { if (alignment.mutable_path()->mapping_size() > 0) { Path* path = alignment.mutable_path(); Mapping* first_mapping = path->mutable_mapping(0); Edit* first_edit = first_mapping->mutable_edit(0); if (first_edit->from_length() == 0 && first_edit->to_length() > 0) { return first_edit->to_length(); } } return 0; }
int softclip_end(Alignment& alignment) { if (alignment.mutable_path()->mapping_size() > 0) { Path* path = alignment.mutable_path(); Mapping* last_mapping = path->mutable_mapping(path->mapping_size()-1); Edit* last_edit = last_mapping->mutable_edit(last_mapping->edit_size()-1); if (last_edit->from_length() == 0 && last_edit->to_length() > 0) { return last_edit->to_length(); } } return 0; }
Alignment Sampler::mutate(const Alignment& aln, double base_error, double indel_error) { if (base_error == 0 && indel_error == 0) return aln; string bases = "ATGC"; uniform_real_distribution<double> rprob(0, 1); uniform_int_distribution<int> rbase(0, 3); Alignment mutaln; for (size_t i = 0; i < aln.path().mapping_size(); ++i) { auto& orig_mapping = aln.path().mapping(i); Mapping* new_mapping = mutaln.mutable_path()->add_mapping(); *new_mapping->mutable_position() = orig_mapping.position(); // for each edit in the mapping for (size_t j = 0; j < orig_mapping.edit_size(); ++j) { auto& orig_edit = orig_mapping.edit(j); auto new_edits = mutate_edit(orig_edit, make_pos_t(orig_mapping.position()), base_error, indel_error, bases, rprob, rbase); for (auto& edit : new_edits) { *new_mapping->add_edit() = edit; } } } // re-derive the alignment's sequence mutaln = simplify(mutaln); mutaln.set_sequence(alignment_seq(mutaln)); mutaln.set_name(aln.name()); return mutaln; }
void flip_nodes(Alignment& a, set<int64_t> ids, const std::function<size_t(int64_t)>& node_length) { Path* path = a.mutable_path(); for(size_t i = 0; i < path->mapping_size(); i++) { // Grab each mapping (includes its position) Mapping* mapping = path->mutable_mapping(i); if(ids.count(mapping->position().node_id())) { // We need to flip this mapping *mapping = reverse_mapping(*mapping, node_length); } } }
// generates a perfect alignment from the graph Alignment Sampler::alignment(size_t length) { string seq; Alignment aln; Path* path = aln.mutable_path(); pos_t pos = position(); char c = pos_char(pos); // we do something wildly inefficient but conceptually clean // for each position in the mapping we add a mapping // at the end we will simplify the alignment, merging redundant mappings do { // add in the char for the current position seq += c; Mapping* mapping = path->add_mapping(); *mapping->mutable_position() = make_position(pos); Edit* edit = mapping->add_edit(); edit->set_from_length(1); edit->set_to_length(1); // decide the next position auto nextc = next_pos_chars(pos); // no new positions mean we are done; we've reached the end of the graph if (nextc.empty()) break; // what positions do we go to next? vector<pos_t> nextp; for (auto& n : nextc) nextp.push_back(n.first); // pick one at random uniform_int_distribution<int> next_dist(0, nextc.size()-1); // update our position pos = nextp.at(next_dist(rng)); // update our char c = nextc[pos]; } while (seq.size() < length); // save our sequence in the alignment aln.set_sequence(seq); aln = simplify(aln); { // name the alignment string data; aln.SerializeToString(&data); int n; #pragma omp critical(nonce) n = nonce++; data += std::to_string(n); const string hash = sha1head(data, 16); aln.set_name(hash); } // and simplify it aln.set_identity(identity(aln.path())); return aln; }
Alignment strip_from_start(const Alignment& aln, size_t drop) { if (!drop) return aln; Alignment res; res.set_name(aln.name()); res.set_score(aln.score()); //cerr << "drop " << drop << " from start" << endl; res.set_sequence(aln.sequence().substr(drop)); if (!aln.has_path()) return res; *res.mutable_path() = cut_path(aln.path(), drop).second; assert(res.has_path()); if (alignment_to_length(res) != res.sequence().size()) { cerr << "failed!!! drop from start 轰" << endl; cerr << pb2json(res) << endl << endl; assert(false); } return res; }
Alignment reverse_alignment(const Alignment& aln, const function<int64_t(int64_t)>& node_length) { // We're going to reverse the alignment and all its mappings. // TODO: should we/can we do this in place? Alignment reversed = aln; reversed.set_sequence(reverse_complement(aln.sequence())); if(aln.has_path()) { // Now invert the order of the mappings, and for each mapping, flip the // is_reverse flag. The edits within mappings also get put in reverse // order, get their positions corrected, and get their sequences get // reverse complemented. *reversed.mutable_path() = reverse_path(aln.path(), node_length); } return reversed; }
Alignment strip_from_end(const Alignment& aln, size_t drop) { if (!drop) return aln; Alignment res; res.set_name(aln.name()); res.set_score(aln.score()); //cerr << "drop " << drop << " from end" << endl; size_t cut_at = aln.sequence().size()-drop; //cerr << "Cut at " << cut_at << endl; res.set_sequence(aln.sequence().substr(0, cut_at)); if (!aln.has_path()) return res; *res.mutable_path() = cut_path(aln.path(), cut_at).first; assert(res.has_path()); if (alignment_to_length(res) != res.sequence().size()) { cerr << "failed!!! drop from end 轰" << endl; cerr << pb2json(res) << endl << endl; assert(false); } return res; }
// merge that properly handles long indels // assumes that alignments should line up end-to-end Alignment merge_alignments(const vector<Alignment>& alns, bool debug) { if (alns.size() == 0) { Alignment aln; return aln; } else if (alns.size() == 1) { return alns.front(); } // where possible get node and target lengths // to validate after merge /* map<int64_t, map<size_t, set<const Alignment*> > > node_lengths; map<int64_t, map<size_t, set<const Alignment*> > > to_lengths; for (auto& aln : alns) { auto& path = aln.path(); // find a mapping that overlaps the whole node // note that edits aren't simplified // so deletions are intact for (size_t i = 0; i < path.mapping_size(); ++i) { auto& m = path.mapping(i); if (m.position().offset() == 0) { // can we see if the next mapping is on the following node if (i < path.mapping_size()-1 && path.mapping(i+1).position().offset() == 0 && mapping_from_length(path.mapping(i+1)) && mapping_from_length(m)) { // we cover the node, record the to_length and from_length set<const Alignment*>& n = node_lengths[m.position().node_id()][from_length(m)]; n.insert(&aln); set<const Alignment*>& t = to_lengths[m.position().node_id()][to_length(m)]; t.insert(&aln); } } } } // verify our input by checking for disagreements for (auto& n : node_lengths) { auto& node_id = n.first; if (n.second.size() > 1) { cerr << "disagreement in node lengths for " << node_id << endl; for (auto& l : n.second) { cerr << "alignments that report length of " << l.first << endl; for (auto& a : l.second) { cerr << pb2json(*a) << endl; } } } else { //cerr << n.second.begin()->second.size() << " alignments support " // << n.second.begin()->first << " as length for " << node_id << endl; } } */ // parallel merge algorithm // for each generation // merge 0<-0+1, 1<-2+3, ... // until there is only one alignment vector<Alignment> last = alns; // get the alignments ready for merge #pragma omp parallel for for (size_t i = 0; i < last.size(); ++i) { Alignment& aln = last[i]; //cerr << "on " << i << "th aln" << endl // << pb2json(aln) << endl; if (!aln.has_path()) { Mapping m; Edit* e = m.add_edit(); e->set_to_length(aln.sequence().size()); e->set_sequence(aln.sequence()); *aln.mutable_path()->add_mapping() = m; } } while (last.size() > 1) { //cerr << "last size " << last.size() << endl; size_t new_count = last.size()/2; //cerr << "new count b4 " << new_count << endl; new_count += last.size() % 2; // force binary //cerr << "New count = " << new_count << endl; vector<Alignment> curr; curr.resize(new_count); #pragma omp parallel for for (size_t i = 0; i < curr.size(); ++i) { //cerr << "merging " << 2*i << " and " << 2*i+1 << endl; // take a pair from the old alignments // merge them into this one if (2*i+1 < last.size()) { auto& a1 = last[2*i]; auto& a2 = last[2*i+1]; curr[i] = merge_alignments(a1, a2, debug); // check that the merge did the right thing /* auto& a3 = curr[i]; for (size_t j = 0; j < a3.path().mapping_size()-1; ++j) { // look up reported node length // and compare to what we saw // skips last mapping auto& m = a3.path().mapping(j); if (from_length(m) == to_length(m) && m.has_position() && m.position().offset()==0 && a3.path().mapping(j+1).has_position() && a3.path().mapping(j+1).position().offset()==0) { auto nl = node_lengths.find(m.position().node_id()); if (nl != node_lengths.end()) { if (nl->second.find(from_length(m)) == nl->second.end()) { cerr << "node length is not consistent for " << m.position().node_id() << endl; cerr << "expected " << nl->second.begin()->first << endl; cerr << "got " << from_length(m) << endl; cerr << "inputs:" << endl << pb2json(a1) << endl << pb2json(a2) << endl << "output: " << endl << pb2json(a3) << endl; //exit(1); } } } } */ } else { auto& a1 = last[2*i]; //cerr << "no need to merge" << endl; curr[i] = a1; } } last = curr; } Alignment res = last.front(); *res.mutable_path() = simplify(res.path()); return res; }
void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, Graph& g, int64_t pinned_node_id, bool pin_left, int32_t max_alt_alns, bool print_score_matrices) { // check input integrity if (pin_left && !pinned_node_id) { cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl; exit(EXIT_FAILURE); } if (multi_alignments && !pinned_node_id) { cerr << "error:[Aligner] multiple traceback is not valid in local alignment, only pinned and global" << endl; exit(EXIT_FAILURE); } if (!(multi_alignments) && max_alt_alns != 1) { cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl; exit(EXIT_FAILURE); } // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top // left we need to reverse all the sequences first and translate the alignment back later // create reversed objects if necessary Graph reversed_graph; string reversed_sequence; if (pin_left) { reversed_sequence.resize(alignment.sequence().length()); reverse_copy(alignment.sequence().begin(), alignment.sequence().end(), reversed_sequence.begin()); reverse_graph(g, reversed_graph); } // choose forward or reversed objects Graph* align_graph; string* align_sequence; if (pin_left) { align_graph = &reversed_graph; align_sequence = &reversed_sequence; } else { align_graph = &g; align_sequence = alignment.mutable_sequence(); } // convert into gssw graph and get the counterpart to pinned node (if pinning) gssw_node* pinned_node = nullptr; gssw_graph* graph = create_gssw_graph(*align_graph, pinned_node_id, &pinned_node); if (pinned_node_id & !pinned_node) { cerr << "error:[Aligner] pinned node for pinned alignment is not in graph" << endl; exit(EXIT_FAILURE); } // perform dynamic programming gssw_graph_fill(graph, (*align_sequence).c_str(), nt_table, score_matrix, gap_open, gap_extension, 15, 2); // traceback either from pinned position or optimal local alignment if (pinned_node) { // trace back pinned alignment gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph, pinned_node, max_alt_alns, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); if (pin_left) { // translate graph and mappings into original node space unreverse_graph(reversed_graph); for (int32_t i = 0; i < max_alt_alns; i++) { unreverse_graph_mapping(gms[i]); } } // convert optimal alignment and store it in the input Alignment object (in the multi alignment, // this will have been set to the first in the vector) if (gms[0]->score > 0) { // have a mapping, can just convert normally gssw_mapping_to_alignment(graph, gms[0], alignment, print_score_matrices); } else { // gssw will not identify mappings with 0 score, infer location based on pinning Mapping* mapping = alignment.mutable_path()->add_mapping(); mapping->set_rank(1); // locate at the end of the node Position* position = mapping->mutable_position(); position->set_node_id(pinned_node_id); position->set_offset(pin_left ? 0 : pinned_node->len); // soft clip Edit* edit = mapping->add_edit(); edit->set_to_length(alignment.sequence().length()); edit->set_sequence(alignment.sequence()); } if (multi_alignments) { // determine how many non-null alignments were returned int32_t num_non_null = max_alt_alns; for (int32_t i = 1; i < max_alt_alns; i++) { if (gms[i]->score <= 0) { num_non_null = i; break; } } // reserve to avoid illegal access errors that occur when the vector reallocates multi_alignments->reserve(num_non_null); // copy the primary alignment multi_alignments->emplace_back(alignment); // convert the alternate alignments and store them at the back of the vector (this will not // execute if we are doing single alignment) for (int32_t i = 1; i < num_non_null; i++) { gssw_graph_mapping* gm = gms[i]; // make new alignment object multi_alignments->emplace_back(); Alignment& next_alignment = multi_alignments->back(); // copy over sequence information from the primary alignment next_alignment.set_sequence(alignment.sequence()); next_alignment.set_quality(alignment.quality()); // get path of the alternate alignment gssw_mapping_to_alignment(graph, gm, next_alignment, print_score_matrices); } } for (int32_t i = 0; i < max_alt_alns; i++) { gssw_graph_mapping_destroy(gms[i]); } free(gms); } else { // trace back local alignment gssw_graph_mapping* gm = gssw_graph_trace_back (graph, (*align_sequence).c_str(), (*align_sequence).size(), nt_table, score_matrix, gap_open, gap_extension); gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices); gssw_graph_mapping_destroy(gm); } //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr); gssw_graph_destroy(graph); }
void Aligner::gssw_mapping_to_alignment(gssw_graph* graph, gssw_graph_mapping* gm, Alignment& alignment, bool print_score_matrices) { alignment.clear_path(); alignment.set_score(gm->score); alignment.set_query_position(0); Path* path = alignment.mutable_path(); //alignment.set_cigar(graph_cigar(gm)); gssw_graph_cigar* gc = &gm->cigar; gssw_node_cigar* nc = gc->elements; int to_pos = 0; int from_pos = gm->position; //cerr << "gm->position " << gm->position << endl; string& to_seq = *alignment.mutable_sequence(); //cerr << "-------------" << endl; if (print_score_matrices) { gssw_graph_print_score_matrices(graph, to_seq.c_str(), to_seq.size(), stderr); //cerr << alignment.DebugString() << endl; } for (int i = 0; i < gc->length; ++i, ++nc) { if (i > 0) from_pos = 0; // reset for each node after the first // check that the current alignment has a non-zero length gssw_cigar* c = nc->cigar; int l = c->length; if (l == 0) continue; gssw_cigar_element* e = c->elements; Node* from_node = (Node*) nc->node->data; string& from_seq = *from_node->mutable_sequence(); Mapping* mapping = path->add_mapping(); mapping->mutable_position()->set_node_id(nc->node->id); mapping->mutable_position()->set_offset(from_pos); mapping->set_rank(path->mapping_size()); //cerr << from_node->id() << ":" << endl; for (int j=0; j < l; ++j, ++e) { Edit* edit; int32_t length = e->length; //cerr << e->length << e->type << endl; switch (e->type) { case 'M': case 'X': case 'N': { // do the sequences match? // emit a stream of "SNPs" and matches int h = from_pos; int last_start = from_pos; int k = to_pos; for ( ; h < from_pos + length; ++h, ++k) { //cerr << h << ":" << k << " " << from_seq[h] << " " << to_seq[k] << endl; if (from_seq[h] != to_seq[k]) { // emit the last "match" region if (h-last_start > 0) { edit = mapping->add_edit(); edit->set_from_length(h-last_start); edit->set_to_length(h-last_start); } // set up the SNP edit = mapping->add_edit(); edit->set_from_length(1); edit->set_to_length(1); edit->set_sequence(to_seq.substr(k,1)); last_start = h+1; } } // handles the match at the end or the case of no SNP if (h-last_start > 0) { edit = mapping->add_edit(); edit->set_from_length(h-last_start); edit->set_to_length(h-last_start); } to_pos += length; from_pos += length; } break; case 'D': edit = mapping->add_edit(); edit->set_from_length(length); edit->set_to_length(0); from_pos += length; break; case 'I': edit = mapping->add_edit(); edit->set_from_length(0); edit->set_to_length(length); edit->set_sequence(to_seq.substr(to_pos, length)); to_pos += length; break; case 'S': // note that soft clips and insertions are semantically equivalent // and can only be differentiated by their position in the read // with soft clips coming at the start or end edit = mapping->add_edit(); edit->set_from_length(0); edit->set_to_length(length); edit->set_sequence(to_seq.substr(to_pos, length)); to_pos += length; break; default: cerr << "error:[Aligner::gssw_mapping_to_alignment] " << "unsupported cigar op type " << e->type << endl; exit(1); break; } } //cerr << "path to_length " << path_to_length(*path) << endl; } // set identity alignment.set_identity(identity(alignment.path())); }