/// Samples an alignment whose read sequence should be `length` bases long,
/// passing it through mutate() with the given error rates.
///
/// @param length       desired length of the returned alignment's sequence
/// @param base_error   forwarded to mutate() as the base error rate
/// @param indel_error  forwarded to mutate() as the indel error rate; also
///                     used to decide how much extra sequence to sample
/// @return the sampled alignment with its identity field set from its path
///
/// If both rates are <= 0 the result is simply alignment(length). Otherwise
/// up to `maxiter` attempts are made: each one samples an over-long perfect
/// alignment, mutates it, and accepts it once the mutated sequence is at
/// least `length` long (trimming any excess with strip_from_end()). If no
/// attempt succeeds, a warning is written to stderr and the last (too short)
/// attempt is returned.
Alignment Sampler::alignment_with_error(size_t length,
                                        double base_error,
                                        double indel_error) {
    const size_t maxiter = 100;
    Alignment aln;
    if (base_error > 0 || indel_error > 0) {
        // sample a longer-than necessary alignment, then trim
        //
        // Success is tracked explicitly: comparing a post-incremented counter
        // against maxiter after the loop cannot distinguish "succeeded on the
        // last attempt" from "ran out of attempts".
        bool long_enough = false;
        for (size_t iter = 0; iter < maxiter && !long_enough; ++iter) {
            aln = mutate(
                alignment(length + 2 * (static_cast<double>(length) * indel_error)),
                base_error, indel_error);
            const size_t sampled = aln.sequence().size();
            if (sampled > length) {
                // too long: drop the surplus bases from the end
                aln = strip_from_end(aln, sampled - length);
            }
            long_enough = sampled >= length;
        }
        if (!long_enough) {
            cerr << "[vg::Sampler] Warning: could not generate alignment of sufficient length. "
                 << "Graph may be too small, or indel rate too high." << endl;
        }
    } else {
        aln = alignment(length);
    }
    aln.set_identity(identity(aln.path()));
    return aln;
}
// generates a perfect alignment from the graph Alignment Sampler::alignment(size_t length) { string seq; Alignment aln; Path* path = aln.mutable_path(); pos_t pos = position(); char c = pos_char(pos); // we do something wildly inefficient but conceptually clean // for each position in the mapping we add a mapping // at the end we will simplify the alignment, merging redundant mappings do { // add in the char for the current position seq += c; Mapping* mapping = path->add_mapping(); *mapping->mutable_position() = make_position(pos); Edit* edit = mapping->add_edit(); edit->set_from_length(1); edit->set_to_length(1); // decide the next position auto nextc = next_pos_chars(pos); // no new positions mean we are done; we've reached the end of the graph if (nextc.empty()) break; // what positions do we go to next? vector<pos_t> nextp; for (auto& n : nextc) nextp.push_back(n.first); // pick one at random uniform_int_distribution<int> next_dist(0, nextc.size()-1); // update our position pos = nextp.at(next_dist(rng)); // update our char c = nextc[pos]; } while (seq.size() < length); // save our sequence in the alignment aln.set_sequence(seq); aln = simplify(aln); { // name the alignment string data; aln.SerializeToString(&data); int n; #pragma omp critical(nonce) n = nonce++; data += std::to_string(n); const string hash = sha1head(data, 16); aln.set_name(hash); } // and simplify it aln.set_identity(identity(aln.path())); return aln; }
void Aligner::gssw_mapping_to_alignment(gssw_graph* graph, gssw_graph_mapping* gm, Alignment& alignment, bool print_score_matrices) { alignment.clear_path(); alignment.set_score(gm->score); alignment.set_query_position(0); Path* path = alignment.mutable_path(); //alignment.set_cigar(graph_cigar(gm)); gssw_graph_cigar* gc = &gm->cigar; gssw_node_cigar* nc = gc->elements; int to_pos = 0; int from_pos = gm->position; //cerr << "gm->position " << gm->position << endl; string& to_seq = *alignment.mutable_sequence(); //cerr << "-------------" << endl; if (print_score_matrices) { gssw_graph_print_score_matrices(graph, to_seq.c_str(), to_seq.size(), stderr); //cerr << alignment.DebugString() << endl; } for (int i = 0; i < gc->length; ++i, ++nc) { if (i > 0) from_pos = 0; // reset for each node after the first // check that the current alignment has a non-zero length gssw_cigar* c = nc->cigar; int l = c->length; if (l == 0) continue; gssw_cigar_element* e = c->elements; Node* from_node = (Node*) nc->node->data; string& from_seq = *from_node->mutable_sequence(); Mapping* mapping = path->add_mapping(); mapping->mutable_position()->set_node_id(nc->node->id); mapping->mutable_position()->set_offset(from_pos); mapping->set_rank(path->mapping_size()); //cerr << from_node->id() << ":" << endl; for (int j=0; j < l; ++j, ++e) { Edit* edit; int32_t length = e->length; //cerr << e->length << e->type << endl; switch (e->type) { case 'M': case 'X': case 'N': { // do the sequences match? 
// emit a stream of "SNPs" and matches int h = from_pos; int last_start = from_pos; int k = to_pos; for ( ; h < from_pos + length; ++h, ++k) { //cerr << h << ":" << k << " " << from_seq[h] << " " << to_seq[k] << endl; if (from_seq[h] != to_seq[k]) { // emit the last "match" region if (h-last_start > 0) { edit = mapping->add_edit(); edit->set_from_length(h-last_start); edit->set_to_length(h-last_start); } // set up the SNP edit = mapping->add_edit(); edit->set_from_length(1); edit->set_to_length(1); edit->set_sequence(to_seq.substr(k,1)); last_start = h+1; } } // handles the match at the end or the case of no SNP if (h-last_start > 0) { edit = mapping->add_edit(); edit->set_from_length(h-last_start); edit->set_to_length(h-last_start); } to_pos += length; from_pos += length; } break; case 'D': edit = mapping->add_edit(); edit->set_from_length(length); edit->set_to_length(0); from_pos += length; break; case 'I': edit = mapping->add_edit(); edit->set_from_length(0); edit->set_to_length(length); edit->set_sequence(to_seq.substr(to_pos, length)); to_pos += length; break; case 'S': // note that soft clips and insertions are semantically equivalent // and can only be differentiated by their position in the read // with soft clips coming at the start or end edit = mapping->add_edit(); edit->set_from_length(0); edit->set_to_length(length); edit->set_sequence(to_seq.substr(to_pos, length)); to_pos += length; break; default: cerr << "error:[Aligner::gssw_mapping_to_alignment] " << "unsupported cigar op type " << e->type << endl; exit(1); break; } } //cerr << "path to_length " << path_to_length(*path) << endl; } // set identity alignment.set_identity(identity(alignment.path())); }