/// Build a mutated copy of an alignment by passing every edit of every
/// mapping through mutate_edit() with the given base and indel error rates.
///
/// When both rates are exactly zero the input is returned unchanged.
/// Otherwise a new alignment is assembled: each new mapping carries over the
/// original mapping's position and receives whatever edits mutate_edit()
/// produces. The result is simplified, its sequence is re-derived from its
/// path, and it takes the original alignment's name.
Alignment Sampler::mutate(const Alignment& aln,
                          double base_error,
                          double indel_error) {

    // no errors requested: nothing to mutate
    if (base_error == 0 && indel_error == 0) return aln;

    // alphabet and distributions handed to mutate_edit()
    string bases = "ATGC";
    uniform_real_distribution<double> rprob(0, 1);
    uniform_int_distribution<int> rbase(0, 3);

    Alignment mutaln;
    Path* mutated_path = mutaln.mutable_path();

    for (const auto& orig_mapping : aln.path().mapping()) {
        // start a fresh mapping anchored at the same position
        Mapping* new_mapping = mutated_path->add_mapping();
        *new_mapping->mutable_position() = orig_mapping.position();

        // mutate each edit in turn and append the replacement edits in order
        for (const auto& orig_edit : orig_mapping.edit()) {
            auto new_edits = mutate_edit(orig_edit,
                                         make_pos_t(orig_mapping.position()),
                                         base_error, indel_error,
                                         bases, rprob, rbase);
            for (auto& edit : new_edits) {
                *new_mapping->add_edit() = edit;
            }
        }
    }

    // merge redundant mappings, then re-derive the alignment's sequence
    mutaln = simplify(mutaln);
    mutaln.set_sequence(alignment_seq(mutaln));
    mutaln.set_name(aln.name());
    return mutaln;
}
/// Convert one BAM record into an Alignment.
///
/// Fills in the read name (with a /1 and/or /2 suffix taken from the
/// first/last-read flags), the decoded base sequence, the quality bytes copied
/// verbatim from the record, and the secondary flag. When the record carries
/// an RG tag and `rg_sample` maps it to a non-empty sample name, the sample
/// name and read group are set as well.
///
/// @param b          BAM record to convert.
/// @param rg_sample  read group -> sample name table. Looked up with
///                   operator[], so an unseen read group is inserted with an
///                   empty sample name (unchanged from the previous behaviour).
/// @return the populated Alignment (no path is set here).
Alignment bam_to_alignment(const bam1_t *b, map<string, string>& rg_sample) {

    Alignment alignment;

    // get the sequence and qual
    int32_t lqseq = b->core.l_qseq;
    string sequence; sequence.resize(lqseq);

    uint8_t* qualptr = bam_get_qual(b);
    string quality;
    quality.assign((char*)qualptr, lqseq);

    // process the sequence into chars: each packed base code indexes the
    // 16-entry nucleotide table
    uint8_t* seqptr = bam_get_seq(b);
    for (int i = 0; i < lqseq; ++i) {
        sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)];
    }

    // get the read group and sample name.
    // bam_aux_get() returns NULL when the record has no RG tag, so the tag
    // value (which follows the one-byte type code) is only read when present.
    uint8_t *rgptr = bam_aux_get(b, "RG");
    string read_group;
    string sname;
    if (rgptr != nullptr) {
        read_group = (char*) (rgptr+1);
        if (!rg_sample.empty()) {
            sname = rg_sample[read_group];
        }
    }

    // Now name the read after the scaffold
    string read_name = bam_get_qname(b);

    // Decide if we are a first read (/1) or second (last) read (/2)
    if(b->core.flag & BAM_FREAD1) {
        read_name += "/1";
    }
    if(b->core.flag & BAM_FREAD2) {
        read_name += "/2";
    }

    // If we are marked as both first and last we get /1/2, and if we are marked
    // as neither the scaffold name comes through unchanged as the read name.
    // TODO: produce correct names for intermediate reads on >2 read scaffolds.

    // add features to the alignment
    alignment.set_name(read_name);
    alignment.set_sequence(sequence);
    alignment.set_quality(quality);

    // TODO: htslib doesn't wrap this flag for some reason.
    alignment.set_is_secondary(b->core.flag & BAM_FSECONDARY);
    if (sname.size()) {
        alignment.set_sample_name(sname);
        alignment.set_read_group(read_group);
    }

    return alignment;
}
// generates a perfect alignment from the graph Alignment Sampler::alignment(size_t length) { string seq; Alignment aln; Path* path = aln.mutable_path(); pos_t pos = position(); char c = pos_char(pos); // we do something wildly inefficient but conceptually clean // for each position in the mapping we add a mapping // at the end we will simplify the alignment, merging redundant mappings do { // add in the char for the current position seq += c; Mapping* mapping = path->add_mapping(); *mapping->mutable_position() = make_position(pos); Edit* edit = mapping->add_edit(); edit->set_from_length(1); edit->set_to_length(1); // decide the next position auto nextc = next_pos_chars(pos); // no new positions mean we are done; we've reached the end of the graph if (nextc.empty()) break; // what positions do we go to next? vector<pos_t> nextp; for (auto& n : nextc) nextp.push_back(n.first); // pick one at random uniform_int_distribution<int> next_dist(0, nextc.size()-1); // update our position pos = nextp.at(next_dist(rng)); // update our char c = nextc[pos]; } while (seq.size() < length); // save our sequence in the alignment aln.set_sequence(seq); aln = simplify(aln); { // name the alignment string data; aln.SerializeToString(&data); int n; #pragma omp critical(nonce) n = nonce++; data += std::to_string(n); const string hash = sha1head(data, 16); aln.set_name(hash); } // and simplify it aln.set_identity(identity(aln.path())); return aln; }
/// Return a copy of `aln` with the first `drop` bases of its sequence removed.
///
/// A drop of zero returns the input unchanged. Otherwise the result carries
/// only the name, score, trimmed sequence and (when the input has a path) the
/// second half of cut_path(path, drop). If the trimmed path's to-length does
/// not match the trimmed sequence length, the alignment is dumped as JSON to
/// stderr and an assertion fires.
Alignment strip_from_start(const Alignment& aln, size_t drop) {
    if (drop == 0) {
        return aln;
    }

    Alignment trimmed;
    trimmed.set_name(aln.name());
    trimmed.set_score(aln.score());
    trimmed.set_sequence(aln.sequence().substr(drop));

    if (aln.has_path()) {
        // keep everything after the cut point
        *trimmed.mutable_path() = cut_path(aln.path(), drop).second;
        assert(trimmed.has_path());

        // sanity check: the path must still describe the whole trimmed sequence
        const bool lengths_agree =
            alignment_to_length(trimmed) == trimmed.sequence().size();
        if (!lengths_agree) {
            cerr << "failed!!! drop from start 轰" << endl;
            cerr << pb2json(trimmed) << endl << endl;
            assert(false);
        }
    }

    return trimmed;
}
/// Return the reverse-strand version of an alignment.
///
/// The result is a copy of `aln` whose sequence is reverse-complemented, whose
/// quality string (if any) is reversed so each quality value stays with its
/// base, and whose path (if any) is replaced by reverse_path(path, node_length).
///
/// @param aln          alignment to reverse.
/// @param node_length  callback passed through to reverse_path(); presumably
///                     maps a node id to that node's length — confirm against
///                     reverse_path().
Alignment reverse_alignment(const Alignment& aln, const function<int64_t(int64_t)>& node_length) {
    // We're going to reverse the alignment and all its mappings.
    // TODO: should we/can we do this in place?

    Alignment reversed = aln;
    reversed.set_sequence(reverse_complement(aln.sequence()));

    // The copied quality string is still in the original orientation; flip it
    // so it lines up with the reverse-complemented sequence. Left untouched
    // when the alignment has no quality.
    const string& quality = aln.quality();
    if (!quality.empty()) {
        reversed.set_quality(string(quality.rbegin(), quality.rend()));
    }

    if(aln.has_path()) {
        // Invert the path: reverse_path() handles the mapping order, the
        // orientation flags and the edits within each mapping.
        *reversed.mutable_path() = reverse_path(aln.path(), node_length);
    }

    return reversed;
}
/// Return a copy of `aln` with the last `drop` bases of its sequence removed.
///
/// A drop of zero returns the input unchanged. A drop larger than the sequence
/// is clamped to the sequence length, so it behaves like dropping every base
/// instead of underflowing the cut point. The result carries only the name,
/// score, trimmed sequence and (when the input has a path) the first half of
/// cut_path(path, cut_at). If the trimmed path's to-length does not match the
/// trimmed sequence length, the alignment is dumped as JSON to stderr and an
/// assertion fires.
Alignment strip_from_end(const Alignment& aln, size_t drop) {
    if (!drop) return aln;
    Alignment res;
    res.set_name(aln.name());
    res.set_score(aln.score());

    // Number of leading bases to keep. Computed without unsigned underflow:
    // size() - drop would wrap to a huge value when drop > size().
    const size_t seq_len = aln.sequence().size();
    const size_t cut_at = drop < seq_len ? seq_len - drop : 0;

    res.set_sequence(aln.sequence().substr(0, cut_at));
    if (!aln.has_path()) return res;

    // keep everything before the cut point
    *res.mutable_path() = cut_path(aln.path(), cut_at).first;
    assert(res.has_path());

    // sanity check: the path must still describe the whole trimmed sequence
    if (alignment_to_length(res) != res.sequence().size()) {
        cerr << "failed!!! drop from end 轰" << endl;
        cerr << pb2json(res) << endl << endl;
        assert(false);
    }
    return res;
}
/// Read the next four-line FASTQ record from `fp` into `alignment`.
///
/// The alignment is cleared first. The name line has its first character
/// (the leading '@') removed and any trailing /1 or /2 kept; the sequence line
/// is stored as-is; the separator line is read and ignored; the quality line
/// is converted with string_quality_char_to_short().
///
/// @param fp         gzip stream to read from.
/// @param buffer     caller-provided scratch buffer used for each line.
/// @param len        size of `buffer`, passed to gzgets().
/// @param alignment  output; overwritten with the record that was read.
/// @return true when a record was read, false when no name line could be
///         read (end of input). A record that ends part-way through prints
///         an error and calls exit(1).
bool get_next_alignment_from_fastq(gzFile fp, char* buffer, size_t len, Alignment& alignment) {

    alignment.Clear();

    // Remove the line terminator (LF or CRLF) if there is one. Unlike blindly
    // overwriting the last byte, this is safe on an empty buffer and does not
    // eat a real character when the final line has no trailing newline.
    auto chomp = [](char* line) {
        size_t n = strlen(line);
        while (n > 0 && (line[n-1] == '\n' || line[n-1] == '\r')) {
            line[--n] = '\0';
        }
    };

    // handle name
    if (gzgets(fp,buffer,len) != nullptr) {
        chomp(buffer);
        string name = buffer;
        if (!name.empty()) {
            name = name.substr(1); // trim off leading @
        }
        // keep trailing /1 /2
        alignment.set_name(name);
    } else {
        return false;
    }

    // handle sequence
    if (gzgets(fp,buffer,len) != nullptr) {
        chomp(buffer);
        alignment.set_sequence(buffer);
    } else {
        cerr << "[vg::alignment.cpp] error: incomplete fastq record" << endl;
        exit(1);
    }

    // handle "+" sep: the line only has to exist, its content is ignored
    if (gzgets(fp,buffer,len) == nullptr) {
        cerr << "[vg::alignment.cpp] error: incomplete fastq record" << endl;
        exit(1);
    }

    // handle quality
    if (gzgets(fp,buffer,len) != nullptr) {
        chomp(buffer);
        string quality = string_quality_char_to_short(buffer);
        alignment.set_quality(quality);
    } else {
        cerr << "[vg::alignment.cpp] error: incomplete fastq record" << endl;
        exit(1);
    }

    return true;
}