/** Merge the specified two contigs, default overlap is k-1, * generate a consensus sequence of the overlapping region. The result * is stored in the first argument. */ static void mergeContigs(const Graph& g, unsigned overlap, Sequence& seq, const Sequence& s, const ContigNode& node, const Path& path) { assert(s.length() > overlap); Sequence ao; Sequence bo(s, 0, overlap); Sequence o; do { assert(seq.length() > overlap); ao = seq.substr(seq.length() - overlap); o = createConsensus(ao, bo); } while (o.empty() && chomp(seq, 'n')); if (o.empty()) { cerr << "warning: the head of " << get(vertex_name, g, node) << " does not match the tail of the previous contig\n" << ao << '\n' << bo << '\n' << path << endl; seq += 'n'; seq += s; } else { seq.resize(seq.length() - overlap); seq += o; seq += Sequence(s, overlap); } }
/// specify the set of possible values (to do before creating the row filter) void CellTranslatorString::setUserValues(const Sequence<std::string> &values, bool check_database) { // clear all current data __strings.clear(); __max_value = 0; if (__user_values != nullptr) { delete __user_values; __user_values = nullptr; } // set the internal structures according to the method's parameters __check_database = check_database; if (!check_database) { if (values.empty()) { __check_database = true; } else { // if we do not want to parse the database, store all the values directly DBCell cell; for (const auto &str : values) { cell.setStringSafe(str); __strings.insert(cell.getStringIndex(), __max_value); ++__max_value; } } } else { // if we specified values, store them if (!values.empty()) { __user_values = new Sequence<std::string>(values); } } }
/**
 * Add StitchSegment curves to @a source so that its end points line up
 * with the curves at @a first_replaced / @a last_replaced.
 *
 * Non-empty @a source:
 *  - unless first_replaced is the first curve of the path, a segment from
 *    the initial point of *first_replaced to the initial point of
 *    source.front() is prepended when those points differ;
 *  - unless last_replaced is the last curve of the path, a segment from the
 *    final point of source.back() to the final point of *last_replaced is
 *    appended when those points differ.
 *
 * Empty @a source: when the range is non-empty and touches neither the
 * first nor the last curve of the path, a single segment from the final
 * point of *(last_replaced-1) to the initial point of *first_replaced is
 * inserted if those points differ.
 */
void Path::stitch(Sequence::iterator first_replaced,
                  Sequence::iterator last_replaced,
                  Sequence &source)
{
  if (source.empty()) {
    const bool interior_range = first_replaced != last_replaced
                             && first_replaced != get_curves().begin()
                             && last_replaced != get_curves().end()-1;
    if (interior_range
        && (*first_replaced)->initialPoint() != (*(last_replaced-1))->finalPoint())
    {
      Curve *bridge = new StitchSegment((*(last_replaced-1))->finalPoint(),
                                        (*first_replaced)->initialPoint());
      source.insert(source.begin(), boost::shared_ptr<Curve>(bridge));
    }
    return;
  }

  // Leading stitch.
  if (first_replaced != get_curves().begin()
      && (*first_replaced)->initialPoint() != source.front()->initialPoint())
  {
    Curve *lead = new StitchSegment((*first_replaced)->initialPoint(),
                                    source.front()->initialPoint());
    source.insert(source.begin(), boost::shared_ptr<Curve>(lead));
  }

  // Trailing stitch.
  if (last_replaced != (get_curves().end()-1)
      && (*last_replaced)->finalPoint() != source.back()->finalPoint())
  {
    Curve *trail = new StitchSegment(source.back()->finalPoint(),
                                     (*last_replaced)->finalPoint());
    source.insert(source.end(), boost::shared_ptr<Curve>(trail));
  }
}
/** Append the sequence of contig v to seq. */ static void mergeContigs(const Graph& g, const Contigs& contigs, vertex_descriptor u, vertex_descriptor v, Sequence& seq, const ContigPath& path) { int d = get(edge_bundle, g, u, v).distance; assert(d < 0); unsigned overlap = -d; const Sequence& s = sequence(contigs, v); assert(s.length() > overlap); Sequence ao; Sequence bo(s, 0, overlap); Sequence o; do { assert(seq.length() > overlap); ao = seq.substr(seq.length() - overlap); o = createConsensus(ao, bo); if (!o.empty()) { seq.resize(seq.length() - overlap); seq += o; seq += Sequence(s, overlap); return; } } while (chomp(seq, 'n')); // Try an overlap alignment. if (opt::verbose > 2) cerr << '\n'; vector<overlap_align> overlaps; alignOverlap(ao, bo, 0, overlaps, false, opt::verbose > 2); bool good = false; if (!overlaps.empty()) { assert(overlaps.size() == 1); const overlap_align& o = overlaps.front(); unsigned matches = o.overlap_match; const string& consensus = o.overlap_str; float identity = (float)matches / consensus.size(); good = matches >= opt::minOverlap && identity >= opt::minIdentity; if (opt::verbose > 2) cerr << matches << " / " << consensus.size() << " = " << identity << (matches < opt::minOverlap ? " (too few)" : identity < opt::minIdentity ? " (too low)" : " (good)") << '\n'; } if (good) { assert(overlaps.size() == 1); const overlap_align& o = overlaps.front(); seq.erase(seq.length() - overlap + o.overlap_t_pos); seq += o.overlap_str; seq += Sequence(s, o.overlap_h_pos + 1); } else { cerr << "warning: the head of " << get(vertex_name, g, v) << " does not match the tail of the previous contig\n" << ao << '\n' << bo << '\n' << path << endl; seq += 'n'; seq += s; } }
// Erase the element at `pos` without preserving element order: the last
// element is copied over `pos` (unless `pos` already is the last element)
// and the container is then shortened by one with pop_back().
//
// Preconditions (asserted): `s` is not empty and `pos` is not s.end().
void erase_unordered( Sequence& s, typename Sequence::iterator pos )
{
    assert( ! s.empty() && pos != s.end() );

    typename Sequence::iterator tail = s.end();
    --tail;

    if ( pos == tail )
    {
        // Removing the last element: nothing to relocate.
        s.pop_back();
        return;
    }

    *pos = *tail;   // candidate for: *pos = std::move( *tail );
    s.pop_back();
}
// Sequence-container overload: remove every element of `c` for which
// `p` returns true, using std::remove_if followed by erase.
// The two trailing tag parameters are unnamed and unused in the body.
void erase_if_dispatch(Sequence& c, Predicate p, sequence_tag, IteratorStability)
{
    // An empty container is left untouched; erase() is never called on it.
    if (c.empty())
        return;

    c.erase(std::remove_if(c.begin(), c.end(), p), c.end());
}
/// default constructor CellTranslatorString::CellTranslatorString(Sequence<std::string> values, bool check_database) : __check_database(check_database) { if (!check_database) { if (values.empty()) { __check_database = true; } else { // if we do not want to parse the database, store all the values directly DBCell cell; for (const auto &str : values) { cell.setStringSafe(str); __strings.insert(cell.getStringIndex(), __max_value); ++__max_value; } } } else { // if we specified values, store them if (!values.empty()) { __user_values = new Sequence<std::string>(std::move(values)); } } }
// Replace `query` with the history entry whose index it designates.
//
// `query` is parsed with strtk::parse using space and tab as delimiters,
// into an ignore_token followed by an index (presumably a command word
// followed by a number - confirm against the caller). On success the
// selected entry is copied into `query` and echoed to std::cout.
//
// Returns false, leaving `query` unchanged, when the history is empty,
// when parsing fails, or when the index is out of range.
bool lookup_history(const Sequence<std::string,Allocator>& query_history, std::string& query)
{
   if (query_history.empty())
      return false;

   static strtk::ignore_token ignore;
   std::size_t query_index = 0;

   const bool parsed = strtk::parse(query," \t",ignore,query_index);
   if (!parsed || (query_index >= query_history.size()))
      return false;

   query = query_history[query_index];
   std::cout << "Query: " << query << std::endl;
   return true;
}
/** Merge the sequences of the contigs of the specified path.
 *
 * The merged sequence starts as the sequence of the first contig. Each
 * following contig is merged onto it with mergeContigs(), using the
 * negated distance of the edge from its predecessor as the overlap
 * length; that distance is asserted to be negative.
 *
 * @return the merged sequence, which is empty for an empty path
 */
static Sequence mergePath(const Graph&g, const Path& path)
{
    Sequence seq;
    for (Path::const_iterator it = path.begin();
            it != path.end(); ++it) {
        if (seq.empty()) {
            seq = getSequence(*it);
        } else {
            // seq is empty at the first element, so a predecessor exists.
            assert(it != path.begin());
            int d = get(edge_bundle, g, *(it-1), *it).distance;
            assert(d < 0);
            unsigned overlap = -d;
            mergeContigs(g, overlap, seq, getSequence(*it), *it, path);
        }
    }
    return seq;
}
/** Merge the specified path into a single contig.
 *
 * The sequences of the path's contigs are merged in order with
 * mergeContigs(), and the coverage of every non-ambiguous node is
 * summed. The comment of the returned contig is the merged length, the
 * summed coverage and the output of pathToComment(), separated by
 * single spaces.
 */
static Contig mergePath(const Graph& g, const Contigs& contigs, const ContigPath& path)
{
    Sequence merged;
    unsigned totalCoverage = 0;
    for (ContigPath::const_iterator node = path.begin();
            node != path.end(); ++node) {
        if (!node->ambiguous())
            totalCoverage += g[*node].coverage;

        if (merged.empty()) {
            merged = sequence(contigs, *node);
            continue;
        }

        assert(node != path.begin());
        mergeContigs(g, contigs, *(node-1), *node, merged, path);
    }

    ostringstream comment;
    comment << merged.size() << ' ' << totalCoverage << ' ';
    pathToComment(comment, g, path);
    return Contig(comment.str(), merged);
}
/**
 * Read the next record from fHandler_ into r.
 *
 * A record is read as one id line followed by ONE sequence line: the
 * sequence loop stops at the first '\n' or '\r'.
 *
 * @param r       receives the id (via r.id()) and the sequence (via
 *                r.fowardSeq()); it is cleared first if not empty.
 * @param success set to false on entry and to true only when a whole
 *                record was read.
 * @param done    set to false on entry; on normal return it holds
 *                fHandler_->eof(), which is also stored in is_done_.
 *
 * @throws int 1, after printing an error to std::cerr, whenever a read
 *         returns a negative value before the record is complete, or when
 *         no '>' is found at the start of the input. In those cases r is
 *         cleared, success is false and done is true.
 *
 * NOTE(review): a negative value from fHandler_->get() is presumably
 * end-of-input; if so, a last sequence line without a trailing newline is
 * reported as an error - confirm this is intended.
 */
void FastaReader::NextRead(Sequence &r, bool &success, bool &done) {
    if (!r.empty()) r.clear();
    int c;
    success = false;
    done = false;
    // Pick off the first at
    // (only done once, while first_ is still set)
    if(this->first_) {
        c = fHandler_->get();
        if(c != '>') {
            // Not at a header yet: retry from whatever getPastNewline()
            // returns (presumably the first character after the next
            // newline - confirm against the file handler).
            c = fHandler_->getPastNewline();
            if(c < 0) {
                // The outputs are set here; the check just below then
                // throws, because c is still not '>'.
                r.clear();success = false; done = true;
            }
        }
        if(c != '>') {
            std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
            throw 1;
        }
        assert_eq('>', c);
        first_ = false;
    }
    // Read to the end of the id line, sticking everything after the '>'
    // into *name
    string& id = r.id();
    while(true) {
        c = fHandler_->get();
        if(c < 0) {
            // Input ended inside the id line.
            r.clear(); success = false; done = true;
            std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
            throw 1;
        }
        if(c == '\n' || c == '\r') {
            // Break at end of line, after consuming all \r's, \n's
            while(c == '\n' || c == '\r') {
                c = fHandler_->get();
                if(c < 0) {
                    // Input ended right after the id line: no sequence.
                    r.clear(); success = false; done = true;
                    std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
                    throw 1;
                }
            }
            break;
        }
        // Any '>' on the id line is skipped rather than stored. When
        // first_ is not set this is also what consumes the record's
        // leading '>'.
        if(c == '>') continue;
        id += c;
        //name.append(c);
    }
    // fb_ now points just past the first character of a
    // sequence line, and c holds the first character
    BTDnaString& sbuf = r.fowardSeq();
    while(true){
        c = toupper(c);
        if(c < 0){
            // Input ended before the sequence line's terminating newline.
            r.clear(); success = false; done = true;
            std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
            throw 1;
        }
        else{
            // End of the (single) sequence line.
            if(c == '\n' || c == '\r') {
                break;
            }
            // Translate the upper-cased character through the asc2dna
            // table and append the result.
            sbuf.append(asc2dna[c]);
            c = fHandler_->get();
        }
    }
    success = true;
    done = fHandler_->eof();
    is_done_ = done;
}
/** Read a single record. */ Sequence FastaReader::read(string& id, string& comment, char& anchor, string& q) { next_record: id.clear(); comment.clear(); anchor = 0; q.clear(); // Discard comments. while (peek() == '#') ignoreLines(1); signed char recordType = peek(); Sequence s; unsigned qualityOffset = 0; if (eof() || recordType == EOF || ftell(m_in) >= m_end) { string header; getline(header); return s; } else if (recordType == '>' || recordType == '@') { // Read the header. string header; getline(header); istringstream headerStream(header); headerStream >> recordType >> id >> ws; std::getline(headerStream, comment); // Ignore SAM headers. if (id.length() == 2 && isupper(id[0]) && isupper(id[1]) && comment.length() > 2 && comment[2] == ':') goto next_record; // Casava FASTQ format if (comment.size() > 3 && comment[1] == ':' && comment[3] == ':') { // read, chastity, flags, index: 1:Y:0:AAAAAA if (opt::chastityFilter && comment[2] == 'Y') { m_unchaste++; if (recordType == '@') { ignoreLines(3); } else { while (peek() != '>' && peek() != '#' && ignoreLines(1)) ; } goto next_record; } if (id.size() > 2 && id.rbegin()[1] != '/') { // Add the read number to the ID. id += '/'; id += comment[0]; } } getline(s); if (recordType == '>') { // Read a multi-line FASTA record. string line; while (peek() != '>' && peek() != '#' && getline(line)) s += line; if (eof()) clear(); } if (recordType == '@') { char c = peek(); if (c != '+') { die() << s << '\n' << header << '\n'; string line; getline(line); die() << "expected `+' and saw "; if (eof()) cerr << "end-of-file\n"; else cerr << "`" << c << "' near\n" << c << line << "\n"; exit(EXIT_FAILURE); } ignoreLines(1); getline(q); } else q.clear(); if (s.empty()) { die() << "sequence with ID `" << id << "' is empty\n"; exit(EXIT_FAILURE); } if (s.length() < opt::minLength) { goto next_record; } bool colourSpace = isColourSpace(s); if (colourSpace && !isdigit(s[0])) { // The first character is the primer base. 
The second // character is the dibase read of the primer and the // first base of the sample, which is not part of the // assembly. assert(s.length() > 2); anchor = colourToNucleotideSpace(s[0], s[1]); s.erase(0, 2); q.erase(0, 1); } if (!q.empty()) checkSeqQual(s, q); if (opt::trimMasked && !colourSpace) { // Removed masked (lower case) sequence at the beginning // and end of the read. size_t trimFront = 0; while (trimFront <= s.length() && islower(s[trimFront])) trimFront++; size_t trimBack = s.length(); while (trimBack > 0 && islower(s[trimBack - 1])) trimBack--; s.erase(trimBack); s.erase(0, trimFront); if (!q.empty()) { q.erase(trimBack); q.erase(0, trimFront); } } if (flagFoldCase()) transform(s.begin(), s.end(), s.begin(), ::toupper); qualityOffset = 33; } else {