Exemplo n.º 1
0
/** Merge the specified two contigs, default overlap is k-1,
 * generate a consensus sequence of the overlapping region. The result
 * is stored in the first argument.
 */
static void mergeContigs(const Graph& g,
		unsigned overlap, Sequence& seq,
		const Sequence& s, const ContigNode& node, const Path& path)
{
	assert(s.length() > overlap);
	Sequence ao;
	Sequence bo(s, 0, overlap);
	Sequence o;
	do {
		assert(seq.length() > overlap);
		ao = seq.substr(seq.length() - overlap);
		o = createConsensus(ao, bo);
	} while (o.empty() && chomp(seq, 'n'));
	if (o.empty()) {
		cerr << "warning: the head of "
			<< get(vertex_name, g, node)
			<< " does not match the tail of the previous contig\n"
			<< ao << '\n' << bo << '\n' << path << endl;
		seq += 'n';
		seq += s;
	} else {
		seq.resize(seq.length() - overlap);
		seq += o;
		seq += Sequence(s, overlap);
	}
}
    /// Replaces the set of possible values (to be called before the row
    /// filter is created).
    ///
    /// All previously stored strings and user values are discarded first.
    /// @param values the strings the translator should know about
    /// @param check_database whether the database is to be parsed for
    /// values; forced to true when values is empty
    void CellTranslatorString::setUserValues(const Sequence<std::string> &values,
                                             bool check_database) {
      // forget whatever was stored by a previous call
      __strings.clear();
      __max_value = 0;
      if (__user_values != nullptr) {
        delete __user_values;
        __user_values = nullptr;
      }

      __check_database = check_database;

      if (check_database) {
        // the database will be parsed: just remember the user's values, if any
        if (!values.empty()) {
          __user_values = new Sequence<std::string>(values);
        }
        return;
      }

      if (values.empty()) {
        // nothing to translate from: fall back on parsing the database
        __check_database = true;
        return;
      }

      // no database parsing: register every value right away, numbering
      // them in the order in which they appear in values
      DBCell cell;
      for (const auto &str : values) {
        cell.setStringSafe(str);
        __strings.insert(cell.getStringIndex(), __max_value);
        ++__max_value;
      }
    }
Exemplo n.º 3
0
// Adds StitchSegment curves to `source` so that its end points line up
// with the curves designated by first_replaced / last_replaced.
//
// - Non-empty source: a segment is prepended when first_replaced is not
//   the first curve and its initial point differs from that of
//   source.front(); a segment is appended when last_replaced is not the
//   curve at end()-1 and its final point differs from that of
//   source.back().
// - Empty source: when the two iterators differ and neither sits at the
//   corresponding extremity, a single segment is inserted if the points
//   compared below do not coincide.
//
// NOTE(review): the non-empty case dereferences last_replaced while the
// empty case uses last_replaced-1 -- confirm which iterator convention
// (inclusive or exclusive) callers follow.
void Path::stitch(Sequence::iterator first_replaced,
                  Sequence::iterator last_replaced,
                  Sequence &source)
{
  if (source.empty()) {
    if ( first_replaced != last_replaced
         && first_replaced != get_curves().begin()
         && last_replaced != get_curves().end()-1
         && (*first_replaced)->initialPoint() != (*(last_replaced-1))->finalPoint() ) {
      Curve *bridge = new StitchSegment((*(last_replaced-1))->finalPoint(),
                                        (*first_replaced)->initialPoint());
      source.insert(source.begin(), boost::shared_ptr<Curve>(bridge));
    }
    return;
  }

  // Leading side.
  if ( first_replaced != get_curves().begin()
       && (*first_replaced)->initialPoint() != source.front()->initialPoint() ) {
    Curve *lead = new StitchSegment((*first_replaced)->initialPoint(),
                                    source.front()->initialPoint());
    source.insert(source.begin(), boost::shared_ptr<Curve>(lead));
  }

  // Trailing side.
  if ( last_replaced != (get_curves().end()-1)
       && (*last_replaced)->finalPoint() != source.back()->finalPoint() ) {
    Curve *trail = new StitchSegment(source.back()->finalPoint(),
                                     (*last_replaced)->finalPoint());
    source.insert(source.end(), boost::shared_ptr<Curve>(trail));
  }
}
Exemplo n.º 4
0
/** Append the sequence of contig v to seq. */
static void mergeContigs(const Graph& g, const Contigs& contigs,
		vertex_descriptor u, vertex_descriptor v,
		Sequence& seq, const ContigPath& path)
{
	int d = get(edge_bundle, g, u, v).distance;
	assert(d < 0);
	unsigned overlap = -d;
	const Sequence& s = sequence(contigs, v);
	assert(s.length() > overlap);
	Sequence ao;
	Sequence bo(s, 0, overlap);
	Sequence o;
	do {
		assert(seq.length() > overlap);
		ao = seq.substr(seq.length() - overlap);
		o = createConsensus(ao, bo);
		if (!o.empty()) {
			seq.resize(seq.length() - overlap);
			seq += o;
			seq += Sequence(s, overlap);
			return;
		}
	} while (chomp(seq, 'n'));

	// Try an overlap alignment.
	if (opt::verbose > 2)
		cerr << '\n';
	vector<overlap_align> overlaps;
	alignOverlap(ao, bo, 0, overlaps, false, opt::verbose > 2);
	bool good = false;
	if (!overlaps.empty()) {
		assert(overlaps.size() == 1);
		const overlap_align& o = overlaps.front();
		unsigned matches = o.overlap_match;
		const string& consensus = o.overlap_str;
		float identity = (float)matches / consensus.size();
		good = matches >= opt::minOverlap
			&& identity >= opt::minIdentity;
		if (opt::verbose > 2)
			cerr << matches << " / " << consensus.size()
				<< " = " << identity
				<< (matches < opt::minOverlap ? " (too few)"
						: identity < opt::minIdentity ? " (too low)"
						: " (good)") << '\n';
	}
	if (good) {
		assert(overlaps.size() == 1);
		const overlap_align& o = overlaps.front();
		seq.erase(seq.length() - overlap + o.overlap_t_pos);
		seq += o.overlap_str;
		seq += Sequence(s, o.overlap_h_pos + 1);
	} else {
		cerr << "warning: the head of " << get(vertex_name, g, v)
			<< " does not match the tail of the previous contig\n"
			<< ao << '\n' << bo << '\n' << path << endl;
		seq += 'n';
		seq += s;
	}
}
Exemplo n.º 5
0
// Removes the element at pos from s without preserving element order:
// the last element is copied over *pos (unless pos already is the last
// element) and the container is then shortened with pop_back().
// Preconditions (asserted): s is not empty and pos is not s.end().
void erase_unordered( Sequence& s, typename Sequence::iterator pos )
{
  assert( ! s.empty() );
  assert( pos != s.end() );

  typename Sequence::iterator tail = s.end();
  --tail;

  // Skip the self-assignment when the victim already is the last element.
  if ( pos != tail ) {
    *pos = *tail;
  }
  s.pop_back();
}
Exemplo n.º 6
0
  // Erases from the sequence container c every element for which p
  // returns true, using the remove_if/erase combination. The two tag
  // parameters are unnamed and only take part in overload selection.
  // An empty container is left untouched without calling remove_if or
  // erase at all.
  void erase_if_dispatch(Sequence& c, Predicate p,
                         sequence_tag, IteratorStability)
  {
    if (c.empty())
      return;
    c.erase(std::remove_if(c.begin(), c.end(), p), c.end());
  }
    /// Constructor taking the set of possible values.
    ///
    /// @param values the strings the translator should know about
    /// @param check_database whether the database is to be parsed for
    /// values; forced to true when values is empty
    ///
    /// NOTE(review): __max_value and __user_values are not initialised in
    /// this constructor; presumably they have in-class initialisers --
    /// confirm in the class declaration.
    CellTranslatorString::CellTranslatorString(Sequence<std::string> values,
                                               bool check_database)
        : __check_database(check_database) {

      if (check_database) {
        // the database will be parsed: keep the user's values, if any
        if (!values.empty()) {
          __user_values = new Sequence<std::string>(std::move(values));
        }
        return;
      }

      if (values.empty()) {
        // nothing to translate from: fall back on parsing the database
        __check_database = true;
        return;
      }

      // no database parsing: register every value right away, numbering
      // them in the order in which they appear in values
      DBCell cell;
      for (const auto &str : values) {
        cell.setStringSafe(str);
        __strings.insert(cell.getStringIndex(), __max_value);
        ++__max_value;
      }
    }
Exemplo n.º 8
0
bool lookup_history(const Sequence<std::string,Allocator>& query_history, std::string& query)
{
   if (query_history.empty())
      return false;

   static strtk::ignore_token ignore;
   std::size_t query_index = 0;

   if (!strtk::parse(query," \t",ignore,query_index))
      return false;
   else if (query_index >= query_history.size())
      return false;

   query = query_history[query_index];
   std::cout << "Query: " << query << std::endl;

   return true;
}
Exemplo n.º 9
0
/**
 * Merge the sequences of the contigs on path into a single sequence.
 *
 * The sequence of the first contig is copied; every following contig
 * is appended with mergeContigs, using the negated distance of the
 * edge from its predecessor as the overlap length (the distance is
 * asserted to be negative).
 *
 * @param g    graph holding the edge distances
 * @param path contigs to merge, in order
 * @return the merged sequence; empty when path is empty
 */
static Sequence mergePath(const Graph&g, const Path& path)
{
	Sequence seq;
	for (Path::const_iterator it = path.begin();
			it != path.end(); ++it) {
		if (seq.empty()) {
			seq = getSequence(*it);
		} else {
			// seq is always empty on the first iteration, so a
			// predecessor exists whenever this branch runs.
			assert(it != path.begin());
			int d = get(edge_bundle, g, *(it-1), *it).distance;
			assert(d < 0);
			unsigned overlap = -d;
			mergeContigs(g, overlap, seq,
					getSequence(*it), *it, path);
		}
	}
	return seq;
}
Exemplo n.º 10
0
/**
 * Merge the contigs of the specified path into a single contig.
 *
 * The coverage of every non-ambiguous path element is summed. The
 * sequence of the first element is copied and each later element is
 * appended with mergeContigs. The comment of the returned contig is
 * "<length> <coverage> " followed by whatever pathToComment writes.
 *
 * @param g       graph holding coverage and edge properties
 * @param contigs source of the contig sequences
 * @param path    contigs to merge, in order
 * @return a Contig built from the comment string and merged sequence
 */
static Contig mergePath(const Graph& g, const Contigs& contigs,
		const ContigPath& path)
{
	Sequence merged;
	unsigned totalCoverage = 0;
	ContigPath::const_iterator cur = path.begin();
	while (cur != path.end()) {
		if (!cur->ambiguous())
			totalCoverage += g[*cur].coverage;
		if (!merged.empty()) {
			assert(cur != path.begin());
			mergeContigs(g, contigs, *(cur-1), *cur, merged, path);
		} else {
			merged = sequence(contigs, *cur);
		}
		++cur;
	}

	ostringstream header;
	header << merged.size() << ' ' << totalCoverage << ' ';
	pathToComment(header, g, path);
	return Contig(header.str(), merged);
}
Exemplo n.º 11
0
// Reads one FASTA record (an id line followed by one sequence line)
// from fHandler_ into r.
//
// @param r       record to fill; cleared first if it is not empty
// @param success set to true only when the function returns normally
// @param done    set to fHandler_->eof() on normal return (also stored
//                in is_done_); set to true before each end-of-input
//                throw
//
// Throws the int value 1, after printing a message to std::cerr, when
// the very first record does not start with '>' or when the input ends
// (a negative value is returned by get()) before the id line, the
// following line break, or the sequence line is complete.
//
// NOTE(review): only the characters up to the first '\n' or '\r' are
// appended to the sequence, so a multi-line sequence is not read in
// one call -- confirm this is intended.
// NOTE(review): the sequence loop stops at the first '\r' or '\n'
// without consuming any further line-break characters; with "\r\n"
// line ends the next call would start on the leftover '\n' -- confirm
// against the behaviour of fHandler_.
void FastaReader::NextRead(Sequence &r, bool &success, bool &done) {

    if (!r.empty())
        r.clear();
    int c;

    success = false;
    done = false;
    // On the first call only: pick off the leading '>' of the first record
    if(this->first_) {
        c = fHandler_->get();
        if(c != '>') {
            // Not at a '>' yet: let the handler skip ahead and try again
            c = fHandler_->getPastNewline();
            if(c < 0) {
               r.clear();success = false; done = true;
            }
        }
        // Still no '>' (this includes the end-of-input case above): give up
        if(c != '>') {
            std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
            throw 1;
        }
        assert_eq('>', c);
        first_ = false;
    }
    // Read to the end of the id line, appending every character other
    // than '>' to r.id()
    string& id = r.id();
    while(true) {
        c = fHandler_->get();
        if(c < 0) {
            // Input ended inside the id line
            r.clear(); success = false; done = true;
            std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
            throw 1;
        }
        if(c == '\n' || c == '\r') {
            // Break at end of line, after consuming all \r's, \n's
            while(c == '\n' || c == '\r') {
                c = fHandler_->get();
                if(c < 0) {
                    // Input ended right after the id line: no sequence
                    r.clear(); success = false; done = true;
                    std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
                    throw 1;
                }
            }
            break;
        }
        // '>' characters are never stored in the id
        if(c == '>')
            continue;
        id += c;
        //name.append(c);
    }
    // c now holds the first character of the sequence line
    BTDnaString& sbuf = r.fowardSeq();
    while(true){
        // Upper-case before the lookup in asc2dna.
        // NOTE(review): toupper runs before the c < 0 test below --
        // confirm get() only ever returns EOF as a negative value.
        c = toupper(c);
        if(c < 0){
            // Input ended before the sequence line was terminated
            r.clear(); success = false; done = true;
            std::cerr << "Error: reads file does not look like a FASTA file" << std::endl;
            throw 1;
        }
        else{
            // The first line-break character ends the sequence
            if(c == '\n' || c == '\r') {
                break;
            }
            sbuf.append(asc2dna[c]);
            c = fHandler_->get();
        }
    }
    success = true;
    done = fHandler_->eof();
    is_done_ = done;
}
Exemplo n.º 12
0
/** Read a single record. */
Sequence FastaReader::read(string& id, string& comment,
		char& anchor, string& q)
{
next_record:
	id.clear();
	comment.clear();
	anchor = 0;
	q.clear();

	// Discard comments.
	while (peek() == '#')
		ignoreLines(1);

	signed char recordType = peek();
	Sequence s;

	unsigned qualityOffset = 0;
	if (eof() || recordType == EOF || ftell(m_in) >= m_end) {
		string header;
		getline(header);

		return s;
	} else if (recordType == '>' || recordType == '@') {
		// Read the header.
		string header;
		getline(header);
		istringstream headerStream(header);
		headerStream >> recordType >> id >> ws;
		std::getline(headerStream, comment);

		// Ignore SAM headers.
		if (id.length() == 2 && isupper(id[0]) && isupper(id[1])
				&& comment.length() > 2 && comment[2] == ':')
			goto next_record;

		// Casava FASTQ format
		if (comment.size() > 3
				&& comment[1] == ':' && comment[3] == ':') {
			// read, chastity, flags, index: 1:Y:0:AAAAAA
			if (opt::chastityFilter && comment[2] == 'Y') {
				m_unchaste++;
				if (recordType == '@') {
					ignoreLines(3);
				} else {
					while (peek() != '>' && peek() != '#'
							&& ignoreLines(1))
						;
				}
				goto next_record;
			}
			if (id.size() > 2 && id.rbegin()[1] != '/') {
				// Add the read number to the ID.
				id += '/';
				id += comment[0];
			}
		}

		getline(s);
		if (recordType == '>') {
			// Read a multi-line FASTA record.
			string line;
			while (peek() != '>' && peek() != '#'
					&& getline(line))
				s += line;
			if (eof())
				clear();
		}

		if (recordType == '@') {
			char c = peek();
			if (c != '+') {
				die() << s << '\n' << header << '\n';
				string line;
				getline(line);
				die() << "expected `+' and saw ";
				if (eof())
					cerr << "end-of-file\n";
				else
					cerr << "`" << c << "' near\n"
					<< c << line << "\n";
				exit(EXIT_FAILURE);
			}
			ignoreLines(1);
			getline(q);
		} else
			q.clear();

		if (s.empty()) {
			die() << "sequence with ID `" << id << "' is empty\n";
			exit(EXIT_FAILURE);
		}

		if (s.length() < opt::minLength) {
			goto next_record;
		}

		bool colourSpace = isColourSpace(s);
		if (colourSpace && !isdigit(s[0])) {
			// The first character is the primer base. The second
			// character is the dibase read of the primer and the
			// first base of the sample, which is not part of the
			// assembly.
			assert(s.length() > 2);
			anchor = colourToNucleotideSpace(s[0], s[1]);
			s.erase(0, 2);
			q.erase(0, 1);
		}

		if (!q.empty())
			checkSeqQual(s, q);

		if (opt::trimMasked && !colourSpace) {
			// Removed masked (lower case) sequence at the beginning
			// and end of the read.
			size_t trimFront = 0;
			while (trimFront <= s.length() && islower(s[trimFront]))
				trimFront++;
			size_t trimBack = s.length();
			while (trimBack > 0 && islower(s[trimBack - 1]))
				trimBack--;
			s.erase(trimBack);
			s.erase(0, trimFront);
			if (!q.empty()) {
				q.erase(trimBack);
				q.erase(0, trimFront);
			}
		}
		if (flagFoldCase())
			transform(s.begin(), s.end(), s.begin(), ::toupper);

		qualityOffset = 33;
	} else {