Exemple #1
0
void LodExtract::writeSequences(const Genome* inParent,
                                const vector<const Genome*>& inChildren)
{
  vector<const Genome*> inGenomes = inChildren;
  inGenomes.push_back(inParent);
  const Genome* outParent = _outAlignment->openGenome(inParent->getName());
  (void)outParent;
  assert(outParent != NULL && outParent->getNumBottomSegments() > 0);
  string buffer;

  for (hal_size_t i = 0; i < inGenomes.size(); ++i)
  {
    const Genome* inGenome = inGenomes[i];
    Genome* outGenome = _outAlignment->openGenome(inGenome->getName());
    if (inGenome == inParent || outGenome->getNumChildren() == 0)
    {   
      SequenceIteratorConstPtr inSeqIt = inGenome->getSequenceIterator();
      SequenceIteratorConstPtr end = inGenome->getSequenceEndIterator();
      for (; inSeqIt != end; inSeqIt->toNext())
      {
        const Sequence* inSequence = inSeqIt->getSequence();
        if (inSequence->getSequenceLength() > 0)
        {
          Sequence* outSequence = outGenome->getSequence(inSequence->getName());
          assert(outSequence != NULL);
          inSequence->getString(buffer);
          outSequence->setString(buffer);
        }
      }
    }
  }
}
Exemple #2
0
// Collect the number of top segments of each sequence in this genome and
// hand the list to dest->updateTopDimensions().
//
// A sequence is left out only when it has length zero AND dest has no
// sequence of that name.
void Genome::copyTopDimensions(Genome *dest) const
{
  vector<Sequence::UpdateInfo> topDims;
  SequenceIteratorConstPtr cur = getSequenceIterator();
  SequenceIteratorConstPtr last = getSequenceEndIterator();
  while (cur != last)
  {
    const Sequence* seq = cur->getSequence();
    // progressiveCactus creates 0-length sequences in ancestors,
    // which are not usually extractable and aren't important
    const bool skippable = seq->getSequenceLength() == 0 &&
                           dest->getSequence(seq->getName()) == NULL;
    if (!skippable)
    {
      topDims.push_back(Sequence::UpdateInfo(seq->getName(),
                                             seq->getNumTopSegments()));
    }
    cur->toNext();
  }
  dest->updateTopDimensions(topDims);
}
Exemple #3
0
// Build a Sequence::Info (name, length, top-segment count, bottom-segment
// count) for every sequence of this genome and pass the list to
// dest->setDimensions().
//
// The top count is forced to 0 when this genome has no parent name in its
// alignment, and the bottom count is forced to 0 when it has no child
// names.
void Genome::copyDimensions(Genome *dest) const
{
  const Alignment *alignment = getAlignment();
  SequenceIteratorConstPtr cur = getSequenceIterator();
  SequenceIteratorConstPtr last = getSequenceEndIterator();

  const bool isRoot = alignment->getParentName(getName()).empty();
  const bool isLeaf = alignment->getChildNames(getName()).empty();

  vector<Sequence::Info> dims;
  while (cur != last)
  {
    const Sequence* seq = cur->getSequence();
    dims.push_back(Sequence::Info(seq->getName(),
                                  seq->getSequenceLength(),
                                  isRoot ? 0 : seq->getNumTopSegments(),
                                  isLeaf ? 0 : seq->getNumBottomSegments()));
    cur->toNext();
  }
  dest->setDimensions(dims);
}
Exemple #4
0
// Write one tab-separated line per sequence of the named genome:
//   <sequence name> <TAB> 0 <TAB> <sequence length>
// followed by a final end-of-line (which also flushes the stream).
//
// os         - stream that receives the output.
// alignment  - alignment in which the genome is looked up.
// genomeName - name of the genome to report on.
//
// Throws hal_exception when the genome cannot be opened.
void printBedSequenceStats(ostream& os, AlignmentConstPtr alignment, 
                           const string& genomeName)
{
  const Genome* genome = alignment->openGenome(genomeName);
  if (genome == NULL)
  {
    throw hal_exception("Genome " + genomeName + " not found.");
  }

  // Iterators are only requested when the genome has sequences.
  if (genome->getNumSequences() > 0)
  {
    SequenceIteratorConstPtr cur = genome->getSequenceIterator();
    SequenceIteratorConstPtr last = genome->getSequenceEndIterator();
    while (!cur->equals(last))
    {
      os << cur->getSequence()->getName() << "\t";
      os << 0 << "\t";
      os << cur->getSequence()->getSequenceLength() << "\n";
      cur->toNext();
    }
  }
  os << endl;
}
Exemple #5
0
void LodExtract::writeDimensions(
  const map<const Sequence*, hal_size_t>& segmentCounts, 
  const string& parentName,
  const vector<string>& childNames)
{
  // initialize a dimensions list for each (input) genome
  map<const Genome*, vector<Sequence::Info> > dimMap;
  map<const Genome*, vector<Sequence::Info> >::iterator dimMapIt;
  vector<string> newGenomeNames = childNames;
  newGenomeNames.push_back(parentName);
 
  for (size_t i = 0; i < newGenomeNames.size(); ++i)
  {
    const Genome* inGenome = _inAlignment->openGenome(newGenomeNames[i]);
    pair<const Genome*, vector<Sequence::Info> > newEntry;
    newEntry.first = inGenome;
    
    // it's important we keep the sequences in the output genome
    // in the same order as the sequences in the input genome since
    // we always use global coordinates!
    SequenceIteratorConstPtr seqIt = inGenome->getSequenceIterator();
    SequenceIteratorConstPtr seqEnd = inGenome->getSequenceEndIterator();
    for (; seqIt != seqEnd; seqIt->toNext())
    {
      const Sequence* inSequence = seqIt->getSequence();
      map<const Sequence*, hal_size_t>::const_iterator segMapIt;
      segMapIt = segmentCounts.find(inSequence);
      // we skip empty sequences for now with below check
      if (segMapIt != segmentCounts.end())
      {
        vector<Sequence::Info>& segDims = newEntry.second;
        hal_size_t nTop = 
           inGenome->getName() == parentName ? 0 : segMapIt->second;
        hal_size_t nBot = 
           inGenome->getName() != parentName ? 0 : segMapIt->second;
        segDims.push_back(Sequence::Info(inSequence->getName(),
                                         inSequence->getSequenceLength(),
                                         nTop,
                                         nBot));
      }
    }

    // note potential bug here for genome with no data
    dimMap.insert(newEntry);
  }
  
  // now that we have the dimensions for each genome, update them in
  // the output alignment
  for (dimMapIt = dimMap.begin(); dimMapIt != dimMap.end(); ++dimMapIt)
  {
    Genome* newGenome = _outAlignment->openGenome(dimMapIt->first->getName());
    assert(newGenome != NULL);
    vector<Sequence::Info>& segDims = dimMapIt->second;
    // ROOT 
    if (newGenome->getName() == _outAlignment->getRootName())
    {
      assert(newGenome->getName() == parentName);
      newGenome->setDimensions(segDims, _keepSequences);
    }
    // LEAF
    else if (newGenome->getName() != parentName)
    {
      newGenome->setDimensions(segDims, _keepSequences);
    }
    // INTERNAL NODE
    else
    {
      vector<Sequence::UpdateInfo> updateInfo;
      for (size_t i = 0; i < segDims.size(); ++i)
      {
        updateInfo.push_back(
          Sequence::UpdateInfo(segDims[i]._name,
                               segDims[i]._numBottomSegments));
      }
      newGenome->updateBottomDimensions(updateInfo);
    }
  }
}
// Check the internal consistency of a genome.
//
// For every sequence this runs validateSequence() and, when the sequence is
// non-empty, verifies that both its first and its last position map back to
// that same sequence via getSequenceBySite(). Afterwards the per-sequence
// totals of length, top segments and bottom segments are compared with the
// values reported by the genome itself, a non-empty genome is required to
// have at least one segment, and validateDuplications() is run.
//
// genome - the genome to validate.
//
// Throws hal_exception describing the first inconsistency found.
void hal::validateGenome(const Genome* genome)
{
  // first we check the sequence coverage
  hal_size_t totalTop = 0;
  hal_size_t totalBottom = 0;
  hal_size_t totalLength = 0;
  
  SequenceIteratorConstPtr seqIt = genome->getSequenceIterator();
  SequenceIteratorConstPtr seqEnd = genome->getSequenceEndIterator();
  for (; seqIt != seqEnd; seqIt->toNext())
  {
    const Sequence* sequence = seqIt->getSequence();
    validateSequence(sequence);

    totalTop += sequence->getNumTopSegments();
    totalBottom += sequence->getNumBottomSegments();
    totalLength += sequence->getSequenceLength();

    // make sure it doesn't overlap any other sequences;
    if (sequence->getSequenceLength() > 0)
    {
      // the first base of the sequence must resolve to this sequence
      const Sequence* s1 =
         genome->getSequenceBySite(sequence->getStartPosition());

      if (s1 == NULL || s1->getName() != sequence->getName())
      {
        stringstream ss;
        ss << "Sequence " << sequence->getName() << " has a bad overlap in "
           << genome->getName();
        throw hal_exception(ss.str());
      }
      // ... and so must its last base
      const Sequence* s2 = 
         genome->getSequenceBySite(sequence->getStartPosition() +
                                   sequence->getSequenceLength() - 1);
      if (s2 == NULL || s2->getName() != sequence->getName())
      {
        stringstream ss;
        ss << "Sequence " << sequence->getName() << " has a bad overlap in "
           << genome->getName();
        throw hal_exception(ss.str());
      }
    }
  }

  // the genome-level counts must agree with the per-sequence totals
  hal_size_t genomeLength = genome->getSequenceLength();
  hal_size_t genomeTop = genome->getNumTopSegments();
  hal_size_t genomeBottom = genome->getNumBottomSegments();

  if (genomeLength != totalLength)
  {
    stringstream ss;
    ss << "Problem: genome has length " << genomeLength 
       << " But sequences total " << totalLength;
    throw hal_exception(ss.str());
  }
  if (genomeTop != totalTop)
  {
    stringstream ss;
    ss << "Problem: genome has " << genomeTop << " top segments but "
       << "sequences have " << totalTop << " top segments";
    // throw the same exception type as every other check so that callers
    // catching hal_exception see this failure too
    throw hal_exception(ss.str());
  }
  if (genomeBottom != totalBottom)
  {
    stringstream ss;
    ss << "Problem: genome has " << genomeBottom << " bottom segments but "
       << "sequences have " << totalBottom << " bottom segments";
    throw hal_exception(ss.str());
  }

  if (genomeLength > 0 && genomeTop == 0 && genomeBottom == 0)
  {
    stringstream ss;
    ss << "Problem: genome " << genome->getName() << " has length " 
       << genomeLength << " but no segments";
    throw hal_exception(ss.str());
  }
  
  validateDuplications(genome);
}
Exemple #7
0
// Copy the bottom segments of this genome onto the bottom segments of dest.
//
// Sequences are matched by name. For each bottom segment the start position
// is translated from this genome's coordinates into dest's coordinates
// (offset within the sequence + start of the matching dest sequence), then
// the length, the child index / reversed flag of every child that exists in
// both genomes, and the top parse index are copied.
//
// dest - genome that receives the segments. It is expected to have the same
//        number of bottom segments (asserted in debug builds, and checked
//        per sequence at run time).
//
// Throws hal_exception when a sequence of this genome is missing in dest,
// when the per-sequence bottom segment counts differ, or when a translated
// start position does not fall inside the expected dest sequence.
void Genome::copyBottomSegments(Genome *dest) const
{
  assert(getNumBottomSegments() == dest->getNumBottomSegments());
  hal_size_t inNc = getNumChildren();
  hal_size_t outNc = dest->getNumChildren();

  // The child indices aren't consistent across files--make sure each bottom
  // segment points to the correct children
  vector<string> outChildNames;
  for (hal_size_t outChild = 0; outChild < outNc; ++outChild)
  {
    outChildNames.push_back(dest->getChild(outChild)->getName());
  }
  // inChildToOutChild[i] is the index in dest of this genome's child i,
  // or outNc when dest has no child of that name.
  vector<hal_size_t> inChildToOutChild(inNc, outNc);
  for (hal_size_t inChild = 0; inChild < inNc; ++inChild)
  {
    const string inChildName = getChild(inChild)->getName();
    for (hal_size_t outChild = 0; outChild < outNc; ++outChild)
    {
      if (inChildName == outChildNames[outChild])
      {
        inChildToOutChild[inChild] = outChild;
        break;
      }
    }
  }

  // Go through each sequence in this genome, find the matching
  // sequence in the dest genome, then copy over the segments for each
  // sequence.
  SequenceIteratorConstPtr seqIt = getSequenceIterator();
  SequenceIteratorConstPtr seqEndIt = getSequenceEndIterator();

  for (; seqIt != seqEndIt; seqIt->toNext())
  {
    const Sequence *inSeq = seqIt->getSequence();
    const Sequence *outSeq = dest->getSequence(inSeq->getName());
    if (outSeq == NULL)
    {
      // Report a missing destination sequence instead of dereferencing a
      // null pointer below.
      stringstream ss;
      ss << "When copying bottom segments: sequence " << inSeq->getName() << " of genome " << getName() << " was not found in genome " << dest->getName();
      throw hal_exception(ss.str());
    }
    BottomSegmentIteratorPtr inBot = inSeq->getBottomSegmentIterator();
    BottomSegmentIteratorPtr outBot = outSeq->getBottomSegmentIterator();

    if (inSeq->getName() != outSeq->getName()) {
      // This check is important enough that it can't be an assert.
      stringstream ss;
      ss << "When copying bottom segments: segment #" << inBot->getArrayIndex() << " of source genome is from sequence " << inBot->getSequence()->getName() << ", but segment #" << outBot->getArrayIndex() << " is from sequence " << outBot->getSequence()->getName();
      throw hal_exception(ss.str());
    }

    if (inSeq->getNumBottomSegments() != outSeq->getNumBottomSegments()) {
      stringstream ss;
      ss << "When copying bottom segments: sequence " << inSeq->getName() << " has " << inSeq->getNumBottomSegments() << " in genome " << getName() << ", while it has " << outSeq->getNumBottomSegments() << " in genome " << dest->getName();
      throw hal_exception(ss.str());      
    }

    // One past the array index of the last bottom segment of this sequence.
    hal_index_t inSegmentEnd = inSeq->getBottomSegmentArrayIndex() + inSeq->getNumBottomSegments();
    for (; inBot->getArrayIndex() < inSegmentEnd; inBot->toRight(),
           outBot->toRight())
    {
      // Same offset within the sequence, expressed in dest's coordinates.
      hal_index_t outStartPosition = inBot->getStartPosition() - inSeq->getStartPosition() + outSeq->getStartPosition();

      // getSequenceBySite() may find no sequence at all, so the result is
      // checked for NULL before its name is used in the error message.
      const Sequence *siteSeq = dest->getSequenceBySite(outStartPosition);
      if (siteSeq != outSeq) {
        stringstream ss;
        ss << "When copying bottom segments from " << getName() << " to " << dest->getName() << ": expected destination sequence " << outSeq->getName() << " for segment # " << inBot->getArrayIndex() << " but got ";
        if (siteSeq == NULL) {
          ss << "no sequence at position " << outStartPosition;
        } else {
          ss << siteSeq->getName();
        }
        throw hal_exception(ss.str());
      }
      outBot->setCoordinates(outStartPosition, inBot->getLength());
      for (hal_size_t inChild = 0; inChild < inNc; ++inChild) {
        hal_size_t outChild = inChildToOutChild[inChild];
        // children that do not exist in dest are skipped
        if (outChild != outNc) {
          outBot->setChildIndex(outChild, inBot->getChildIndex(inChild));
          outBot->setChildReversed(outChild, inBot->getChildReversed(inChild));
        }
      }
      outBot->setTopParseIndex(inBot->getTopParseIndex());
    }
  }
}