Beispiel #1
0
void  MappedSegmentColCompareTest::createColArray()
{
  hal_size_t N = _ref->getSequenceLength();
  _colArray.clear();
  _colArray.resize(N);
  set<const Genome*> tgtSet;
  tgtSet.insert(_tgt);
  ColumnIteratorConstPtr colIt = _ref->getColumnIterator(&tgtSet, 0, 0, 
                                                         NULL_INDEX, false,
                                                         false, false, true);
  while (true)
  {
    const ColumnIterator::ColumnMap* colMap = colIt->getColumnMap();
    ColumnIterator::ColumnMap::const_iterator colMapIt = colMap->begin();
    vector<pair<hal_index_t, bool> > insertList;
    // Pass 1 find all homologies in target
    for (; colMapIt != colMap->end(); colMapIt++)
    {
      if (colMapIt->first->getGenome() == _tgt)
      {
        ColumnIterator::DNASet* dnaSet = colMapIt->second;
        ColumnIterator::DNASet::const_iterator dnaIt = dnaSet->begin();
        for (; dnaIt != dnaSet->end(); ++dnaIt)
        {
          DNAIteratorConstPtr dna = *dnaIt;
          insertList.push_back(
            pair<hal_index_t, bool>(dna->getArrayIndex(), dna->getReversed()));
        }
      }
      else
      {
        CuAssertTrue(_testCase, colMapIt->first->getGenome() == _ref);
      }
    }

    // Pass 2 update each reference position with all homologies found
    // in Pass 1
    for (colMapIt = colMap->begin(); colMapIt != colMap->end(); colMapIt++)
    {
      if (colMapIt->first->getGenome() == _ref)
      {
        ColumnIterator::DNASet* dnaSet = colMapIt->second;
        ColumnIterator::DNASet::const_iterator dnaIt = dnaSet->begin();
        for (; dnaIt != dnaSet->end(); ++dnaIt)
        {
          DNAIteratorConstPtr dna = *dnaIt;
          for (size_t insIdx = 0; insIdx < insertList.size(); ++insIdx)
          {
            _colArray[dna->getArrayIndex()].insert(insertList[insIdx]);
          }
        }
      }
    }   
    if (colIt->lastColumn())
    {
      break;
    }
    colIt->toRight();
  }
}
Beispiel #2
0
/** Write the alignment columns of a range of one sequence as MAF blocks.
 *
 * Columns are walked left to right over [startPosition,
 * startPosition + length - 1] of seq and appended to the current block;
 * whenever a column cannot be appended, the current block is written to
 * mafStream and a new block is started.  When _unique is set, only columns
 * for which isCanonicalOnRef() is true are emitted.  The MAF header is
 * written first unless _append is set.
 *
 * @param mafStream     Output stream receiving the MAF text.
 * @param alignment     Alignment stored in _alignment for the export.
 * @param seq           Sequence to iterate over; must not be NULL.
 * @param startPosition First position of the range (0 <= startPosition <
 *                      sequence length).
 * @param length        Number of positions to export; 0 means "through the
 *                      end of the sequence".
 * @param targets       Genome set passed to the column iterator.
 * @throws hal_exception if the requested range does not lie inside seq.
 */
void MafExport::convertSegmentedSequence(ostream& mafStream,
                                         AlignmentConstPtr alignment,
                                         const SegmentedSequence* seq,
                                         hal_index_t startPosition,
                                         hal_size_t length,
                                         const set<const Genome*>& targets)
{
  assert(seq != NULL);
  const hal_size_t seqLength = seq->getSequenceLength();
  // startPosition is signed: reject negatives explicitly, because casting a
  // negative value to unsigned and adding length can wrap around and slip
  // past the range check.  The length is compared against the remaining
  // sequence (no addition) so a huge length cannot overflow either.
  if (startPosition < 0 ||
      static_cast<hal_size_t>(startPosition) >= seqLength ||
      length > seqLength - static_cast<hal_size_t>(startPosition))
  {
    throw hal_exception("Invalid range specified for convertGenome");
  }
  if (length == 0)
  {
    // A zero length selects everything from startPosition to the end.
    length = seqLength - static_cast<hal_size_t>(startPosition);
  }
  if (length == 0)
  {
    // Defensive guard: cannot trigger once the range check above passed,
    // since startPosition < seqLength leaves at least one position.
    throw hal_exception("Cannot convert zero length sequence");
  }
  hal_index_t lastPosition = startPosition + (hal_index_t)(length - 1);

  _mafStream = &mafStream;
  _alignment = alignment;
  if (!_append)
  {
    writeHeader();
  }

  ColumnIteratorConstPtr colIt = seq->getColumnIterator(&targets,
                                                        _maxRefGap, 
                                                        startPosition,
                                                        lastPosition,
                                                        _noDupes,
                                                        _noAncestors);

  // Number of columns appended so far; 0 means no block has been
  // initialised yet.
  hal_size_t appendCount = 0;
  if (_unique == false || colIt->isCanonicalOnRef() == true)
  {
    _mafBlock.initBlock(colIt, _ucscNames, _printTree);
    assert(_mafBlock.canAppendColumn(colIt) == true);
    _mafBlock.appendColumn(colIt);
    ++appendCount;
  }
  size_t numBlocks = 0;
  while (colIt->lastColumn() == false)
  {
    colIt->toRight();
    if (_unique == false || colIt->isCanonicalOnRef() == true)
    {
      if (appendCount == 0)
      {
        // First accepted column: the block has not been set up yet.
        _mafBlock.initBlock(colIt, _ucscNames, _printTree);
        assert(_mafBlock.canAppendColumn(colIt) == true);
      }
      if (_mafBlock.canAppendColumn(colIt) == false)
      {
        // erase empty entries from the column.  helps when there are 
        // millions of sequences (ie from fastas with lots of scaffolds)
        if (numBlocks++ % 1000 == 0)
        {
          colIt->defragment();
        }
        // Flush the finished block, then start a new one at this column.
        if (appendCount > 0)
        {
          mafStream << _mafBlock << '\n';
        }
        _mafBlock.initBlock(colIt, _ucscNames, _printTree);
        assert(_mafBlock.canAppendColumn(colIt) == true);
      }
      _mafBlock.appendColumn(colIt);
      ++appendCount;
    }
  }
  // if nothing was ever added (seems to happen in corner case where
  // all columns violate unique), mafBlock ostream operator will crash
  // so we do following check
  if (appendCount > 0)
  {
    mafStream << _mafBlock << endl;
  }
}
Beispiel #3
0
/** Write every column of the alignment as MAF blocks.
 *
 * Each leaf genome below the alignment root is walked in turn with a column
 * iterator covering the whole genome.  A visit cache is carried from one
 * genome's iterator to the next so that columns already emitted while
 * walking an earlier genome are not emitted again.  The MAF header is
 * always written first.
 *
 * @param mafStream Output stream receiving the MAF text.
 * @param alignment Alignment whose leaf genomes are exported.
 */
void MafExport::convertEntireAlignment(ostream& mafStream,
                                       AlignmentConstPtr alignment)
{
    // Number of columns appended so far; 0 means no block has been
    // initialised yet.
    hal_size_t appendCount = 0;
    size_t numBlocks = 0;

    _mafStream = &mafStream;
    _alignment = alignment;

    writeHeader();

    // Load in all leaves from alignment
    vector<string> leafNames = alignment->getLeafNamesBelow(alignment->getRootName());
    vector<const Genome *> leafGenomes;
    for (hal_size_t i = 0; i < leafNames.size(); i++) {
        const Genome *genome = alignment->openGenome(leafNames[i]);
        assert(genome != NULL);
        leafGenomes.push_back(genome);
    }
    ColumnIterator::VisitCache visitCache;
    // Go through all the genomes one by one, and spit out any columns
    // they participate in that we haven't seen.
    for (hal_size_t i = 0; i < leafGenomes.size(); i++) {
        const Genome *genome = leafGenomes[i];
        ColumnIteratorConstPtr colIt = genome->getColumnIterator(NULL,
                                                                 0,
                                                                 0,
                                                                 NULL_INDEX,
                                                                 _noDupes,
                                                                 _noAncestors);
        colIt->setVisitCache(&visitCache);
        for (;;) {
            if (appendCount == 0) {
                _mafBlock.initBlock(colIt, _ucscNames, _printTree);
                assert(_mafBlock.canAppendColumn(colIt) == true);
            }
            if (_mafBlock.canAppendColumn(colIt) == false)
            {
                // erase empty entries from the column.  helps when there are 
                // millions of sequences (ie from fastas with lots of scaffolds)
                if (numBlocks++ % 1000 == 0)
                {
                    colIt->defragment();
                }
                // Flush the finished block, then start a new one here.
                if (appendCount > 0)
                {
                    mafStream << _mafBlock << '\n';
                }
                _mafBlock.initBlock(colIt, _ucscNames, _printTree);
                assert(_mafBlock.canAppendColumn(colIt) == true);
            }
            _mafBlock.appendColumn(colIt);
            appendCount++;

            if (colIt->lastColumn()) {
                // Have to break here because otherwise
                // colIt->toRight() will crash.
                break;
            }
            colIt->toRight();
        }
        // Copy over the updated visit cache information. This is a
        // deep copy, so it's slow, but necessary to preserve the
        // column iterator ownership of the visit cache.
        //
        // The copy is only made when another genome follows: after the
        // last genome nothing reads visitCache again, so copying there
        // would allocate PositionCache objects that are never used or
        // released.
        // NOTE(review): presumably setVisitCache() takes over the copies
        // made here for the next iterator -- confirm against ColumnIterator.
        if (i + 1 < leafGenomes.size()) {
            visitCache.clear();
            ColumnIterator::VisitCache *newVisitCache = colIt->getVisitCache();
            for(ColumnIterator::VisitCache::iterator it = newVisitCache->begin();
                it != newVisitCache->end(); ++it) {
                visitCache[it->first] = new PositionCache(*it->second);
            }
        }
    }

    // if nothing was ever added (seems to happen in corner case where
    // all columns violate unique), mafBlock ostream operator will crash
    // so we do following check
    if (appendCount > 0)
    {
        mafStream << _mafBlock << endl;
    }
}