void MappedSegmentColCompareTest::createColArray() { hal_size_t N = _ref->getSequenceLength(); _colArray.clear(); _colArray.resize(N); set<const Genome*> tgtSet; tgtSet.insert(_tgt); ColumnIteratorConstPtr colIt = _ref->getColumnIterator(&tgtSet, 0, 0, NULL_INDEX, false, false, false, true); while (true) { const ColumnIterator::ColumnMap* colMap = colIt->getColumnMap(); ColumnIterator::ColumnMap::const_iterator colMapIt = colMap->begin(); vector<pair<hal_index_t, bool> > insertList; // Pass 1 find all homologies in target for (; colMapIt != colMap->end(); colMapIt++) { if (colMapIt->first->getGenome() == _tgt) { ColumnIterator::DNASet* dnaSet = colMapIt->second; ColumnIterator::DNASet::const_iterator dnaIt = dnaSet->begin(); for (; dnaIt != dnaSet->end(); ++dnaIt) { DNAIteratorConstPtr dna = *dnaIt; insertList.push_back( pair<hal_index_t, bool>(dna->getArrayIndex(), dna->getReversed())); } } else { CuAssertTrue(_testCase, colMapIt->first->getGenome() == _ref); } } // Pass 2 update each reference position with all homologies found // in Pass 1 for (colMapIt = colMap->begin(); colMapIt != colMap->end(); colMapIt++) { if (colMapIt->first->getGenome() == _ref) { ColumnIterator::DNASet* dnaSet = colMapIt->second; ColumnIterator::DNASet::const_iterator dnaIt = dnaSet->begin(); for (; dnaIt != dnaSet->end(); ++dnaIt) { DNAIteratorConstPtr dna = *dnaIt; for (size_t insIdx = 0; insIdx < insertList.size(); ++insIdx) { _colArray[dna->getArrayIndex()].insert(insertList[insIdx]); } } } } if (colIt->lastColumn()) { break; } colIt->toRight(); } }
void MafExport::convertSegmentedSequence(ostream& mafStream, AlignmentConstPtr alignment, const SegmentedSequence* seq, hal_index_t startPosition, hal_size_t length, const set<const Genome*>& targets) { assert(seq != NULL); if (startPosition >= (hal_index_t)seq->getSequenceLength() || (hal_size_t)startPosition + length > seq->getSequenceLength()) { throw hal_exception("Invalid range specified for convertGenome"); } if (length == 0) { length = seq->getSequenceLength() - startPosition; } if (length == 0) { throw hal_exception("Cannot convert zero length sequence"); } hal_index_t lastPosition = startPosition + (hal_index_t)(length - 1); _mafStream = &mafStream; _alignment = alignment; if (!_append) { writeHeader(); } ColumnIteratorConstPtr colIt = seq->getColumnIterator(&targets, _maxRefGap, startPosition, lastPosition, _noDupes, _noAncestors); hal_size_t appendCount = 0; if (_unique == false || colIt->isCanonicalOnRef() == true) { _mafBlock.initBlock(colIt, _ucscNames, _printTree); assert(_mafBlock.canAppendColumn(colIt) == true); _mafBlock.appendColumn(colIt); ++appendCount; } size_t numBlocks = 0; while (colIt->lastColumn() == false) { colIt->toRight(); if (_unique == false || colIt->isCanonicalOnRef() == true) { if (appendCount == 0) { _mafBlock.initBlock(colIt, _ucscNames, _printTree); assert(_mafBlock.canAppendColumn(colIt) == true); } if (_mafBlock.canAppendColumn(colIt) == false) { // erase empty entries from the column. 
helps when there are // millions of sequences (ie from fastas with lots of scaffolds) if (numBlocks++ % 1000 == 0) { colIt->defragment(); } if (appendCount > 0) { mafStream << _mafBlock << '\n'; } _mafBlock.initBlock(colIt, _ucscNames, _printTree); assert(_mafBlock.canAppendColumn(colIt) == true); } _mafBlock.appendColumn(colIt); ++appendCount; } } // if nothing was ever added (seems to happen in corner case where // all columns violate unique), mafBlock ostream operator will crash // so we do following check if (appendCount > 0) { mafStream << _mafBlock << endl; } }
// Export every column of the whole alignment to mafStream, visiting each
// leaf genome in turn and using a shared visit cache so columns already
// emitted for an earlier genome are not emitted again.
void MafExport::convertEntireAlignment(ostream& mafStream,
                                       AlignmentConstPtr alignment) {
  hal_size_t appendCount = 0;  // columns appended across all genomes
  size_t numBlocks = 0;        // finished blocks (drives periodic defragment)
  _mafStream = &mafStream;
  _alignment = alignment;
  writeHeader();

  // Load in all leaves from alignment
  vector<string> leafNames = alignment->getLeafNamesBelow(alignment->getRootName());
  vector<const Genome *> leafGenomes;
  for (hal_size_t i = 0; i < leafNames.size(); i++) {
    const Genome *genome = alignment->openGenome(leafNames[i]);
    assert(genome != NULL);
    leafGenomes.push_back(genome);
  }

  // Shared across genomes so each iteration skips columns already visited.
  ColumnIterator::VisitCache visitCache;

  // Go through all the genomes one by one, and spit out any columns
  // they participate in that we haven't seen.
  for (hal_size_t i = 0; i < leafGenomes.size(); i++) {
    const Genome *genome = leafGenomes[i];
    ColumnIteratorConstPtr colIt = genome->getColumnIterator(NULL, 0, 0, NULL_INDEX,
                                                             _noDupes, _noAncestors);
    // Hand the accumulated cache to this genome's iterator before walking.
    colIt->setVisitCache(&visitCache);
    for (;;) {
      if (appendCount == 0) {
        // Nothing accepted yet: (re)start the block on this column.
        _mafBlock.initBlock(colIt, _ucscNames, _printTree);
        assert(_mafBlock.canAppendColumn(colIt) == true);
      }
      if (_mafBlock.canAppendColumn(colIt) == false) {
        // erase empty entries from the column. helps when there are
        // millions of sequences (ie from fastas with lots of scaffolds)
        if (numBlocks++ % 1000 == 0) {
          colIt->defragment();
        }
        // Flush the finished block before starting a new one.
        if (appendCount > 0) {
          mafStream << _mafBlock << '\n';
        }
        _mafBlock.initBlock(colIt, _ucscNames, _printTree);
        assert(_mafBlock.canAppendColumn(colIt) == true);
      }
      _mafBlock.appendColumn(colIt);
      appendCount++;
      if (colIt->lastColumn()) {
        // Have to break here because otherwise
        // colIt->toRight() will crash.
        break;
      }
      colIt->toRight();
    }

    // Copy over the updated visit cache information. This is a
    // deep copy, so it's slow, but necessary to preserve the
    // column iterator ownership of the visit cache.
    // NOTE(review): visitCache.clear() drops the PositionCache* values
    // without a visible delete here — presumably the iterator (or
    // PositionCache's owner) frees them, but that cannot be confirmed from
    // this file; verify against ColumnIterator's visit-cache contract.
    // The copies made for the final genome also have no visible owner.
    visitCache.clear();
    ColumnIterator::VisitCache *newVisitCache = colIt->getVisitCache();
    for (ColumnIterator::VisitCache::iterator it = newVisitCache->begin();
         it != newVisitCache->end(); it++) {
      visitCache[it->first] = new PositionCache(*it->second);
    }
  }

  // if nothing was ever added (seems to happen in corner case where
  // all columns violate unique), mafBlock ostream operator will crash
  // so we do following check
  if (appendCount > 0) {
    mafStream << _mafBlock << endl;
  }
}