void Liftover::liftInterval() { PositionMap posCacheMap; _colIt = _srcSequence->getColumnIterator(&_tgtSet, 0, _inStart, _inEnd - 1); while (true) { const ColumnMap* cMap = _colIt->getColumnMap(); for (ColumnMap::const_iterator i = cMap->begin(); i != cMap->end(); ++i) { if (i->first->getGenome() == _tgtGenome) { const DNASet* dSet = i->second; const Sequence* seq = i->first; // if we're not adding the column, don't bother keeping track hal_size_t paralogyFactor = _addDupeColumn ? dSet->size() : 0; SeqIndex seqIdx(seq, paralogyFactor); for (DNASet::const_iterator j = dSet->begin(); j != dSet->end(); ++j) { pair<PositionMap::iterator, bool> res = posCacheMap.insert(pair<SeqIndex, PositionCache*>(seqIdx, NULL)); if (res.second == true) { res.first->second = new PositionCache(); } res.first->second->insert((*j)->getArrayIndex()); } } } if (_colIt->lastColumn() == true) { break; } _colIt->toRight(); } PositionMap::iterator pcmIt; for (pcmIt = posCacheMap.begin(); pcmIt != posCacheMap.end(); ++pcmIt) { const Sequence* seq = pcmIt->first.first; _outParalogy = pcmIt->first.second; hal_size_t seqStart = seq->getStartPosition(); PositionCache* posCache = pcmIt->second; const IntervalSet* iSet = posCache->getIntervalSet(); _outName = seq->getName(); for (IntervalSet::const_iterator k = iSet->begin(); k != iSet->end(); ++k) { _outStart = k->second - seqStart; _outEnd = k->first + 1 - seqStart; writeBedLine(); } delete posCache; } }
static void printIndels(const Genome *refGenome, const set<const Genome *> targets, hal_size_t adjacentBases) { hal_size_t refLength = refGenome->getSequenceLength(); hal_size_t numSites = 0; ColumnIteratorPtr colIt = refGenome->getColumnIterator(&targets); // good flanking site PositionCache knownGoodSites; for (hal_index_t refPos = adjacentBases; refPos < refLength - adjacentBases; refPos++) { pair<indelType, hal_size_t> indel; indel = getIndel(refPos, refGenome, &targets); hal_index_t start = refPos - adjacentBases; hal_index_t end = refPos + adjacentBases; if (indel.first == INSERTION) { end += indel.second; } colIt->toSite(start, end, true); map<const Genome *, hal_index_t> prevPos; bool failedFiltering = false; hal_size_t step = 1; while (1) { hal_index_t refColPos = colIt->getReferenceSequencePosition() + colIt->getReferenceSequence()->getStartPosition(); if (refColPos == refPos && indel.first == DELETION) { // jump "step" bases -- i.e. past the deleted region step = indel.second + 1; } else if (refColPos == refPos && indel.first == INSERTION) { // don't enforce adjacency on insertion since we're skipping it prevPos.erase(refGenome); if (refGenome->getSequenceBySite(refPos) != refGenome->getSequenceBySite(refPos + indel.second)) { // Insertion crosses sequence end failedFiltering = true; break; } colIt->toSite(refPos + indel.second, end); continue; } else { step = 1; } const ColumnIterator::ColumnMap *colMap = colIt->getColumnMap(); if (!knownGoodSites.find(refColPos)) { if (!isStrictSingleCopy(colMap, &targets) || !isContiguous(colMap, &prevPos, step, refGenome) || !isNotAmbiguous(colMap) || (step != 1 && !deletionIsNotAmbiguous(colMap, &prevPos, refGenome))) { failedFiltering = true; if (indel.first == INSERTION) { // failed indel means that we don't have to check anywhere // inside the insertion, it will automatically fail refPos += indel.second; } break; } else { knownGoodSites.insert(refColPos); } } updatePrevPos(colMap, &prevPos); if 
(colIt->lastColumn()) { break; } colIt->toRight(); } if (indel.first != NONE && !failedFiltering) { if (indel.first == DELETION) { const Sequence *seq = refGenome->getSequenceBySite(refPos); cout << seq->getName() << "\t" << refPos - seq->getStartPosition() << "\t" << refPos - seq->getStartPosition() << "\tD\t" << indel.second << endl; } else { const Sequence *seq = refGenome->getSequenceBySite(refPos); assert(seq == refGenome->getSequenceBySite(refPos + indel.second)); cout << seq->getName() << "\t" << refPos - seq->getStartPosition() << "\t" << refPos + indel.second - seq->getStartPosition() << "\tI\t" << endl; refPos += indel.second; } } if (!failedFiltering) { numSites++; } } cout << "# num sites possible: " << numSites << endl; }
// Lifts the current BED line (_bedLine) over to the target genome and appends
// the resulting lines to `mappedBedLines`.
//
// Pass 1 walks the column iterator over [_bedLine._start, _bedLine._end - 1]
// and records the array index of every target-genome base, keeping
// non-reversed and reversed bases in separate per-sequence caches.
// Pass 2 converts each cache into intervals and emits one BED line per
// interval: all forward-strand results first, then all reverse-strand ones.
// Each cache is freed once its intervals have been emitted.
void ColumnLiftover::liftInterval(BedList& mappedBedLines) {
    PositionMap forwardHits;
    PositionMap reverseHits;

    _colIt = _srcSequence->getColumnIterator(&_tgtSet, 0, _bedLine._start, _bedLine._end - 1,
                                             !_traverseDupes, false, _bedLine._strand == '-');

    // Pass 1: gather target positions column by column, split by orientation.
    for (;;) {
        const ColumnMap* columns = _colIt->getColumnMap();
        for (ColumnMap::const_iterator entry = columns->begin();
             entry != columns->end(); ++entry) {
            const Sequence* tgtSeq = entry->first;
            if (tgtSeq->getGenome() != _tgtGenome) {
                continue;  // only the target genome is of interest
            }
            const DNASet* bases = entry->second;
            // Paralogy is not tracked here: the second key component is always 0.
            SeqIndex key(tgtSeq, 0);

            for (DNASet::const_iterator base = bases->begin();
                 base != bases->end(); ++base) {
                PositionMap& bucket =
                    ((*base)->getReversed() == false) ? forwardHits : reverseHits;
                // insert() leaves an existing entry untouched, so the cache
                // is allocated only the first time a key is seen.
                pair<PositionMap::iterator, bool> slot =
                    bucket.insert(pair<SeqIndex, PositionCache*>(key, NULL));
                if (slot.second) {
                    slot.first->second = new PositionCache();
                }
                slot.first->second->insert((*base)->getArrayIndex());
            }
        }
        if (_colIt->lastColumn()) {
            break;
        }
        _colIt->toRight();
    }

    // Pass 2: emit forward results, then reverse results. The two passes
    // differ only in the strand symbol written to the output line.
    PositionMap* const hitMaps[2] = {&forwardHits, &reverseHits};
    const char strandSymbols[2] = {'+', '-'};

    for (int which = 0; which < 2; ++which) {
        PositionMap& hits = *hitMaps[which];
        for (PositionMap::iterator hit = hits.begin(); hit != hits.end(); ++hit) {
            const Sequence* tgtSeq = hit->first.first;
            PositionCache* cache = hit->second;
            const hal_size_t seqStart = tgtSeq->getStartPosition();

            _outParalogy = hit->first.second;

            const IntervalSet* intervals = cache->getIntervalSet();
            for (IntervalSet::const_iterator iv = intervals->begin();
                 iv != intervals->end(); ++iv) {
                // Start from a copy of the input line so the remaining fields
                // carry over, then overwrite the coordinates.
                mappedBedLines.push_back(_bedLine);
                BedLine& lifted = mappedBedLines.back();
                lifted._blocks.clear();
                lifted._chrName = tgtSeq->getName();
                // `second` is used as the start and `first` as the last
                // position (presumably inclusive, hence the + 1).
                lifted._start = iv->second - seqStart;
                lifted._end = iv->first + 1 - seqStart;
                // An unstranded input stays unstranded.
                lifted._strand = _bedLine._strand == '.' ? '.' : strandSymbols[which];
                lifted._srcStart = NULL_INDEX; // not available from posMap
            }
            delete cache;
        }
    }
}