int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hitList, RecordKeyVector &resultList) { bool deleteKeyBlocks = false; if (keyList.empty()) { //get all the blocks for the query record, put them in it's list. getBlocks(keyList, deleteKeyBlocks); } _overlapBases.clear(); int keyBlocksSumLength = getTotalBlockLength(keyList); //Loop through every database record the query intersected with for (RecordKeyVector::const_iterator_type hitListIter = hitList.begin(); hitListIter != hitList.end(); hitListIter = hitList.next()) { RecordKeyVector hitBlocks(*hitListIter); bool deleteHitBlocks = false; getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record. int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total length of the bocks for the hitRecord. int totalHitOverlap = 0; bool hitHasOverlap = false; //loop through every block of the database record. for (RecordKeyVector::const_iterator_type hitBlockIter = hitBlocks.begin(); hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { //loop through every block of the query record. 
for (RecordKeyVector::const_iterator_type keyListIter = keyList.begin(); keyListIter != keyList.end(); keyListIter = keyList.next()) { const Record *keyBlock = *keyListIter; const Record *hitBlock = *hitBlockIter; int maxStart = max(keyBlock->getStartPos(), hitBlock->getStartPos()); int minEnd = min(keyBlock->getEndPos(), hitBlock->getEndPos()); int overlap = minEnd - maxStart; if (overlap > 0) { hitHasOverlap = true; totalHitOverlap += overlap; } } } if (hitHasOverlap) { if ((float) totalHitOverlap / (float)keyBlocksSumLength >= _overlapFraction) { if (_hasReciprocal && ((float)totalHitOverlap / (float)hitBlockSumLength >= _overlapFraction)) { _overlapBases.push_back(totalHitOverlap); resultList.push_back(*hitListIter); } else if (!_hasReciprocal) { _overlapBases.push_back(totalHitOverlap); resultList.push_back(*hitListIter); } } } if (deleteHitBlocks) { deleteBlocks(hitBlocks); } } if (deleteKeyBlocks) { deleteBlocks(keyList); } resultList.setKey(keyList.getKey()); return (int)resultList.size(); }
// Splits the BED12 key record of keyList into one record per block and appends
// those records to keyList.
//
// keyList    - its key is treated as a Bed12Interval; block records are appended.
// mustDelete - set to true when block records were allocated here, false when
//              the key reports a block count of zero or less (nothing appended).
//
// Terminates the process with exit(-1) if the block count disagrees with the
// number of comma-separated block sizes or block starts.
void BlockMgr::getBlocksFromBed12(RecordKeyVector &keyList, bool &mustDelete) {
    const Bed12Interval *bed12 = static_cast<const Bed12Interval *>(keyList.getKey());
    const int numBlocks = bed12->getBlockCount();
    if (numBlocks <= 0) {
        mustDelete = false;
        return;
    }

    const int numSizes = _blockSizeTokens.tokenize(bed12->getBlockSizes(), ',');
    const int numStarts = _blockStartTokens.tokenize(bed12->getBlockStarts(), ',');
    const bool countsAgree = (numBlocks == numSizes) && (numSizes == numStarts);
    if (!countsAgree) {
        fprintf(stderr, "Error: found wrong block counts while splitting entry.\n");
        exit(-1);
    }

    for (int blockIdx = 0; blockIdx < numBlocks; blockIdx++) {
        // Block starts are offsets added to the record's own start position.
        int blockStart = bed12->getStartPos() + str2chrPos(_blockStartTokens.getElem(blockIdx).c_str());
        int blockEnd = blockStart + str2chrPos(_blockSizeTokens.getElem(blockIdx).c_str());
        keyList.push_back(allocateAndAssignRecord(bed12, blockStart, blockEnd));
    }
    mustDelete = true;
}
// For every database, gathers the records that intersect the current query
// record into retList: first from that database's cache, then by reading the
// database forward until its current record is after the query, is on a
// different chromosome, or is NULL.
//
// Databases for which dbFinished() or chromChange() returns true are skipped.
void NewChromSweep::masterScan(RecordKeyVector &retList) {
    for (int dbIdx = 0; dbIdx < _numDBs; dbIdx++) {
        if (dbFinished(dbIdx) || chromChange(dbIdx, retList, true)) {
            continue;
        }

        // Hits among records that were read for earlier queries.
        scanCache(dbIdx, retList);

        // Advance the database until it is ahead of the query, recording hits
        // and updating the cache along the way.
        while (_currDbRecs[dbIdx] != NULL
               && _currQueryRec->sameChrom(_currDbRecs[dbIdx])
               && !(_currDbRecs[dbIdx]->after(_currQueryRec))) {
            if (intersects(_currQueryRec, _currDbRecs[dbIdx])) {
                retList.push_back(_currDbRecs[dbIdx]);
            }
            if (_currQueryRec->after(_currDbRecs[dbIdx])) {
                // The query is already after this record: hand it back to the
                // file record manager instead of caching it.
                _dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]);
            } else {
                // Otherwise keep it in the cache for later scans.
                _caches[dbIdx].push_back(_currDbRecs[dbIdx]);
            }
            _currDbRecs[dbIdx] = NULL;
            nextRecord(false, dbIdx);
        }
    }
}
// When the multi-database mode is ALL_DBS and more than one database is in
// use, narrows retList (and the parallel _finalDistances) to the entries whose
// absolute distance is the smallest, then applies the tie mode:
//   FIRST_TIE - keep only the first of those entries,
//   LAST_TIE  - keep only the last of those entries,
//   otherwise - keep all of them.
// In every other configuration retList and _finalDistances are left untouched.
//
// Uses _copyRetList and _copyDists as scratch storage.
void CloseSweep::checkMultiDbs(RecordKeyVector &retList) {
    ContextClosest::tieModeType tieMode = _context->getTieMode();
    if (!(_context->getMultiDbMode() == ContextClosest::ALL_DBS && _numDBs > 1)) {
        return;
    }

    _copyDists.clear();
    _copyRetList.clearAll();
    _copyRetList.setKey(retList.getKey());

    // Pass 1: smallest absolute distance over all recorded distances.
    int closest = INT_MAX;
    for (int k = 0; k < (int)_finalDistances.size(); k++) {
        int magnitude = abs(_finalDistances[k]);
        if (magnitude < closest) {
            closest = magnitude;
        }
    }

    // Pass 2: copy out every record sitting at that distance. The n-th record
    // of retList is paired with the n-th entry of _finalDistances.
    int distIdx = 0;
    for (RecordKeyVector::const_iterator_type recIt = retList.begin(); recIt != retList.end(); recIt++, distIdx++) {
        int signedDist = _finalDistances[distIdx];
        if (abs(signedDist) == closest) {
            _copyDists.push_back(signedDist);
            _copyRetList.push_back(*recIt);
        }
    }

    retList.clearVector();
    _finalDistances.clear();
    if (_copyRetList.empty()) {
        return;
    }

    if (tieMode == ContextClosest::FIRST_TIE) {
        retList.push_back(*(_copyRetList.begin()));
        _finalDistances.push_back(_copyDists[0]);
        return;
    }
    if (tieMode == ContextClosest::LAST_TIE) {
        retList.push_back(*(_copyRetList.begin() + _copyRetList.size() - 1));
        _finalDistances.push_back(_copyDists[_copyDists.size() - 1]);
        return;
    }
    // Any other tie mode: report every closest record.
    retList = _copyRetList;
    _finalDistances = _copyDists;
}
// Splits the BAM key record of keyList into block records according to its
// CIGAR operations and appends them to keyList.
//
//   M, X, =     extend the current block.
//   I, S, P, H  are ignored.
//   D, N        extend the current block unless the matching option
//               (_breakOnDeletionOps for D, _breakOnSkipOps for N) is set, in
//               which case the current block is emitted and the next block
//               starts after the operation.
// Any other operation prints an error and terminates the process with exit(1).
//
// mustDelete is always set to true on return.
void BlockMgr::getBlocksFromBam(RecordKeyVector &keyList, bool &mustDelete) {
    const BamRecord *bamRec = static_cast<const BamRecord *>(keyList.getKey());
    const vector<BamTools::CigarOp> &cigarOps = bamRec->getCigarData();

    int blockStart = bamRec->getStartPos();
    int blockSpan = 0;

    for (vector<BamTools::CigarOp>::const_iterator op = cigarOps.begin(); op != cigarOps.end(); ++op) {
        const char opType = op->Type;
        const int opLen = (int)(op->Length);

        switch (opType) {
            case 'I':
            case 'S':
            case 'P':
            case 'H':
                break;

            case 'M':
            case 'X':
            case '=':
                blockSpan += opLen;
                break;

            case 'D':
            case 'N': {
                const bool breakHere = (opType == 'D') ? _breakOnDeletionOps : _breakOnSkipOps;
                if (!breakHere) {
                    blockSpan += opLen;
                } else {
                    // Note: the block is emitted even when blockSpan is 0,
                    // e.g. for two breaking operations in a row.
                    keyList.push_back(allocateAndAssignRecord(bamRec, blockStart, blockStart + blockSpan));
                    blockStart += opLen + blockSpan;
                    blockSpan = 0;
                }
                break;
            }

            default:
                fprintf(stderr, "ERROR: Found invalid Cigar operation: %c.\n", opType);
                exit(1);
                break;
        }
    }

    // Emit whatever is left after the last operation.
    if (blockSpan > 0) {
        keyList.push_back(allocateAndAssignRecord(bamRec, blockStart, blockStart + blockSpan));
    }
    mustDelete = true;
}
// Builds the next group of records in hits.
//
// The group starts with _prevRecord, which becomes both the key of hits and its
// first element, and grows with each following record for which canGroup()
// returns true. The first record that cannot be grouped (or NULL when
// getNextRecord() returns NULL) is stored in _prevRecord for the next call.
//
// Returns false, leaving hits untouched, when _prevRecord is NULL on entry;
// true otherwise.
bool GroupBy::findNext(RecordKeyVector &hits) {
    if (_prevRecord == NULL) {
        return false;
    }

    assignPrevFields();
    hits.setKey(_prevRecord);
    hits.push_back(_prevRecord);  // the key itself is a member of its group

    const Record *candidate = getNextRecord();
    while (candidate != NULL && canGroup(candidate)) {
        hits.push_back(candidate);
        candidate = getNextRecord();
    }

    // Either NULL or the first record of the next group.
    _prevRecord = candidate;
    return true;
}
// Appends the blocks of keyList's key record to keyList, dispatching on the
// key's record type.
//
// BED12 and BAM keys are delegated to getBlocksFromBed12() / getBlocksFromBam(),
// which also decide mustDelete. For any other type the key itself is appended
// as its only block and mustDelete is set to false.
void BlockMgr::getBlocks(RecordKeyVector &keyList, bool &mustDelete) {
    switch (keyList.getKey()->getType()) {
        case FileRecordTypeChecker::BED12_RECORD_TYPE:
            getBlocksFromBed12(keyList, mustDelete);
            return;

        case FileRecordTypeChecker::BAM_RECORD_TYPE:
            getBlocksFromBam(keyList, mustDelete);
            return;

        default:
            break;
    }

    // Not a blocked record type: the record is its own single block.
    keyList.push_back(keyList.getKey());
    mustDelete = false;
}
void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) { recListIterType cacheIter = _caches[dbIdx].begin(); while (cacheIter != _caches[dbIdx].end()) { const Record *cacheRec = cacheIter->value(); if (_currQueryRec->sameChrom(cacheRec) && !_currQueryRec->after(cacheRec)) { if (intersects(_currQueryRec, cacheRec)) { retList.push_back(cacheRec); } else if (cacheRec->after(_currQueryRec)) break; // cacheRec is after the query rec, stop scanning. cacheIter = _caches[dbIdx].next(); } else { cacheIter = _caches[dbIdx].deleteCurrent(); _dbFRMs[dbIdx]->deleteRecord(cacheRec); } } }
// Records one selected hit: appends rec to retList, appends currDist to
// _finalDistances, and increments hitsUsed by one.
void CloseSweep::addSingleRec(Record *rec, int currDist, int &hitsUsed, RecordKeyVector &retList) {
    retList.push_back(rec);
    _finalDistances.push_back(currDist);
    hitsUsed += 1;
}
// Subtracts the hit records from the key (query) record of hits and rewrites
// hits to describe what should be reported.
//
// On return, one of the following holds:
//   - _dontReport is true: the query must not be reported at all.
//   - hits holds only the key record: the query is reported unchanged
//     (no hits, or no hit met the subtraction criteria).
//   - hits holds newly allocated block records, one per remaining uncovered
//     stretch of the query, and _deleteTmpBlocks is set to true.
//
// hits - in: the key record plus the records that intersect it;
//        out: the records to report, as described above.
void SubtractFile::subtractHits(RecordKeyVector &hits) {
    if (hits.empty()) {
        // No intersection, nothing to subtract. Copy the key into hits as if
        // it were a self-intersection; this is only for reporting purposes.
        hits.push_back(hits.getKey());
        return;
    }

    if (upCast(_context)->getRemoveAll() && upCast(_context)->getSubtractFraction() == 0.0) {
        // Hits are not empty, so there is an intersection, and any
        // intersection at all suppresses the query.
        _dontReport = true;
        return;
    }

    // Track which bases of the query are covered by hits.
    Record *keyRec = hits.getKey();
    int keyStart = keyRec->getStartPos();
    int keyEnd = keyRec->getEndPos();
    int keyLen = keyEnd - keyStart;

    // One flag per query base: true means uncovered, false means covered.
    // All bases begin uncovered.
    vector<bool> keyBases(keyEnd - keyStart, true);

    // Cover the query bases under each hit that qualifies.
    bool basesRemoved = false;
    for (RecordKeyVector::iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
        Record *hitRec = *iter;
        int hitStart = hitRec->getStartPos();
        int hitEnd = hitRec->getEndPos();

        int startIdx = max(keyStart, hitStart) - keyStart;
        int endIdx = min(keyEnd, hitEnd) - keyStart;

        int coveredLen = endIdx - startIdx;
        float coveragePct = (float)coveredLen / (float)keyLen;

        // Erase the covered bases, but only when summed removal is requested
        // or this single hit covers at least the required fraction of the query.
        if (upCast(_context)->getRemoveSum() || coveragePct >= upCast(_context)->getSubtractFraction()) {
            std::fill(keyBases.begin() + startIdx, keyBases.begin() + endIdx, false);
            basesRemoved = true;
        }
    }

    if (!basesRemoved) {
        // Treat as if there were no intersection.
        hits.clearVector();
        hits.push_back(hits.getKey());
        return;
    } else if (upCast(_context)->getRemoveAll()) {
        _dontReport = true;
        return;
    }

    // With removeSum, suppress the query when the fraction of its bases
    // covered by the hits taken together exceeds the subtract fraction;
    // otherwise report it unchanged.
    if (upCast(_context)->getRemoveSum()) {
        // Each uncovered base contributes 1 to the sum.
        int numBasesUncovered = std::accumulate(keyBases.begin(), keyBases.end(), 0);
        float pctCovered = 1.0 - (float)numBasesUncovered / (float)(keyEnd - keyStart);
        if (pctCovered > upCast(_context)->getSubtractFraction()) {
            _dontReport = true;
            return;
        } else {
            hits.clearVector();
            hits.push_back(hits.getKey());
        }
        return;
    }

    // Turn each remaining stretch of uncovered bases into a block record.
    hits.clearVector();
    const int numBases = (int)keyBases.size();
    for (int i = 0; i < numBases; i++) {
        if (keyBases[i] == true) {
            int blockStart = keyStart + i;
            // Check the bound BEFORE indexing: a run that reaches the end of
            // the query would otherwise read one element past the vector.
            while (i < numBases && keyBases[i] == true) {
                i++;
            }
            int blockEnd = min(keyStart + i, keyEnd);
            hits.push_back(_tmpBlocksMgr->allocateAndAssignRecord(keyRec, blockStart, blockEnd));
        }
    }
    _deleteTmpBlocks = true;
}
void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) { // If there are actual overlaps, only report those, then stop. ContextClosest::tieModeType tieMode = _context->getTieMode(); const vector<const Record *> & overlapRecs = (*(_overlapRecs[dbIdx])); if (!overlapRecs.empty()) { if (tieMode == ContextClosest::FIRST_TIE) { retList.push_back(overlapRecs[0]); _finalDistances.push_back(0); } else if (tieMode == ContextClosest::LAST_TIE) { retList.push_back(overlapRecs[overlapRecs.size()-1]); _finalDistances.push_back(0); } else { for (int i=0; i < (int)overlapRecs.size(); i++) { retList.push_back(overlapRecs[i]); _finalDistances.push_back(0); } } return; } int upStreamDist = _minUpstreamDist[dbIdx]; int downStreamDist = _minDownstreamDist[dbIdx]; const vector<const Record *> & upRecs = (*(_minUpstreamRecs[dbIdx])); const vector<const Record *> & downRecs = (*(_minDownstreamRecs[dbIdx])); if (abs(upStreamDist) < abs(downStreamDist)) { if (tieMode == ContextClosest::FIRST_TIE) { retList.push_back(upRecs[0]); _finalDistances.push_back(upStreamDist); } else if (tieMode == ContextClosest::LAST_TIE) { retList.push_back(upRecs[upRecs.size()-1]); _finalDistances.push_back(upStreamDist); } else { for (int i=0; i < (int)upRecs.size(); i++) { retList.push_back(upRecs[i]); _finalDistances.push_back(upStreamDist); } } return; } if (abs(downStreamDist) < abs(upStreamDist)) { if (tieMode == ContextClosest::FIRST_TIE) { retList.push_back(downRecs[0]); _finalDistances.push_back(downStreamDist); } else if (tieMode == ContextClosest::LAST_TIE) { retList.push_back(downRecs[downRecs.size()-1]); _finalDistances.push_back(downStreamDist); } else { for (int i=0; i < (int)downRecs.size(); i++) { retList.push_back(downRecs[i]); _finalDistances.push_back(downStreamDist); } } return; } if (downStreamDist == upStreamDist) { if (tieMode == ContextClosest::FIRST_TIE) { retList.push_back(upRecs[0]); _finalDistances.push_back(upStreamDist); } else if (tieMode == 
ContextClosest::LAST_TIE) { retList.push_back(downRecs[downRecs.size()-1]); _finalDistances.push_back(downStreamDist); } else { for (int i=0; i < (int)upRecs.size(); i++) { retList.push_back(upRecs[i]); _finalDistances.push_back(upStreamDist); } for (int i=0; i < (int)downRecs.size(); i++) { retList.push_back(downRecs[i]); _finalDistances.push_back(downStreamDist); } } return; } }