int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hitList, RecordKeyVector &resultList) { bool deleteKeyBlocks = false; if (keyList.empty()) { //get all the blocks for the query record, put them in it's list. getBlocks(keyList, deleteKeyBlocks); } _overlapBases.clear(); int keyBlocksSumLength = getTotalBlockLength(keyList); //Loop through every database record the query intersected with for (RecordKeyVector::const_iterator_type hitListIter = hitList.begin(); hitListIter != hitList.end(); hitListIter = hitList.next()) { RecordKeyVector hitBlocks(*hitListIter); bool deleteHitBlocks = false; getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record. int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total length of the bocks for the hitRecord. int totalHitOverlap = 0; bool hitHasOverlap = false; //loop through every block of the database record. for (RecordKeyVector::const_iterator_type hitBlockIter = hitBlocks.begin(); hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { //loop through every block of the query record. 
for (RecordKeyVector::const_iterator_type keyListIter = keyList.begin(); keyListIter != keyList.end(); keyListIter = keyList.next()) { const Record *keyBlock = *keyListIter; const Record *hitBlock = *hitBlockIter; int maxStart = max(keyBlock->getStartPos(), hitBlock->getStartPos()); int minEnd = min(keyBlock->getEndPos(), hitBlock->getEndPos()); int overlap = minEnd - maxStart; if (overlap > 0) { hitHasOverlap = true; totalHitOverlap += overlap; } } } if (hitHasOverlap) { if ((float) totalHitOverlap / (float)keyBlocksSumLength >= _overlapFraction) { if (_hasReciprocal && ((float)totalHitOverlap / (float)hitBlockSumLength >= _overlapFraction)) { _overlapBases.push_back(totalHitOverlap); resultList.push_back(*hitListIter); } else if (!_hasReciprocal) { _overlapBases.push_back(totalHitOverlap); resultList.push_back(*hitListIter); } } } if (deleteHitBlocks) { deleteBlocks(hitBlocks); } } if (deleteKeyBlocks) { deleteBlocks(keyList); } resultList.setKey(keyList.getKey()); return (int)resultList.size(); }
/*
 * Writes the output lines for one query record and the hits stored in keyList.
 *
 * keyList - key is the query record; the elements are the hit records.
 * dists   - optional (may be NULL). When given, (*dists)[i] is printed after the
 *           i-th hit; the absolute value is used unless the context's
 *           signDistance() is true.
 *           NOTE(review): no bounds check is done here, so dists is assumed to
 *           hold at least one entry per hit - confirm against callers.
 *
 * With hits: one line per hit, made of the query, the database file id (via
 * addDbFileId) and the hit, followed by the distance if dists was supplied.
 * Without hits: the query followed by a null record, with a '.' placeholder
 * file id when the context reports more than two input files, and a -1
 * distance when the context's reportDistance() is true.
 *
 * For BAM query records the record's blocks are fetched into a local list that
 * _currBamBlockList points at while printing.
 */
void RecordOutputMgr::printClosest(RecordKeyVector &keyList, const vector<int> *dists)
{
    //The first time we print a record is when we print any header, because the header
    //hasn't been read from the query file until after the first record has also been read.
    checkForHeader();

    const ContextClosest *context = static_cast<const ContextClosest *>(_context);
    bool deleteBlocks = false;
    // Set once _currBamBlockList has been aimed at the local blockList below, so
    // the member can be cleared again before that local goes out of scope.
    bool blockListPublished = false;
    const Record *keyRec = keyList.getKey();
    RecordKeyVector blockList(keyRec);
    if (keyRec->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
        _bamBlockMgr->getBlocks(blockList, deleteBlocks);
        _currBamBlockList = &blockList;
        blockListPublished = true;
    }

    if (!keyList.empty()) {
        int distCount = 0;
        for (RecordKeyVector::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) {
            const Record *hitRec = *iter;
            printKey(keyRec, keyRec->getStartPosStr(), keyRec->getEndPosStr());
            tab();
            addDbFileId(hitRec->getFileIdx());
            printKey(hitRec, hitRec->getStartPosStr(), hitRec->getEndPosStr());
            if (dists != NULL) {
                tab();
                int dist = (*dists)[distCount];
                //if not using sign distance, use absolute value instead.
                dist = context->signDistance() ? dist : abs(dist);
                _outBuf.append(dist);
                distCount++;
            }
            newline();
            if (needsFlush()) flush();
        }
    } else {
        printKey(keyRec, keyRec->getStartPosStr(), keyRec->getEndPosStr());
        tab();
        // need to add a dummy file id if multiple DB files are used
        if (_context->getNumInputFiles() > 2) {
            _outBuf.append('.');
            tab();
        }
        null(false, true);
        if (context->reportDistance()) {
            tab();
            _outBuf.append(-1);
        }
        newline();
    }

    if (deleteBlocks) {
        _bamBlockMgr->deleteBlocks(blockList);
    }
    // Previously the member was only cleared when deleteBlocks was true. If
    // getBlocks() left that flag false, the member kept the address of the
    // stack-local blockList after this function returned.
    if (blockListPublished) {
        _currBamBlockList = NULL;
    }
}
/*
 * Writes the output lines for one query record and the hits stored in keyList.
 *
 * keyList - key is the query record; the elements are the hit records.
 * dists   - optional (may be NULL). When given, (*dists)[i] is appended after
 *           the i-th hit exactly as stored.
 *           NOTE(review): no bounds check is done here, so dists is assumed to
 *           hold at least one entry per hit - confirm against callers.
 *
 * With hits: one line per hit, made of the query, the database file id (via
 * addDbFileId) and the hit's own print() output, followed by the distance if
 * dists was supplied.
 * Without hits: the query followed by a null record, plus a -1 distance when
 * the context's reportDistance() is true.
 *
 * For BAM query records the record's blocks are fetched into a local list that
 * _currBamBlockList points at while printing.
 */
void RecordOutputMgr::printClosest(RecordKeyVector &keyList, const vector<int> *dists)
{
    const ContextClosest *context = static_cast<const ContextClosest *>(_context);
    bool deleteBlocks = false;
    // Set once _currBamBlockList has been aimed at the local blockList below, so
    // the member can be cleared again before that local goes out of scope.
    bool blockListPublished = false;
    RecordKeyVector blockList(keyList.getKey());
    if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
        _bamBlockMgr->getBlocks(blockList, deleteBlocks);
        _currBamBlockList = &blockList;
        blockListPublished = true;
    }

    if (!keyList.empty()) {
        int distCount = 0;
        for (RecordKeyVector::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) {
            printKey(keyList.getKey());
            tab();
            addDbFileId((*iter)->getFileIdx());
            (*iter)->print(_outBuf);
            if (dists != NULL) {
                tab();
                _outBuf.append((*dists)[distCount]);
                distCount++;
            }
            newline();
            if (needsFlush()) flush();
        }
    } else {
        printKey(keyList.getKey());
        tab();
        null(true, false);
        if (context->reportDistance()) {
            tab();
            _outBuf.append(-1);
        }
        newline();
    }

    if (deleteBlocks) {
        _bamBlockMgr->deleteBlocks(blockList);
    }
    // Previously the member was only cleared when deleteBlocks was true. If
    // getBlocks() left that flag false, the member kept the address of the
    // stack-local blockList after this function returned.
    if (blockListPublished) {
        _currBamBlockList = NULL;
    }
}
/*
 * Emits the output for one query record according to the running program.
 *
 * keyList   - key is the query record, elements are its hits.
 * blockList - stored in _currBamBlockList for the duration of the call; every
 *             handled program clears that member again before returning.
 *
 * INTERSECT:
 *   - not printable: reportOverlapSummary() is called.
 *   - printable with hits: printBamRecord() is tried first; unless it returns
 *     BAM_AS_BAM, reportOverlapDetail() is called once per hit with a running index.
 *   - printable without hits: with getWriteAllOverlap() the key, a null record
 *     and a '0' are written; otherwise with getLeftJoin() the key and a null
 *     record are written; otherwise nothing is written. In both writing cases
 *     the extra fields are skipped when printKeyAndTerminate() returns true.
 * SAMPLE:      the key is printed, plus a newline unless printKeyAndTerminate()
 *              returns true.
 * MAP, MERGE:  the key is printed through printKeyAndTerminate().
 * Any other program: nothing further happens and _currBamBlockList is left set.
 */
void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blockList)
{
    if (needsFlush()) {
        flush();
    }
    //The first time we print a record is when we print any header, because the header
    //hasn't been read from the query file until after the first record has also been read.
    checkForHeader();
    const_cast<Record *>(keyList.getKey())->undoZeroLength();
    _currBamBlockList = blockList;

    if (_context->getProgram() == ContextBase::INTERSECT) {
        if (!_printable) {
            reportOverlapSummary(keyList);
            _currBamBlockList = NULL;
            return;
        }

        if (!keyList.empty()) {
            // When the record went out as BAM there is nothing more to report.
            if (printBamRecord(keyList, true) != BAM_AS_BAM) {
                int hitNumber = 0;
                for (RecordKeyVector::const_iterator_type hit = keyList.begin();
                        hit != keyList.end(); hit = keyList.next()) {
                    reportOverlapDetail(keyList.getKey(), *hit, hitNumber);
                    hitNumber++;
                }
            }
            _currBamBlockList = NULL;
            return;
        }

        // Printable, but the query had no hits.
        ContextIntersect *intersectCtx = static_cast<ContextIntersect *>(_context);
        if (intersectCtx->getWriteAllOverlap()) {
            // -wao the user wants to force the reporting of 0 overlap
            if (!printKeyAndTerminate(keyList)) {
                tab();
                null(false, true);
                tab();
                _outBuf.append('0');
                newline();
                if (needsFlush()) flush();
            }
        } else if (intersectCtx->getLeftJoin()) {
            if (!printKeyAndTerminate(keyList)) {
                tab();
                null(false, true);
                newline();
                if (needsFlush()) flush();
            }
        }
        _currBamBlockList = NULL;
        return;
    }

    if (_context->getProgram() == ContextBase::SAMPLE) {
        if (!printKeyAndTerminate(keyList)) {
            newline();
        }
        _currBamBlockList = NULL;
        return;
    }

    if (_context->getProgram() == ContextBase::MAP
            || _context->getProgram() == ContextBase::MERGE) {
        printKeyAndTerminate(keyList);
        _currBamBlockList = NULL;
        return;
    }
}
/*
 * Replaces the hits of a query record with what remains of the query after the
 * hits have been subtracted from it.
 *
 * hits - on entry: key is the query record, elements are the overlapping records.
 *        on exit, one of:
 *          - the key itself as the only element (nothing was subtracted, or in
 *            removeSum mode the covered fraction did not exceed the subtract
 *            fraction),
 *          - newly allocated records, one per run of uncovered query bases
 *            (_deleteTmpBlocks is set to true in this case),
 *          - left as it was, with _dontReport set to true, when the query is
 *            not to be reported.
 */
void SubtractFile::subtractHits(RecordKeyVector &hits)
{
    if (hits.empty()) {
        // no intersection, nothing to subtract.
        // just copy key to hits as if it were a
        // self-intersection. This is just for reporting
        // purposes.
        hits.push_back(hits.getKey());
        return;
    }

    if (upCast(_context)->getRemoveAll() && upCast(_context)->getSubtractFraction() == 0.0) {
        // hits aren't empty, meaning there is intersection,
        // so we want to not report the hit.
        _dontReport = true;
        return;
    }

    //loop through hits. Track which bases in query were covered
    Record *keyRec = hits.getKey();
    int keyStart = keyRec->getStartPos();
    int keyEnd = keyRec->getEndPos();
    int keyLen = keyEnd - keyStart;

    //this vector of bools will represent the bases of the query.
    //for each base, true means uncovered, false means covered.
    //they begin as all uncovered.
    vector<bool> keyBases(keyLen, true);

    //now loop through the hits, and cover corresponding query bases
    //by setting them to false.
    bool basesRemoved = false;
    for (RecordKeyVector::iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
        Record *hitRec = *iter;
        int hitStart = hitRec->getStartPos();
        int hitEnd = hitRec->getEndPos();

        // Clip the hit to the query and express it as offsets into keyBases.
        int startIdx = max(keyStart, hitStart) - keyStart;
        int endIdx = min(keyEnd, hitEnd) - keyStart;
        int coveredLen = endIdx - startIdx;
        float coveragePct = (float)coveredLen / (float)keyLen;

        //for each base in the hit, set the base in the query to false.
        //this effectively "erases" the covered bits. Only do so when removeSum
        //is on, or when this hit alone covers at least the subtract fraction
        //of the query.
        if (upCast(_context)->getRemoveSum() || coveragePct >= upCast(_context)->getSubtractFraction()) {
            // A hit that does not actually overlap the query yields an inverted
            // range, which must not be handed to std::fill.
            if (startIdx < endIdx) {
                std::fill(keyBases.begin() + startIdx, keyBases.begin() + endIdx, false);
            }
            basesRemoved = true;
        }
    }

    if (!basesRemoved) {
        //treat as if there were no intersection
        hits.clearVector();
        hits.push_back(hits.getKey());
        return;
    } else if (upCast(_context)->getRemoveAll()) {
        _dontReport = true;
        return;
    }

    // if the -N option is used ( removeSum), do not report if the percentage of
    // uniquely covered bases exceeds the overlap fraction.
    if (upCast(_context)->getRemoveSum()) {
        //determine how many bases are left uncovered.
        int numBasesUncovered = std::accumulate(keyBases.begin(), keyBases.end(), 0);
        //determine percentage that are covered.
        float pctCovered = 1.0 - (float)numBasesUncovered / (float)(keyEnd - keyStart);
        if (pctCovered > upCast(_context)->getSubtractFraction()) {
            _dontReport = true;
            return;
        } else {
            hits.clearVector();
            hits.push_back(hits.getKey());
        }
        return;
    }

    //now make "blocks" out of the query's remaining stretches of
    //uncovered bases.
    hits.clearVector();
    for (int i = 0; i < (int)keyBases.size(); i++) {
        if (keyBases[i] == true) {
            int blockStart = keyStart + i;
            // The bounds test has to come first: when an uncovered run reaches
            // the end of the query, i becomes keyBases.size() and keyBases[i]
            // must not be evaluated.
            while (i < (int)keyBases.size() && keyBases[i] == true) {
                i++;
            }
            int blockEnd = min(keyStart + i, keyEnd);
            hits.push_back(_tmpBlocksMgr->allocateAndAssignRecord(keyRec, blockStart, blockEnd));
        }
    }
    _deleteTmpBlocks = true;
}