// Returns the number of bases the key record of `hits` shares with its hits.
//
// When the context obeys splits, the per-hit overlap is read from the
// context's split-block info (indexed by hit position); otherwise it is the
// span between the clipped start and end of each hit against the key.
// Also adds the number of hits to _numIntersections.
unsigned long Jaccard::getTotalIntersection(RecordKeyVector &hits) {
    Record *queryRec = hits.getKey();
    CHRPOS queryStart = queryRec->getStartPos();
    CHRPOS queryEnd = queryRec->getEndPos();

    unsigned long totalBases = 0;
    int splitIdx = 0;

    RecordKeyVector::iterator_type it = hits.begin();
    while (it != hits.end()) {
        Record *hit = *it;
        CHRPOS overlapStart = max(hit->getStartPos(), queryStart);
        CHRPOS overlapEnd = min(hit->getEndPos(), queryEnd);

        if (!_context->getObeySplits()) {
            totalBases += static_cast<unsigned long>(overlapEnd - overlapStart);
        } else {
            // Split-aware overlap was computed elsewhere; just look it up.
            totalBases += upCast(_context)->getSplitBlockInfo()->getOverlapBases(splitIdx);
            ++splitIdx;
        }
        it = hits.next();
    }

    _numIntersections += static_cast<int>(hits.size());
    return totalBases;
}
// Builds the per-base depth counts for the key (query) record of `hits`.
//
// Records the query's start offset and length, adds the length to the running
// total, grows the depth array when the query is longer than its current
// capacity, and then increments one slot per query base covered by each hit.
void CoverageFile::makeDepthCount(RecordKeyVector &hits) {
    const Record *queryRec = hits.getKey();
    _queryOffset = queryRec->getStartPos();
    _queryLen = static_cast<size_t>(queryRec->getEndPos() - _queryOffset);
    _totalQueryLen += _queryLen;

    // Grow (and zero) the depth array only when this query does not fit.
    // NOTE(review): the realloc result is not checked before use — confirm
    // how allocation failure should be reported in this project.
    const bool mustGrow = _depthArrayCapacity < _queryLen;
    if (mustGrow) {
        _depthArray = static_cast<size_t*>(realloc(_depthArray, sizeof(size_t) * _queryLen));
        _depthArrayCapacity = _queryLen;
        memset(_depthArray, 0, sizeof(size_t) * _depthArrayCapacity);
    }

    // Hits may come from several databases and so are not assumed to be
    // sorted; each one is clipped to the query and counted independently.
    RecordKeyVector::const_iterator_type it = hits.begin();
    while (it != hits.end()) {
        const Record *hit = *it;
        int hitStart = hit->getStartPos();
        int hitEnd = hit->getEndPos();
        int from = max(_queryOffset, hitStart);
        int to = min(hitEnd, queryRec->getEndPos());
        for (int pos = from; pos < to; ++pos) {
            ++_depthArray[pos - _queryOffset];
        }
        it = hits.next();
    }
}
int BlockMgr::findBlockedOverlaps(RecordKeyVector &hitList, bool useOverlappingSubBlocks) { RecordKeyVector keyList(hitList.getKey()); bool deleteKeyBlocks = true; getBlocks(keyList, deleteKeyBlocks); _overlapBases.clear(); int keyBlocksSumLength = getTotalBlockLength(keyList); //Loop through every database record the query intersected with RecordKeyVector::iterator_type hitListIter = hitList.begin(); for (; hitListIter != hitList.end();) { RecordKeyVector hitBlocks(*hitListIter); bool deleteHitBlocks = false; getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record. int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total length of the bocks for the hitRecord. int totalHitOverlap = 0; bool hitHasOverlap = false; //loop through every block of the database record. RecordKeyVector::iterator_type hitBlockIter = hitBlocks.begin(); for (; hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { //loop through every block of the query record. RecordKeyVector::iterator_type keyListIter = keyList.begin(); for (; keyListIter != keyList.end(); keyListIter = keyList.next()) { const Record *keyBlock = *keyListIter; const Record *hitBlock = *hitBlockIter; int maxStart = max(keyBlock->getStartPos(), hitBlock->getStartPos()); int minEnd = min(keyBlock->getEndPos(), hitBlock->getEndPos()); int overlap = minEnd - maxStart; if (overlap > 0) { hitHasOverlap = true; totalHitOverlap += overlap; if (useOverlappingSubBlocks == true) { (*hitListIter)->block_starts.push_back(maxStart); (*hitListIter)->block_ends.push_back(minEnd); } } } } if (hitHasOverlap && useOverlappingSubBlocks == false) { bool enoughKeyOverlap = (float) totalHitOverlap / (float) keyBlocksSumLength >= _overlapFraction; bool enoughHitOverlap = (float) totalHitOverlap / (float) hitBlockSumLength >= _overlapFraction; if (enoughKeyOverlap) { if (_hasReciprocal && enoughHitOverlap) { //(*hitListIter)->setValid(true); _overlapBases.push_back(totalHitOverlap); hitListIter = 
hitList.next(); } else if (_hasReciprocal && !enoughHitOverlap) { hitList.erase(); //(*hitListIter)->setValid(false); } else if (!_hasReciprocal) { //(*hitListIter)->setValid(true); _overlapBases.push_back(totalHitOverlap); hitListIter = hitList.next(); } } else { hitList.erase(); //(*hitListIter)->setValid(false); } } else if (!hitHasOverlap && useOverlappingSubBlocks == false) { hitList.erase(); //(*hitListIter)->setValid(false); } else { hitListIter = hitList.next(); } if (deleteHitBlocks) { deleteBlocks(hitBlocks); } } // end for loop through main hits if (deleteKeyBlocks) { deleteBlocks(keyList); } return (int)hitList.size(); }
// Hands every record held in `keyList` back to _blockRecordsMgr for deletion,
// then empties the list's vector. The key itself is not touched here.
void BlockMgr::deleteBlocks(RecordKeyVector &keyList) {
    RecordKeyVector::iterator_type blockIter = keyList.begin();
    while (blockIter != keyList.end()) {
        _blockRecordsMgr->deleteRecord(*blockIter);
        blockIter = keyList.next();
    }
    keyList.clearVector();
}
// Returns the summed length (end minus start) of every record in `keyList`.
int BlockMgr::getTotalBlockLength(RecordKeyVector &keyList) {
    int totalLen = 0;
    RecordKeyVector::iterator_type blockIter = keyList.begin();
    while (blockIter != keyList.end()) {
        const Record *block = *blockIter;
        totalLen += block->getEndPos() - block->getStartPos();
        blockIter = keyList.next();
    }
    return totalLen;
}
// Deletes every record in `recList` except the one that is also its key, then
// empties the list's vector. The key survives even if it appears in the list.
void FileRecordMergeMgr::deleteAllMergedItemsButKey(RecordKeyVector &recList) {
    RecordKeyVector::const_iterator_type recIter = recList.begin();
    while (recIter != recList.end()) {
        if (*recIter != recList.getKey()) {
            deleteRecord(*recIter);
        }
        recIter = recList.next();
    }
    recList.clearVector();
}
int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hitList, RecordKeyVector &resultList, RecordKeyVector *overlapList) { bool deleteKeyBlocks = false; if (keyList.empty()) { //get all the blocks for the query record, put them in it's list. getBlocks(keyList, deleteKeyBlocks); } _overlapBases.clear(); int keyBlocksSumLength = getTotalBlockLength(keyList); //Loop through every database record the query intersected with RecordKeyVector::iterator_type hitListIter = hitList.begin(); for (; hitListIter != hitList.end(); hitListIter = hitList.next()) { RecordKeyVector hitBlocks(*hitListIter); bool deleteHitBlocks = false; getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record. int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total length of the bocks for the hitRecord. int totalHitOverlap = 0; bool hitHasOverlap = false; //loop through every block of the database record. RecordKeyVector::iterator_type hitBlockIter = hitBlocks.begin(); for (; hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { //loop through every block of the query record. 
RecordKeyVector::iterator_type keyListIter = keyList.begin(); for (; keyListIter != keyList.end(); keyListIter = keyList.next()) { const Record *keyBlock = *keyListIter; const Record *hitBlock = *hitBlockIter; int maxStart = max(keyBlock->getStartPos(), hitBlock->getStartPos()); int minEnd = min(keyBlock->getEndPos(), hitBlock->getEndPos()); int overlap = minEnd - maxStart; if (overlap > 0) { hitHasOverlap = true; if (overlapList != NULL) { overlapList->push_back(allocateAndAssignRecord(keyList.getKey(), maxStart, minEnd)); } totalHitOverlap += overlap; } } } if (hitHasOverlap) { if ((float) totalHitOverlap / (float)keyBlocksSumLength >= _overlapFraction) { if (_hasReciprocal && ((float)totalHitOverlap / (float)hitBlockSumLength >= _overlapFraction)) { _overlapBases.push_back(totalHitOverlap); resultList.push_back(*hitListIter); } else if (!_hasReciprocal) { _overlapBases.push_back(totalHitOverlap); resultList.push_back(*hitListIter); } } } if (deleteHitBlocks) { deleteBlocks(hitBlocks); } } if (deleteKeyBlocks) { deleteBlocks(keyList); } resultList.setKey(keyList.getKey()); return (int)resultList.size(); }
// Writes the output for one key record and its hits, according to the
// program selected in the context.
//
// keyList   - the key record plus its hits.
// blockList - stored in _currBamBlockList for the duration of the call.
//
// INTERSECT: when not printable, a summary is reported. When printable and
// there are hits, each hit is reported in detail unless the record was
// emitted as BAM. When printable with no hits, the key is printed followed by
// null fields only for the write-all-overlap (-wao, with a trailing 0) and
// left-join options.
// SAMPLE / MAP / MERGE: the key is printed and terminated.
// For any other program nothing is printed and _currBamBlockList is left
// pointing at blockList.
// NOTE(review): that last case looks unintentional — confirm.
void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blockList) {
    if (needsFlush()) {
        flush();
    }
    // The header is emitted with the first record, because it has not been
    // read from the query file until the first record has been read as well.
    checkForHeader();
    const_cast<Record *>(keyList.getKey())->undoZeroLength();
    _currBamBlockList = blockList;

    switch (_context->getProgram()) {
    case ContextBase::INTERSECT: {
        if (!_printable) {
            reportOverlapSummary(keyList);
        } else if (!keyList.empty()) {
            if (printBamRecord(keyList, true) != BAM_AS_BAM) {
                int hitIdx = 0;
                RecordKeyVector::const_iterator_type hitIter = keyList.begin();
                while (hitIter != keyList.end()) {
                    reportOverlapDetail(keyList.getKey(), *hitIter, hitIdx);
                    ++hitIdx;
                    hitIter = keyList.next();
                }
            }
        } else {
            ContextIntersect *intersectCtx = static_cast<ContextIntersect *>(_context);
            if (intersectCtx->getWriteAllOverlap()) {
                // -wao: the user wants to force the reporting of 0 overlap.
                if (!printKeyAndTerminate(keyList)) {
                    tab();
                    null(false, true);
                    tab();
                    _outBuf.append('0');
                    newline();
                    if (needsFlush()) flush();
                }
            } else if (intersectCtx->getLeftJoin()) {
                if (!printKeyAndTerminate(keyList)) {
                    tab();
                    null(false, true);
                    newline();
                    if (needsFlush()) flush();
                }
            }
        }
        _currBamBlockList = NULL;
        break;
    }
    case ContextBase::SAMPLE:
        if (!printKeyAndTerminate(keyList)) {
            newline();
        }
        _currBamBlockList = NULL;
        break;
    case ContextBase::MAP:
    case ContextBase::MERGE:
        printKeyAndTerminate(keyList);
        _currBamBlockList = NULL;
        break;
    default:
        // Other programs: nothing printed, _currBamBlockList left as set.
        break;
    }
}
// Prints one output line per hit of the key record in `keyList`, or a single
// line with null fields when there are no hits.
//
// keyList - the key record plus its closest hits.
// dists   - optional; when non-null, (*dists)[i] is appended after the i-th
//           hit. It is indexed once per hit, so it must hold at least as many
//           entries as keyList has hits.
//
// For a BAM key, its blocks are fetched into a local list that is exposed
// through _currBamBlockList while printing. Because that list lives on this
// function's stack, the member is always cleared again before returning;
// previously it was cleared only when the blocks had to be deleted, which
// could leave it pointing at a destroyed local.
void RecordOutputMgr::printClosest(RecordKeyVector &keyList, const vector<int> *dists) {
    const ContextClosest *context = static_cast<const ContextClosest *>(_context);
    bool deleteBlocks = false;
    RecordKeyVector blockList(keyList.getKey());

    const bool keyIsBam =
        keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE;
    if (keyIsBam) {
        _bamBlockMgr->getBlocks(blockList, deleteBlocks);
        _currBamBlockList = &blockList;
    }

    if (!keyList.empty()) {
        int distCount = 0;
        for (RecordKeyVector::const_iterator_type iter = keyList.begin();
             iter != keyList.end();
             iter = keyList.next()) {
            printKey(keyList.getKey());
            tab();
            addDbFileId((*iter)->getFileIdx());
            (*iter)->print(_outBuf);
            if (dists != NULL) {
                tab();
                _outBuf.append((*dists)[distCount]);
                distCount++;
            }
            newline();
            if (needsFlush()) flush();
        }
    } else {
        // No hit: print the key, null fields, and -1 as the distance if one
        // is being reported.
        printKey(keyList.getKey());
        tab();
        null(true, false);
        if (context->reportDistance()) {
            tab();
            _outBuf.append(-1);
        }
        newline();
    }

    if (deleteBlocks) {
        _bamBlockMgr->deleteBlocks(blockList);
    }
    if (keyIsBam) {
        // blockList is about to go out of scope; never leave the member
        // pointing at it.
        _currBamBlockList = NULL;
    }
}
// Prints one output line per hit of the key record in `keyList`, or a single
// line with null fields when there are no hits.
//
// keyList - the key record plus its closest hits.
// dists   - optional; when non-null, (*dists)[i] is appended after the i-th
//           hit (as an absolute value unless the context asks for signed
//           distances). It is indexed once per hit, so it must hold at least
//           as many entries as keyList has hits.
//
// For a BAM key, its blocks are fetched into a local list that is exposed
// through _currBamBlockList while printing. Because that list lives on this
// function's stack, the member is always cleared again before returning;
// previously it was cleared only when the blocks had to be deleted, which
// could leave it pointing at a destroyed local.
void RecordOutputMgr::printClosest(RecordKeyVector &keyList, const vector<int> *dists) {
    // The header is emitted with the first record, because it has not been
    // read from the query file until the first record has been read as well.
    checkForHeader();

    const ContextClosest *context = static_cast<const ContextClosest *>(_context);
    bool deleteBlocks = false;
    const Record *keyRec = keyList.getKey();
    RecordKeyVector blockList(keyRec);

    const bool keyIsBam = keyRec->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE;
    if (keyIsBam) {
        _bamBlockMgr->getBlocks(blockList, deleteBlocks);
        _currBamBlockList = &blockList;
    }

    if (!keyList.empty()) {
        int distCount = 0;
        for (RecordKeyVector::const_iterator_type iter = keyList.begin();
             iter != keyList.end();
             iter = keyList.next()) {
            const Record *hitRec = *iter;
            printKey(keyRec, keyRec->getStartPosStr(), keyRec->getEndPosStr());
            tab();
            addDbFileId(hitRec->getFileIdx());
            printKey(hitRec, hitRec->getStartPosStr(), hitRec->getEndPosStr());
            if (dists != NULL) {
                tab();
                int dist = (*dists)[distCount];
                // If not using signed distance, report the absolute value.
                dist = context->signDistance() ? dist : abs(dist);
                _outBuf.append(dist);
                distCount++;
            }
            newline();
            if (needsFlush()) flush();
        }
    } else {
        // No hit: print the key, null fields, and -1 as the distance if one
        // is being reported.
        printKey(keyRec, keyRec->getStartPosStr(), keyRec->getEndPosStr());
        tab();
        null(false, true);
        if (context->reportDistance()) {
            tab();
            _outBuf.append(-1);
        }
        newline();
    }

    if (deleteBlocks) {
        _bamBlockMgr->deleteBlocks(blockList);
    }
    if (keyIsBam) {
        // blockList is about to go out of scope; never leave the member
        // pointing at it.
        _currBamBlockList = NULL;
    }
}
// Replaces the hits of a key record with what remains of the key after the
// hits have been subtracted from it.
//
// On return `hits` holds either the key itself (nothing was subtracted) or
// newly allocated records from _tmpBlocksMgr, one per remaining uncovered
// stretch of the key (in which case _deleteTmpBlocks is set). When the key
// must not be reported at all, _dontReport is set and `hits` is left as is.
//
// Fix over the previous version: the scan that extends an uncovered block
// tested keyBases[i] before checking i against the vector size, reading one
// element past the end whenever an uncovered run reached the end of the key.
void SubtractFile::subtractHits(RecordKeyVector &hits) {
    if (hits.empty()) {
        // No intersection, nothing to subtract. Copy the key into hits as if
        // it were a self-intersection; this is purely for reporting.
        hits.push_back(hits.getKey());
        return;
    }

    if (upCast(_context)->getRemoveAll() && upCast(_context)->getSubtractFraction() == 0.0) {
        // There is at least one hit, and any intersection at all suppresses
        // the key.
        _dontReport = true;
        return;
    }

    Record *keyRec = hits.getKey();
    int keyStart = keyRec->getStartPos();
    int keyEnd = keyRec->getEndPos();
    int keyLen = keyEnd - keyStart;

    // One flag per base of the key: true means uncovered, false means
    // covered. All bases start out uncovered.
    vector<bool> keyBases(keyLen, true);

    // Cover the key bases spanned by each hit that qualifies.
    bool basesRemoved = false;
    for (RecordKeyVector::iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
        Record *hitRec = *iter;
        int hitStart = hitRec->getStartPos();
        int hitEnd = hitRec->getEndPos();

        // Hit clipped to the key, as indices into keyBases.
        int startIdx = max(keyStart, hitStart) - keyStart;
        int endIdx = min(keyEnd, hitEnd) - keyStart;

        int coveredLen = endIdx - startIdx;
        float coveragePct = (float)coveredLen / (float)keyLen;

        // Erase the covered bases only when hits are being summed, or when
        // this single hit covers enough of the key.
        if (upCast(_context)->getRemoveSum() || coveragePct >= upCast(_context)->getSubtractFraction()) {
            std::fill(keyBases.begin() + startIdx, keyBases.begin() + endIdx, false);
            basesRemoved = true;
        }
    }

    if (!basesRemoved) {
        // Treat as if there were no intersection.
        hits.clearVector();
        hits.push_back(hits.getKey());
        return;
    } else if (upCast(_context)->getRemoveAll()) {
        _dontReport = true;
        return;
    }

    // With the -N option (removeSum), do not report the key if the fraction
    // of its bases covered by the hits together exceeds the subtract fraction;
    // otherwise report the key unchanged.
    if (upCast(_context)->getRemoveSum()) {
        // Each uncovered base contributes 1 to the sum.
        int numBasesUncovered = std::accumulate(keyBases.begin(), keyBases.end(), 0);
        float pctCovered = 1.0 - (float)numBasesUncovered / (float)(keyEnd - keyStart);
        if (pctCovered > upCast(_context)->getSubtractFraction()) {
            _dontReport = true;
            return;
        } else {
            hits.clearVector();
            hits.push_back(hits.getKey());
        }
        return;
    }

    // Turn each remaining stretch of uncovered bases into a block record.
    hits.clearVector();
    const int numBases = (int)keyBases.size();
    for (int i = 0; i < numBases; i++) {
        if (keyBases[i]) {
            int blockStart = keyStart + i;
            // Bounds check first so keyBases is never read past its end.
            while (i < numBases && keyBases[i]) {
                i++;
            }
            int blockEnd = min(keyStart + i, keyEnd);
            hits.push_back(_tmpBlocksMgr->allocateAndAssignRecord(keyRec, blockStart, blockEnd));
        }
    }
    _deleteTmpBlocks = true;
}
// Returns the number of bases the key record of `recList` shares with its
// hits.
//
// When the context obeys splits, the per-hit overlap is read from _blockMgr
// (indexed by hit position); otherwise it is the span between the clipped
// start and end of each hit against the key.
// Also adds the number of hits to _numIntersections.
unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList) {
    const Record *queryRec = recList.getKey();
    int queryStart = queryRec->getStartPos();
    int queryEnd = queryRec->getEndPos();

    unsigned long totalBases = 0;
    int splitIdx = 0;

    RecordKeyVector::const_iterator_type it = recList.begin();
    while (it != recList.end()) {
        int overlapStart = max((*it)->getStartPos(), queryStart);
        int overlapEnd = min((*it)->getEndPos(), queryEnd);

        if (!_context->getObeySplits()) {
            totalBases += static_cast<unsigned long>(overlapEnd - overlapStart);
        } else {
            // Split-aware overlap was computed elsewhere; just look it up.
            totalBases += _blockMgr->getOverlapBases(splitIdx);
            ++splitIdx;
        }
        it = recList.next();
    }

    _numIntersections += static_cast<int>(recList.size());
    return totalBases;
}
// Returns the number of bases the key record of `recList` shares with its
// hits, and records size statistics along the way.
//
// Side effects: adds the number of hits to _overlapCounts and to
// _numIntersections; appends to _qsizes first the key's length and then the
// clipped overlap length of every hit.
//
// When the context obeys splits, the per-hit overlap added to the result is
// read from the context's split-block info (indexed by hit position);
// otherwise it is the clipped overlap length.
unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList) {
    Record *queryRec = recList.getKey();
    CHRPOS queryStart = queryRec->getStartPos();
    CHRPOS queryEnd = queryRec->getEndPos();

    _overlapCounts += recList.size();
    // Sizes are stored as int, so anything beyond the int range is truncated.
    _qsizes.push_back(static_cast<int>(queryEnd - queryStart));

    unsigned long totalBases = 0;
    int splitIdx = 0;

    RecordKeyVector::iterator_type it = recList.begin();
    while (it != recList.end()) {
        CHRPOS overlapStart = max((*it)->getStartPos(), queryStart);
        CHRPOS overlapEnd = min((*it)->getEndPos(), queryEnd);
        _qsizes.push_back(static_cast<int>(overlapEnd - overlapStart));

        if (!_context->getObeySplits()) {
            totalBases += static_cast<unsigned long>(overlapEnd - overlapStart);
        } else {
            // Split-aware overlap was computed elsewhere; just look it up.
            totalBases += upCast(_context)->getSplitBlockInfo()->getOverlapBases(splitIdx);
            ++splitIdx;
        }
        it = recList.next();
    }

    _numIntersections += static_cast<int>(recList.size());
    return totalBases;
}