Exemplo n.º 1
0
// Filters hitList down to the records whose blocks overlap the query's blocks
// sufficiently, appending each accepted record to resultList and its summed
// overlap to _overlapBases.
//
// keyList:    blocks of the query record; if empty on entry it is filled via
//             getBlocks() and cleaned up again before returning (when
//             getBlocks() reports that the blocks must be deleted).
// hitList:    the records to test against the query.
// resultList: receives accepted hits; its key is set to keyList's key.
//
// A hit is accepted when it has at least one positive block overlap and the
// summed overlap divided by the query's total block length is at least
// _overlapFraction; when _hasReciprocal is set, the same must also hold
// relative to the hit's total block length.
//
// Returns the size of resultList.
int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hitList, RecordKeyVector &resultList)
{
	bool ownsKeyBlocks = false;
	if (keyList.empty()) {
		// caller did not supply the query's blocks, so build them here.
		getBlocks(keyList, ownsKeyBlocks);
	}
	_overlapBases.clear();
	const int queryBlocksLen = getTotalBlockLength(keyList);

	// visit every hit record.
	RecordKeyVector::const_iterator_type hitIt = hitList.begin();
	while (hitIt != hitList.end()) {
		RecordKeyVector hitBlocks(*hitIt);
		bool ownsHitBlocks = false;
		getBlocks(hitBlocks, ownsHitBlocks);
		const int hitBlocksLen = getTotalBlockLength(hitBlocks);

		int overlapSum = 0;
		bool sawOverlap = false;

		// compare each block of the hit against each block of the query.
		for (RecordKeyVector::const_iterator_type hbIt = hitBlocks.begin(); hbIt != hitBlocks.end(); hbIt = hitBlocks.next()) {
			const Record *hitBlock = *hbIt;
			for (RecordKeyVector::const_iterator_type kbIt = keyList.begin(); kbIt != keyList.end(); kbIt = keyList.next()) {
				const Record *keyBlock = *kbIt;

				const int lo = max(keyBlock->getStartPos(), hitBlock->getStartPos());
				const int hi = min(keyBlock->getEndPos(), hitBlock->getEndPos());
				const int shared = hi - lo;
				if (shared > 0) {
					sawOverlap = true;
					overlapSum += shared;
				}
			}
		}

		if (sawOverlap && ((float)overlapSum / (float)queryBlocksLen >= _overlapFraction)) {
			// the hit-side fraction is only evaluated when reciprocal mode is on.
			const bool reciprocalOk = !_hasReciprocal ||
					((float)overlapSum / (float)hitBlocksLen >= _overlapFraction);
			if (reciprocalOk) {
				_overlapBases.push_back(overlapSum);
				resultList.push_back(*hitIt);
			}
		}

		if (ownsHitBlocks) {
			deleteBlocks(hitBlocks);
		}
		hitIt = hitList.next();
	}

	if (ownsKeyBlocks) {
		deleteBlocks(keyList);
	}
	resultList.setKey(keyList.getKey());
	return (int)resultList.size();
}
Exemplo n.º 2
0
// Splits the BED12 key record of keyList into one record per block and
// appends those records to keyList.
//
// mustDelete is set to true when block records were allocated here, and to
// false when the key reports a block count <= 0 (nothing is appended then).
//
// Terminates the program with exit(-1) if the block count, the number of
// comma-separated block sizes and the number of comma-separated block starts
// do not all agree.
void BlockMgr::getBlocksFromBed12(RecordKeyVector &keyList, bool &mustDelete)
{
	const Bed12Interval *bed12 = static_cast<const Bed12Interval *>(keyList.getKey());
	const int numBlocks = bed12->getBlockCount();

	if (numBlocks <= 0) {
		mustDelete = false;
		return;
	}

	const int numSizes = _blockSizeTokens.tokenize(bed12->getBlockSizes(), ',');
	const int numStarts = _blockStartTokens.tokenize(bed12->getBlockStarts(), ',');

	const bool countsAgree = (numBlocks == numSizes) && (numSizes == numStarts);
	if (!countsAgree) {
		fprintf(stderr, "Error: found wrong block counts while splitting entry.\n");
		exit(-1);
	}

	int idx = 0;
	while (idx < numBlocks) {
		// block starts are offsets from the key record's start position.
		const int blockStart = bed12->getStartPos() + str2chrPos(_blockStartTokens.getElem(idx).c_str());
		const int blockEnd = blockStart + str2chrPos(_blockSizeTokens.getElem(idx).c_str());

		Record *block = allocateAndAssignRecord(bed12, blockStart, blockEnd);
		keyList.push_back(block);
		++idx;
	}
	mustDelete = true;
}
Exemplo n.º 3
0
// For every database, collects into retList the records that intersect the
// current query record: first from that database's cache (scanCache), then by
// reading further records until the current database record is NULL, is on a
// different chromosome, or lies after the query.
//
// Databases for which dbFinished() or chromChange() returns true are skipped.
void NewChromSweep::masterScan(RecordKeyVector &retList) {

	for (int dbIdx = 0; dbIdx < _numDBs; ++dbIdx) {
		// chromChange() is only consulted when the database is not finished.
		if (dbFinished(dbIdx) || chromChange(dbIdx, retList, true)) {
			continue;
		}

		// hits that are already sitting in the cache.
		scanCache(dbIdx, retList);

		// advance this database until its current record is past the query,
		// recording hits and caching records along the way.
		while (_currDbRecs[dbIdx] != NULL &&
				_currQueryRec->sameChrom(_currDbRecs[dbIdx]) &&
				!(_currDbRecs[dbIdx]->after(_currQueryRec))) {
			if (intersects(_currQueryRec, _currDbRecs[dbIdx])) {
				retList.push_back(_currDbRecs[dbIdx]);
			}
			if (_currQueryRec->after(_currDbRecs[dbIdx])) {
				// query is already past this record: release it.
				_dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]);
			} else {
				// otherwise keep it in the cache.
				_caches[dbIdx].push_back(_currDbRecs[dbIdx]);
			}
			_currDbRecs[dbIdx] = NULL;
			nextRecord(false, dbIdx);
		}
	}
}
Exemplo n.º 4
0
// When the multi-db mode is ALL_DBS and there is more than one database,
// reduces retList / _finalDistances to the entries whose absolute distance is
// the minimum over _finalDistances, then applies the tie mode:
//   FIRST_TIE - keep only the first such entry,
//   LAST_TIE  - keep only the last such entry,
//   otherwise - keep all of them.
// Does nothing in any other multi-db configuration.
//
// Entry i of retList is paired with _finalDistances[i].
void CloseSweep::checkMultiDbs(RecordKeyVector &retList) {
	ContextClosest::tieModeType tieMode = _context->getTieMode();

	if (_context->getMultiDbMode() != ContextClosest::ALL_DBS || _numDBs <= 1) {
		return;
	}

	_copyDists.clear();
	_copyRetList.clearAll();
	_copyRetList.setKey(retList.getKey());

	// pass 1: smallest absolute distance.
	int closest = INT_MAX;
	for (int k = 0; k < (int)_finalDistances.size(); ++k) {
		if (abs(_finalDistances[k]) < closest) {
			closest = abs(_finalDistances[k]);
		}
	}

	// pass 2: copy out every record sitting at that distance.
	int pos = 0;
	for (RecordKeyVector::const_iterator_type it = retList.begin(); it != retList.end(); it++, pos++) {
		const int dist = _finalDistances[pos];
		if (abs(dist) != closest) {
			continue;
		}
		_copyDists.push_back(dist);
		_copyRetList.push_back(*it);
	}

	retList.clearVector();
	_finalDistances.clear();

	if (_copyRetList.empty()) return;

	switch (tieMode) {
	case ContextClosest::FIRST_TIE:
		retList.push_back(*(_copyRetList.begin()));
		_finalDistances.push_back(_copyDists[0]);
		break;

	case ContextClosest::LAST_TIE:
		retList.push_back(*(_copyRetList.begin() + _copyRetList.size() -1));
		_finalDistances.push_back(_copyDists[_copyDists.size()-1]);
		break;

	default:
		// any other tie mode keeps every closest record.
		retList = _copyRetList;
		_finalDistances = _copyDists;
		break;
	}
}
Exemplo n.º 5
0
// Splits the BAM key record of keyList into block records according to its
// CIGAR operations and appends them to keyList. mustDelete is always set to
// true.
//
// Handling per operation type:
//   I, S, P, H : ignored.
//   M, X, =    : extend the current block by the operation length.
//   D          : extend the current block unless _breakOnDeletionOps is set.
//   N          : extend the current block unless _breakOnSkipOps is set.
//   When a D or N op breaks, the current block is emitted and the next block
//   starts after the emitted block plus the length of the breaking op.
//   anything else terminates the program with exit(1).
// A trailing block is emitted only if its length is positive.
//
// NOTE(review): a breaking D/N op emits the current block even when its
// length is 0 (e.g. two breaking ops in a row) - confirm this is intended.
void BlockMgr::getBlocksFromBam(RecordKeyVector &keyList, bool &mustDelete)
{
	const BamRecord *bam = static_cast<const BamRecord *>(keyList.getKey());
	const vector<BamTools::CigarOp> &cigar = bam->getCigarData();
	int blockStart = bam->getStartPos();
	int blockLen = 0;

	for (int opIdx = 0; opIdx < (int)cigar.size(); ++opIdx) {
		const char op = cigar[opIdx].Type;
		const int len = (int)(cigar[opIdx].Length);

		if (op == 'I' || op == 'S' || op == 'P' || op == 'H') {
			continue;
		}

		if (op == 'M' || op == 'X' || op == '=') {
			blockLen += len;
			continue;
		}

		if (op == 'D' || op == 'N') {
			const bool breaksBlock = (op == 'D') ? !(!_breakOnDeletionOps) : !(!_breakOnSkipOps);
			if (breaksBlock) {
				keyList.push_back(allocateAndAssignRecord(bam, blockStart, blockStart + blockLen));
				blockStart += len + blockLen;
				blockLen = 0;
			} else {
				blockLen += len;
			}
			continue;
		}

		fprintf(stderr, "ERROR: Found invalid Cigar operation: %c.\n", op);
		exit(1);
	}

	if (blockLen > 0) {
		keyList.push_back(allocateAndAssignRecord(bam, blockStart, blockStart + blockLen));
	}
	mustDelete = true;
}
Exemplo n.º 6
0
// Builds the next group of records in hits.
//
// The group's key is _prevRecord, which is also added as the group's first
// member. Records returned by getNextRecord() are appended for as long as
// canGroup() accepts them. Afterwards _prevRecord holds the first record that
// was not accepted, or NULL when getNextRecord() ran out of records.
//
// Returns false (leaving hits untouched) when _prevRecord is NULL on entry,
// true otherwise.
bool GroupBy::findNext(RecordKeyVector &hits)
{
	if (_prevRecord == NULL) {
		return false;
	}

	assignPrevFields();
	hits.setKey(_prevRecord);
	hits.push_back(_prevRecord); // the key takes part in the group's calculations too

	const Record *candidate = getNextRecord();
	while (candidate != NULL && canGroup(candidate)) {
		hits.push_back(candidate);
		candidate = getNextRecord();
	}

	// either NULL (input exhausted) or the record that starts the next group.
	_prevRecord = candidate;
	return true;
}
Exemplo n.º 7
0
// Fills keyList with the blocks of its key record, dispatching on the key's
// record type:
//   BED12_RECORD_TYPE -> getBlocksFromBed12()
//   BAM_RECORD_TYPE   -> getBlocksFromBam()
//   any other type    -> the key itself is appended as the only block and
//                        mustDelete is set to false.
// For the BED12 and BAM cases mustDelete is set by the called function.
void BlockMgr::getBlocks(RecordKeyVector &keyList, bool &mustDelete)
{
	switch (keyList.getKey()->getType()) {
	case FileRecordTypeChecker::BED12_RECORD_TYPE:
		getBlocksFromBed12(keyList, mustDelete);
		return;

	case FileRecordTypeChecker::BAM_RECORD_TYPE:
		getBlocksFromBam(keyList, mustDelete);
		return;

	default:
		break;
	}

	// not a multi-block format: the record is its own single block.
	keyList.push_back(keyList.getKey());
	mustDelete = false;
}
Exemplo n.º 8
0
void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
	recListIterType cacheIter = _caches[dbIdx].begin();
    while (cacheIter != _caches[dbIdx].end())
    {
    	const Record *cacheRec = cacheIter->value();
        if (_currQueryRec->sameChrom(cacheRec) && !_currQueryRec->after(cacheRec)) {
            if (intersects(_currQueryRec, cacheRec)) {
                retList.push_back(cacheRec);
            } else if (cacheRec->after(_currQueryRec)) break; // cacheRec is after the query rec, stop scanning.
            cacheIter = _caches[dbIdx].next();
        }
        else {
            cacheIter = _caches[dbIdx].deleteCurrent();
    		_dbFRMs[dbIdx]->deleteRecord(cacheRec);
        }
    }
}
Exemplo n.º 9
0
// Appends rec to retList and its distance currDist to _finalDistances, and
// increments the caller's hitsUsed counter by one.
void CloseSweep::addSingleRec(Record *rec, int currDist, int &hitsUsed, RecordKeyVector &retList) {
	// the record and its distance are appended together, one entry each.
	retList.push_back(rec);
	_finalDistances.push_back(currDist);
	++hitsUsed;
}
Exemplo n.º 10
0
// Subtracts the hit records in hits from the query (the key of hits) and
// rewrites hits to describe what should be reported.
//
// Outcomes:
//  - hits empty on entry: the key itself is pushed into hits.
//  - remove-all mode with a subtract fraction of 0.0: _dontReport is set.
//  - no hit qualified for removal: hits is reset to contain only the key.
//  - remove-all mode and at least one hit qualified: _dontReport is set.
//  - remove-sum mode: _dontReport is set when the covered fraction of the
//    query exceeds the subtract fraction; otherwise hits is reset to contain
//    only the key.
//  - otherwise: hits is replaced by newly allocated records, one per maximal
//    run of uncovered query bases, and _deleteTmpBlocks is set.
//
// A hit qualifies for removal when remove-sum mode is on, or when the part of
// the query it covers is at least the subtract fraction of the query length.
void SubtractFile::subtractHits(RecordKeyVector &hits) {
	if (hits.empty()) {
        // no intersection, nothing to subtract.
        // just copy key to hits as if it were a
        // self-intersection. This is just for reporting
        // purposes.
        hits.push_back(hits.getKey());
		return;
	}

	if (upCast(_context)->getRemoveAll() && upCast(_context)->getSubtractFraction() == 0.0) {
		// hits aren't empty, meaning there is intersection,
		// so we want to not report the hit.
		_dontReport = true;
		return;
	}

	//loop through hits. Track which bases in query were covered
	Record *keyRec = hits.getKey();
	int keyStart = keyRec->getStartPos();
	int keyEnd = keyRec->getEndPos();
	int keyLen = keyEnd - keyStart;

	//this vector of bools will represent the bases of the query.
	//for each base, true means uncovered, false means covered.
	//they begin as all uncovered.
	vector<bool> keyBases(keyLen, true);

	//now loop through the hits, and cover corresponding query bases
	//by setting them to false.
	bool basesRemoved = false;
	for (RecordKeyVector::iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
		Record *hitRec = *iter;
		int hitStart = hitRec->getStartPos();
		int hitEnd = hitRec->getEndPos();

		//clip the hit to the query and convert to offsets into keyBases.
		int startIdx = max(keyStart, hitStart) - keyStart;
		int endIdx = min(keyEnd, hitEnd) - keyStart;

		int coveredLen = endIdx - startIdx;
		float coveragePct = (float)coveredLen / (float)keyLen;
		//for each base in the hit, set the base in the query to false.
		//this effectively "erases" the covered bits. Only do this when the
		//hit qualifies for removal.
		if (upCast(_context)->getRemoveSum() || coveragePct >= upCast(_context)->getSubtractFraction()) {
			//std::fill requires a valid range; an inverted range (a hit
			//that does not actually overlap the query) is undefined
			//behavior, so only fill when there is something to cover.
			if (startIdx < endIdx) {
				std::fill(keyBases.begin() + startIdx, keyBases.begin() + endIdx, false);
			}
			basesRemoved = true;
		}
	}

	if (!basesRemoved) {
		//treat as if there were no intersection
		hits.clearVector();
		hits.push_back(hits.getKey());
		return;
	} else if (upCast(_context)->getRemoveAll()) {
		_dontReport = true;
		return;
	}
	// if the -N option is used ( removeSum), do not report if the percentage of
	// uniquely covered bases exceeds the overlap fraction.
	if (upCast(_context)->getRemoveSum()) {
		//determine how many bases are left uncovered.
		int numBasesUncovered = std::accumulate(keyBases.begin(), keyBases.end(), 0);
		//determine percentage that are covered.
		float pctCovered = 1.0 - (float)numBasesUncovered / (float)keyLen;
		if (pctCovered > upCast(_context)->getSubtractFraction()) {
			_dontReport = true;
		} else {
            hits.clearVector();
            hits.push_back(hits.getKey());
        }
		return;
	}

	//now make "blocks" out of the query's remaining stretches of
	//uncovered bases.
	hits.clearVector();
    const int numBases = (int)keyBases.size();
    for (int i = 0; i < numBases; i++) {
        if (keyBases[i]) {
            int blockStart = keyStart + i;
            //the bounds check must come first: when an uncovered run
            //reaches the end of the query, i becomes numBases and
            //keyBases[i] would read past the end of the vector.
            while (i < numBases && keyBases[i]) {
                i++;
            }
            int blockEnd = min(keyStart + i, keyEnd);
            hits.push_back(_tmpBlocksMgr->allocateAndAssignRecord(keyRec, blockStart, blockEnd));
        }
    }
    _deleteTmpBlocks = true;

}
Exemplo n.º 11
0
void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) {
	// If there are actual overlaps, only report those, then stop.
	ContextClosest::tieModeType tieMode = _context->getTieMode();
	const vector<const Record *>  & overlapRecs = (*(_overlapRecs[dbIdx]));
	if (!overlapRecs.empty()) {
		if (tieMode == ContextClosest::FIRST_TIE) {
			retList.push_back(overlapRecs[0]);
			_finalDistances.push_back(0);
		} else if (tieMode == ContextClosest::LAST_TIE) {
			retList.push_back(overlapRecs[overlapRecs.size()-1]);
			_finalDistances.push_back(0);
		} else {

			for (int i=0; i < (int)overlapRecs.size(); i++) {
				retList.push_back(overlapRecs[i]);
				_finalDistances.push_back(0);
			}
		}
		return;
	}
	int upStreamDist = _minUpstreamDist[dbIdx];
	int downStreamDist = _minDownstreamDist[dbIdx];
	const vector<const Record *>  & upRecs = (*(_minUpstreamRecs[dbIdx]));
	const vector<const Record *>  & downRecs = (*(_minDownstreamRecs[dbIdx]));

	if (abs(upStreamDist) < abs(downStreamDist)) {
		if (tieMode == ContextClosest::FIRST_TIE) {
			retList.push_back(upRecs[0]);
			_finalDistances.push_back(upStreamDist);
		} else if (tieMode == ContextClosest::LAST_TIE) {
			retList.push_back(upRecs[upRecs.size()-1]);
			_finalDistances.push_back(upStreamDist);
		} else {

			for (int i=0; i < (int)upRecs.size(); i++) {
				retList.push_back(upRecs[i]);
				_finalDistances.push_back(upStreamDist);
			}
		}
		return;
	}

	if (abs(downStreamDist) < abs(upStreamDist)) {
		if (tieMode == ContextClosest::FIRST_TIE) {
			retList.push_back(downRecs[0]);
			_finalDistances.push_back(downStreamDist);
		} else if (tieMode == ContextClosest::LAST_TIE) {
			retList.push_back(downRecs[downRecs.size()-1]);
			_finalDistances.push_back(downStreamDist);
		} else {

			for (int i=0; i < (int)downRecs.size(); i++) {
				retList.push_back(downRecs[i]);
				_finalDistances.push_back(downStreamDist);
			}
		}
		return;
	}


	if (downStreamDist == upStreamDist) {
		if (tieMode == ContextClosest::FIRST_TIE) {
			retList.push_back(upRecs[0]);
			_finalDistances.push_back(upStreamDist);
		} else if (tieMode == ContextClosest::LAST_TIE) {
			retList.push_back(downRecs[downRecs.size()-1]);
			_finalDistances.push_back(downStreamDist);
		} else {

			for (int i=0; i < (int)upRecs.size(); i++) {
				retList.push_back(upRecs[i]);
				_finalDistances.push_back(upStreamDist);
			}
			for (int i=0; i < (int)downRecs.size(); i++) {
				retList.push_back(downRecs[i]);
				_finalDistances.push_back(downStreamDist);
			}
		}
		return;
	}

}