Exemple #1
0
// Reads the next entry from the file reader and returns it as a newly
// allocated Record.
//
// Returns NULL when the reader is not open, when no further entry can be
// read, or when the new record cannot be initialized from the reader (in
// which case the record is handed back to the record manager first).
//
// The first time the reader reports a header, that header is stored in the
// context for this file. Records for which both isUnmapped() and
// isMateUnmapped() are true skip the coordinate and sort-order checks but
// are still returned. A record with invalid coordinates terminates the
// program.
Record *FileRecordMgr::allocateAndGetNextRecord()
{
	// Closed reader or exhausted input: nothing to return.
	if (!_fileReader->isOpen() || !_fileReader->readEntry()) {
		return NULL;
	}

	// Record the header exactly once.
	if (!_headerSet) {
		if (_fileReader->hasHeader()) {
			_context->setHeader(_contextFileIdx, _fileReader->getHeader());
			_headerSet = true;
		}
	}

	Record *newRec = _recordMgr->allocateRecord();
	if (!newRec->initFromFile(_fileReader)) {
		_recordMgr->deleteRecord(newRec);
		return NULL;
	}

	// Only validate records that are not "read and mate both unmapped".
	const bool pairUnmapped = newRec->isUnmapped() && newRec->isMateUnmapped();
	if (!pairUnmapped) {
		if (!newRec->coordsValid()) {
			cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl << *newRec << endl;
			exit(1);
		}

		// Enforce sorted order only when the context asks for it.
		if (_context->getSortedInput()) {
			testInputSortOrder(newRec);
		}
	}

	assignChromId(newRec);
	_totalRecordLength += (unsigned long)(newRec->getEndPos() - newRec->getStartPos());
	return newRec;
}
Exemple #2
0
// Sums the overlap between the key record of recList and each of its hits,
// and returns that total.
//
// Side effects: adds the number of hits to _overlapCounts and to
// _numIntersections, and appends to _qsizes first the key's length and then
// the (start-clamped, end-clamped) overlap length of every hit.
//
// When the context's obey-splits setting is on, the per-hit contribution is
// taken from the split block info (indexed by hit order) instead of the
// clamped interval length.
unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
{
    Record *queryRec = recList.getKey();
    CHRPOS queryStart = queryRec->getStartPos();
    CHRPOS queryEnd = queryRec->getEndPos();

    _overlapCounts += recList.size();
    // sizes are stored as int, so anything above ~2.1 billion is truncated
    _qsizes.push_back((int)(queryEnd - queryStart));

    unsigned long totalBases = 0;
    int splitIdx = 0;
    for (RecordKeyVector::iterator_type hitIter = recList.begin(); hitIter != recList.end(); hitIter = recList.next()) {
        // Clamp the hit to the bounds of the key record.
        CHRPOS overlapStart = max((*hitIter)->getStartPos(), queryStart);
        CHRPOS overlapEnd = min((*hitIter)->getEndPos(), queryEnd);
        _qsizes.push_back((int)(overlapEnd - overlapStart));

        if (!_context->getObeySplits()) {
            totalBases += (unsigned long)(overlapEnd - overlapStart);
        } else {
            totalBases += upCast(_context)->getSplitBlockInfo()->getOverlapBases(splitIdx);
            splitIdx++;
        }
    }

    _numIntersections += (int)recList.size();
    return totalBases;
}
Exemple #3
0
// Returns the total overlap between the key record of hits and each hit.
//
// For every hit the overlap is the hit interval clamped to the key's
// start/end. When the context's obey-splits setting is on, the per-hit
// contribution is instead read from the split block info, indexed by hit
// order. The number of hits is added to _numIntersections.
unsigned long Jaccard::getTotalIntersection(RecordKeyVector &hits)
{
	Record *queryRec = hits.getKey();
	CHRPOS queryStart = queryRec->getStartPos();
	CHRPOS queryEnd = queryRec->getEndPos();

	unsigned long totalBases = 0;
	int splitIdx = 0;
	for (RecordKeyVector::iterator_type hitIter = hits.begin(); hitIter != hits.end(); hitIter = hits.next()) {
		Record *hit = *hitIter;
		// Clamp the hit to the bounds of the key record.
		CHRPOS overlapStart = max(hit->getStartPos(), queryStart);
		CHRPOS overlapEnd = min(hit->getEndPos(), queryEnd);

		if (!_context->getObeySplits()) {
			totalBases += (unsigned long)(overlapEnd - overlapStart);
		} else {
			totalBases += upCast(_context)->getSplitBlockInfo()->getOverlapBases(splitIdx);
			splitIdx++;
		}
	}

	_numIntersections += (int)hits.size();
	return totalBases;
}
bool NewChromSweep::nextRecord(bool query, int dbIdx) {
	if (query) {
		_currQueryRec = _queryFRM->getNextRecord();
		if (_currQueryRec != NULL) {
			_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
            _queryTotalRecords++;
			return true;
		}
		return false;
	} else { //database
		Record *rec = _dbFRMs[dbIdx]->getNextRecord();
		_currDbRecs[dbIdx] = rec;
		if (rec != NULL) {
			_databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos());
			_databaseTotalRecords++;
			return true;
		}
		return false;
	}
}
Exemple #5
0
// Reads the next entry from the file reader and returns it as a newly
// allocated Record.
//
// keyList - if not NULL, the returned record is also installed as its key.
//
// Returns NULL when the reader is not open, when no further entry can be
// read, or when the new record cannot be initialized from the reader (in
// which case the record is handed back to the record manager first).
// A mapped record with invalid coordinates terminates the program.
Record *FileRecordMgr::getNextRecord(RecordKeyList *keyList)
{
	// Closed reader or exhausted input: nothing to return.
	if (!_fileReader->isOpen() || !_fileReader->readEntry()) {
		return NULL;
	}

	Record *newRec = _recordMgr->allocateRecord();
	if (!newRec->initFromFile(_fileReader)) {
		_recordMgr->deleteRecord(newRec);
		return NULL;
	}

	// Unmapped records skip the coordinate and sort-order checks, but are
	// still returned so that options such as -v (noHit) get to see them.
	const bool isMapped = !newRec->isUnmapped();
	if (isMapped) {
		if (!newRec->coordsValid()) {
			cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl << *newRec << endl;
			exit(1);
		}

		// Enforce sorted order only when sorted input was requested.
		if (_isSortedInput) {
			testInputSortOrder(newRec);
		}
	}

	assignChromId(newRec);
	_totalRecordLength += (unsigned long)(newRec->getEndPos() - newRec->getStartPos());

	if (keyList != NULL) {
		keyList->setKey(newRec);
	}
	return newRec;
}
// Returns the next merged record.
//
// A start record is taken from storage or, failing that, from the file
// (skipping records that do not satisfy the desired strand). Following
// records are then absorbed for as long as they are on the same chromosome,
// start no further than _maxDistance past the current merged end, and
// satisfy the strand requirement.
//
// recList - optional. When non-NULL: it is first cleaned via
//           deleteMergedRecord() if it is not already all clear, every
//           record that takes part in the merge is pushed onto it, and its
//           key is set to the record that is returned.
//
// Returns NULL if EOF is reached before a start record is found. Returns
// the start record itself if nothing was merged into it. Otherwise returns
// a newly allocated copy of the start record whose end position is the
// merged end. In both non-NULL cases the merged length is added to
// _totalMergedRecordLength.
Record *FileRecordMergeMgr::getNextRecord(RecordKeyVector *recList)
{
	//clear the recList if there is one, and if it has records
	// in it.
	if (recList != NULL && !recList->allClear()) {
		deleteMergedRecord(*recList);
	}

	// Strand requirements for the start record: only the explicit
	// forward/reverse modes constrain it.
	_mustBeForward = _desiredStrand == SAME_STRAND_FORWARD;
	_mustBeReverse = _desiredStrand == SAME_STRAND_REVERSE;

	// presumably returns a record previously set aside with addToStorage(),
	// or NULL if there is none -- confirm against its definition.
	Record *startRecord = tryToTakeFromStorage();

	// if we couldn't use a previously stored record for starters,
	//then begin with a new one that matches strand criteria.
	while (startRecord == NULL) {
		startRecord = FileRecordMgr::getNextRecord();
		if (startRecord == NULL) { //hit EOF!!
			return NULL;
		}

		if ((_mustBeForward && (startRecord->getStrandVal() != Record::FORWARD)) || (_mustBeReverse && (startRecord->getStrandVal() != Record::REVERSE))) {
			//record is reverse, only want forward, OR record is forward, wanted reverse
			deleteRecord(startRecord);
			startRecord = NULL;
			continue;
		}
		if (startRecord->getStrandVal() == Record::UNKNOWN && _desiredStrand != ANY_STRAND) {
			//there is an unknown strand, but the user specified strandedness.
			deleteRecord(startRecord);
			startRecord = NULL;
		}
	}

	// OK!! We have a start record! Re-evaluate strand requirements for the next record.
	// With SAME_STRAND_EITHER, the start record's strand decides what may follow.

	_mustBeForward = _desiredStrand == SAME_STRAND_FORWARD || (_desiredStrand == SAME_STRAND_EITHER && (startRecord->getStrandVal() == Record::FORWARD));
	_mustBeReverse = _desiredStrand == SAME_STRAND_REVERSE || (_desiredStrand == SAME_STRAND_EITHER && (startRecord->getStrandVal() == Record::REVERSE));
	// With SAME_STRAND_EITHER, wrong-strand records are stored rather than deleted (see below).
	bool mustKeepOpposite = (_desiredStrand == SAME_STRAND_EITHER);

	const QuickString &currChrom = startRecord->getChrName();
	_foundChroms.insert(currChrom);

	bool madeComposite = false;
	if (recList != NULL) {
		recList->push_back(startRecord);
		recList->setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more.
	}

	Record::strandType currStrand = startRecord->getStrandVal();
	bool mustMatchStrand = _desiredStrand != ANY_STRAND;

	// Running end position of the merged interval.
	int currEnd = startRecord->getEndPos();
	//now look for more records to merge with this one.
	//stop when they're out of range, not on the same chromosome, or we hit EOF.
	//ignore if they don't comply with strand.
	// nextRecord is reset to NULL at the end of every iteration that should
	// keep scanning; the loop is only left through one of the breaks.
	Record *nextRecord = NULL;
	while (nextRecord == NULL) {
		bool takenFromStorage = false;
		// Try storage first, then fall back to the file.
		nextRecord = mustMatchStrand ? tryToTakeFromStorage(currStrand) : tryToTakeFromStorage();
		if (nextRecord == NULL) {
			nextRecord = FileRecordMgr::getNextRecord();
		} else {
			takenFromStorage = true;
		}
		if (nextRecord == NULL) { // EOF hit
			break;
		}
		//delete any record from file with an unknown strand if we are doing a stranded merge, but first check
		//that its chrom was the same and it's not out of range. If either check fails, stop scanning.
		bool mustDelete = (mustMatchStrand && nextRecord->getStrandVal() == Record::UNKNOWN);

		//check that we are still on the same chromosome.
		const QuickString &newChrom = nextRecord->getChrName();
		if (newChrom != currChrom) { //hit a different chromosome.
			//haven't seen this chromosome before, sort order is already enforced in the base class method.
			// Keep the record for a later call unless it has to be discarded.
			if (!mustDelete) {
				addToStorage(nextRecord);
			} else {
				deleteRecord(nextRecord);
			}
			nextRecord = NULL;
			break;
		}

		//check whether it's in range
		int nextStart = nextRecord->getStartPos();
		if (nextStart > currEnd + _maxDistance) {
			//no, it's out of range.
			// Keep the record for a later call unless it has to be discarded.
			if (!mustDelete) {
				addToStorage(nextRecord);
			} else {
				deleteRecord(nextRecord);
			}
			nextRecord = NULL;
			break;
		}

		// NOW, going back, we can delete any unknown strand records. But don't stop scanning.
		if (mustDelete) {
			deleteRecord(nextRecord);
			nextRecord = NULL;
			continue;
		}
		//if taken from file, and wrong strand, store or delete.
		if (!takenFromStorage && ((_mustBeForward && (nextRecord->getStrandVal() != Record::FORWARD)) || (_mustBeReverse && (nextRecord->getStrandVal() != Record::REVERSE)))) {
			if (mustKeepOpposite) {
				addToStorage(nextRecord);
			} else {
				deleteRecord(nextRecord);
			}
			nextRecord = NULL;
			continue; //get the next record
		}
		//ok, they're on the same chrom and in range, and the strand is good. Do a merge.
		// NOTE(review): when recList is NULL the merged record is neither stored
		// nor deleted in this function -- confirm who owns it in that case.
		if (recList != NULL) recList->push_back(nextRecord);
		madeComposite = true;
		// Extend the merged interval only if this record reaches further.
		int nextEnd = nextRecord->getEndPos();
		if (nextEnd > currEnd) {
			currEnd = nextEnd;
		}
		nextRecord = NULL;
	}
	if (madeComposite) {
		// Build a new key: a copy of the start record stretched to the merged end.
		Record *newKey = _recordMgr->allocateRecord();
		(*newKey) = (*startRecord);
		newKey->setEndPos(currEnd);
		if (recList != NULL) recList->setKey(newKey);
		_totalMergedRecordLength += currEnd - newKey->getStartPos();
		return newKey;
	} else {
		// Nothing merged: the start record is returned as is.
		_totalMergedRecordLength += currEnd - startRecord->getStartPos();
		return startRecord;
	}
//	_totalMergedRecordLength += (unsigned long)(recList->getKey()->getEndPos() - recList->getKey()->getStartPos());
//	return const_cast<Record *>(recList->getKey());
}
Exemple #7
0
// Builds the next merged record and delivers it through recList.
//
// A start record is taken from storage or, failing that, from the file.
// Following records are then absorbed for as long as they are on the same
// chromosome, start no further than maxDistance past the current merged
// end, and (when a strand mode other than ANY_STRAND is requested) have the
// same strand as the start record.
//
// recList       - cleaned via deleteMergedRecord() on entry if not already
//                 all clear; receives every record that takes part in the
//                 merge. Its key is the start record if nothing was merged,
//                 otherwise a newly allocated copy of the start record whose
//                 end position is the merged end.
// desiredStrand - strand mode that decides which records may be merged.
// maxDistance   - largest allowed gap between the current merged end and
//                 the start of the next record.
//
// Returns false if EOF is reached before a start record is found, true
// otherwise. The merged length is added to _totalMergedRecordLength.
// Terminates the program if a chromosome seen earlier shows up again in a
// record read from the file.
bool FileRecordMgr::allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand, int maxDistance)
{
	if (!recList.allClear()) {
		deleteMergedRecord(recList);
	}

	// Strand requirements for the start record: only the explicit
	// forward/reverse modes constrain it.
	_mustBeForward = desiredStrand == SAME_STRAND_FORWARD;
	_mustBeReverse = desiredStrand == SAME_STRAND_REVERSE;

	// presumably returns a record previously set aside with addToStorage(),
	// or NULL if there is none -- confirm against its definition.
	Record *startRecord = tryToTakeFromStorage();

	// if we couldn't use a previously stored record for starters,
	//then begin with a new one that matches strand criteria.
	while (startRecord == NULL) {
		startRecord = allocateAndGetNextRecord();
		if (startRecord == NULL) { //hit EOF!!
			return false;
		}

		// Wrong-strand records are put into storage, not deleted.
		if (_mustBeForward && !startRecord->getStrand()) {
			//record is reverse, wanted forward.
			addToStorage(startRecord);
			startRecord = NULL;
		} else if (_mustBeReverse && startRecord->getStrand()) {
			//record is forward, wanted reverse
			addToStorage(startRecord);
			startRecord = NULL;
		}
	}

	// OK!! We have a start record!
	// With SAME_STRAND_EITHER, the start record's strand decides what may follow.

	_mustBeForward = desiredStrand == SAME_STRAND_FORWARD || (desiredStrand == SAME_STRAND_EITHER && startRecord->getStrand());
	_mustBeReverse = desiredStrand == SAME_STRAND_REVERSE || (desiredStrand == SAME_STRAND_EITHER && !startRecord->getStrand());

	const QuickString &currChrom = startRecord->getChrName();
	_foundChroms.insert(currChrom);

	bool madeComposite = false;
	recList.push_back(startRecord);
	recList.setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more.

	bool currStrand = startRecord->getStrand();
	bool mustMatchStrand = desiredStrand != ANY_STRAND;

	// Running end position of the merged interval.
	int currEnd = startRecord->getEndPos();
	//now look for more records to merge with this one.
	//stop when they're out of range, not on the same chromosome, or we hit EOF.
	//ignore if they don't comply with strand.
	// nextRecord is reset to NULL at the end of every iteration that should
	// keep scanning; the loop is only left through one of the breaks (or exit).
	Record *nextRecord = NULL;
	while (nextRecord == NULL) {
		bool takenFromStorage = false;
		// Try storage first, then fall back to the file.
		nextRecord = mustMatchStrand ? tryToTakeFromStorage(currStrand) : tryToTakeFromStorage();
		if (nextRecord == NULL) {
			nextRecord = allocateAndGetNextRecord();
		} else {
			takenFromStorage = true;
		}
		if (nextRecord == NULL) { // EOF hit
			break;
		}
		const QuickString &newChrom = nextRecord->getChrName();
		if (newChrom != currChrom) { //hit a different chromosome.
			if (_foundChroms.find(newChrom) == _foundChroms.end() || takenFromStorage) {
				//haven't seen this chromosome before (or the record came from storage).
				// Keep it for a later call and stop scanning.
				addToStorage(nextRecord);
				break;
			} else {
				//different chromosome, read from the file, and we've already seen this chrom. File is not sorted.
				fprintf(stderr, "ERROR: Input file %s is not sorted by chromosome, startPos.\n", _context->getInputFileName(_contextFileIdx).c_str());
				deleteRecord(nextRecord);
				deleteMergedRecord(recList);
				exit(1);
			}
		}
		int nextStart = nextRecord->getStartPos();
		//is the record out of range?
		if (nextStart > currEnd + maxDistance) {
			//yes, it's out of range.
			// Keep it for a later call and stop scanning.
			addToStorage(nextRecord);
			break;
		}

		//ok, they're on the same chrom and in range. Are we happy with the strand?
		if (mustMatchStrand && nextRecord->getStrand() != currStrand) {
			//no, we're not.
			// Store it and keep scanning for further mergeable records.
			addToStorage(nextRecord);
			nextRecord = NULL;
			continue;
		}
		//everything's good! do a merge.
		recList.push_back(nextRecord);
		madeComposite = true;
		// Extend the merged interval only if this record reaches further.
		int nextEnd = nextRecord->getEndPos();
		if (nextEnd > currEnd) {
			currEnd = nextEnd;
		}
		nextRecord = NULL;
	}
	if (madeComposite) {
		// Build a new key: a copy of the start record stretched to the merged end.
		Record *newKey = _recordMgr->allocateRecord();
		(*newKey) = (*startRecord);
		newKey->setEndPos(currEnd);
		recList.setKey(newKey);
	}
	_totalMergedRecordLength += (unsigned long)(recList.getKey()->getEndPos() - recList.getKey()->getStartPos());
	return true;
}
// Subtracts the hits from the key (query) record of `hits` and rewrites
// `hits` to hold what should be reported.
//
// Outcomes:
//  - No hits: the key itself is pushed onto `hits` (reported unchanged).
//  - Remove-all mode with a subtract fraction of 0.0 and at least one hit:
//    _dontReport is set.
//  - No hit qualifies for subtraction: `hits` is reset to contain only the
//    key.
//  - Remove-all mode and at least one hit qualified: _dontReport is set.
//  - Remove-sum mode: _dontReport is set if the fraction of covered key
//    bases exceeds the subtract fraction; otherwise `hits` is reset to
//    contain only the key.
//  - Otherwise: `hits` is replaced by one temporary record per maximal run
//    of uncovered key bases, and _deleteTmpBlocks is set.
//
// A hit qualifies when remove-sum mode is on, or when the fraction of the
// key that it covers is at least the subtract fraction.
void SubtractFile::subtractHits(RecordKeyVector &hits) {
	if (hits.empty()) {
		// no intersection, nothing to subtract.
		// just copy key to hits as if it were a
		// self-intersection. This is just for reporting
		// purposes.
		hits.push_back(hits.getKey());
		return;
	}

	if (upCast(_context)->getRemoveAll() && upCast(_context)->getSubtractFraction() == 0.0) {
		// hits aren't empty, meaning there is intersection,
		// so we want to not report the hit.
		_dontReport = true;
		return;
	}

	//loop through hits. Track which bases in query were covered
	Record *keyRec = hits.getKey();
	int keyStart = keyRec->getStartPos();
	int keyEnd = keyRec->getEndPos();
	int keyLen = keyEnd - keyStart;

	//this vector of bools will represent the bases of the query.
	//for each base, true means uncovered, false means covered.
	//they begin as all uncovered.
	vector<bool> keyBases(keyLen, true);

	//now loop through the hits, and cover corresponding query bases
	//by setting them to false.
	bool basesRemoved = false;
	for (RecordKeyVector::iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
		Record *hitRec = *iter;
		int hitStart = hitRec->getStartPos();
		int hitEnd = hitRec->getEndPos();

		// Overlap of the hit with the key, as offsets into keyBases.
		int startIdx = max(keyStart, hitStart) - keyStart;
		int endIdx = min(keyEnd, hitEnd) - keyStart;

		int coveredLen = endIdx - startIdx;
		float coveragePct = (float)coveredLen / (float)keyLen;
		//for each base in the hit, set the base in the query to false.
		//this effectively "erases" the covered bits. Only do so when the
		//hit qualifies (remove-sum mode, or enough of the key is covered).
		if (upCast(_context)->getRemoveSum() || coveragePct >= upCast(_context)->getSubtractFraction()) {
			std::fill(keyBases.begin() + startIdx, keyBases.begin() + endIdx, false);
			basesRemoved = true;
		}
	}

	if (!basesRemoved) {
		//treat as if there were no intersection
		hits.clearVector();
		hits.push_back(hits.getKey());
		return;
	} else if (upCast(_context)->getRemoveAll()) {
		_dontReport = true;
		return;
	}
	// if the -N option is used ( removeSum), do not report if the percentage of
	// uniquely covered bases exceeds the overlap fraction.
	if (upCast(_context)->getRemoveSum()) {
		//determine how many bases are left uncovered.
		int numBasesUncovered = std::accumulate(keyBases.begin(), keyBases.end(), 0);
		//determine percentage that are covered.
		float pctCovered = 1.0 - (float)numBasesUncovered / (float)keyLen;
		if (pctCovered > upCast(_context)->getSubtractFraction()) {
			_dontReport = true;
			return;
		} else {
			hits.clearVector();
			hits.push_back(hits.getKey());
		}
		return;
	}

	//now make "blocks" out of the query's remaining stretches of
	//uncovered bases.
	hits.clearVector();
	const int numBases = (int)keyBases.size();
	for (int i = 0; i < numBases; i++) {
		if (keyBases[i]) {
			int blockStart = keyStart + i;
			// The bounds check must come before the element read: when an
			// uncovered run extends to the end of the key, i reaches
			// numBases and keyBases[i] would be an out-of-range access.
			while (i < numBases && keyBases[i]) {
				i++;
			}
			int blockEnd = min(keyStart + i, keyEnd);
			hits.push_back(_tmpBlocksMgr->allocateAndAssignRecord(keyRec, blockStart, blockEnd));
		}
	}
	_deleteTmpBlocks = true;

}