/**
 * Reads the next entry from the underlying file reader and returns it as a
 * freshly allocated Record.
 *
 * Returns NULL when the reader is not open, when no further entry can be
 * read, or when the record cannot be initialized from the reader (in that
 * last case the allocated record is handed back to the record manager).
 *
 * The first time an entry is read from a file that has a header, the header
 * is stored in the context for this file's index.
 *
 * Records for which both the read and its mate are unmapped skip coordinate
 * validation and the sort-order test, but are still returned. For all other
 * records, invalid coordinates are fatal: an error is printed and the
 * process exits with status 1.
 */
Record *FileRecordMgr::allocateAndGetNextRecord()
{
    if (!_fileReader->isOpen() || !_fileReader->readEntry()) {
        return NULL;
    }

    // Capture the header once, on the first successfully read entry.
    if (!_headerSet && _fileReader->hasHeader()) {
        _context->setHeader(_contextFileIdx, _fileReader->getHeader());
        _headerSet = true;
    }

    Record *rec = _recordMgr->allocateRecord();
    if (!rec->initFromFile(_fileReader)) {
        _recordMgr->deleteRecord(rec);
        return NULL;
    }

    // Only records with at least one mapped end are validated.
    const bool bothEndsUnmapped = rec->isUnmapped() && rec->isMateUnmapped();
    if (!bothEndsUnmapped) {
        if (!rec->coordsValid()) {
            cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl
                 << *rec << endl;
            exit(1);
        }

        // Test for sorted order, if necessary.
        if (_context->getSortedInput()) {
            testInputSortOrder(rec);
        }
    }

    assignChromId(rec);
    _totalRecordLength += static_cast<unsigned long>(rec->getEndPos() - rec->getStartPos());
    return rec;
}
/**
 * Accumulates the number of bases shared between the key record of recList
 * and each of its hits.
 *
 * Side effects:
 *  - _overlapCounts and _numIntersections are each increased by the number
 *    of hits in recList.
 *  - _qsizes receives the key's length followed by the clipped overlap
 *    length of every hit (each narrowed to int).
 *
 * When the context is set to obey splits, the per-hit overlap is taken from
 * the split block info (indexed by hit position) instead of from the
 * clipped interval length.
 *
 * @param recList key record plus the hits that overlap it.
 * @return total overlapping bases across all hits.
 */
unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
{
    Record *queryRec = recList.getKey();
    CHRPOS queryStart = queryRec->getStartPos();
    CHRPOS queryEnd = queryRec->getEndPos();

    _overlapCounts += recList.size();

    // Note that sizes are truncated to what fits in an int (about 2.1GB).
    _qsizes.push_back(static_cast<int>(queryEnd - queryStart));

    unsigned long totalOverlap = 0;
    int splitIdx = 0;

    RecordKeyVector::iterator_type hit = recList.begin();
    while (hit != recList.end()) {
        // Clip the hit to the bounds of the query.
        CHRPOS clippedStart = max((*hit)->getStartPos(), queryStart);
        CHRPOS clippedEnd = min((*hit)->getEndPos(), queryEnd);
        _qsizes.push_back(static_cast<int>(clippedEnd - clippedStart));

        if (!_context->getObeySplits()) {
            totalOverlap += static_cast<unsigned long>(clippedEnd - clippedStart);
        } else {
            totalOverlap += upCast(_context)->getSplitBlockInfo()->getOverlapBases(splitIdx);
            splitIdx++;
        }

        hit = recList.next();
    }

    _numIntersections += static_cast<int>(recList.size());
    return totalOverlap;
}
/**
 * Accumulates the number of bases shared between the key record of hits and
 * each record in hits.
 *
 * When the context is set to obey splits, the per-hit overlap is taken from
 * the split block info (indexed by hit position); otherwise it is the length
 * of the hit clipped to the key's bounds.
 *
 * Side effect: _numIntersections is increased by the number of hits.
 *
 * @param hits key record plus the records that overlap it.
 * @return total overlapping bases across all hits.
 */
unsigned long Jaccard::getTotalIntersection(RecordKeyVector &hits)
{
    Record *queryRec = hits.getKey();
    CHRPOS queryStart = queryRec->getStartPos();
    CHRPOS queryEnd = queryRec->getEndPos();

    unsigned long totalOverlap = 0;
    int splitIdx = 0;

    RecordKeyVector::iterator_type pos = hits.begin();
    while (pos != hits.end()) {
        Record *hitRec = *pos;

        // Clip the hit to the bounds of the query.
        CHRPOS clippedStart = max(hitRec->getStartPos(), queryStart);
        CHRPOS clippedEnd = min(hitRec->getEndPos(), queryEnd);

        if (!_context->getObeySplits()) {
            totalOverlap += static_cast<unsigned long>(clippedEnd - clippedStart);
        } else {
            totalOverlap += upCast(_context)->getSplitBlockInfo()->getOverlapBases(splitIdx);
            splitIdx++;
        }

        pos = hits.next();
    }

    _numIntersections += static_cast<int>(hits.size());
    return totalOverlap;
}
bool NewChromSweep::nextRecord(bool query, int dbIdx) { if (query) { _currQueryRec = _queryFRM->getNextRecord(); if (_currQueryRec != NULL) { _queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos()); _queryTotalRecords++; return true; } return false; } else { //database Record *rec = _dbFRMs[dbIdx]->getNextRecord(); _currDbRecs[dbIdx] = rec; if (rec != NULL) { _databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos()); _databaseTotalRecords++; return true; } return false; } }
/**
 * Reads the next entry from the underlying file reader and returns it as a
 * freshly allocated Record.
 *
 * Returns NULL when the reader is not open, when no further entry can be
 * read, or when the record cannot be initialized from the reader (in that
 * last case the allocated record is handed back to the record manager).
 *
 * Unmapped records skip coordinate validation and the sort-order test but
 * are still returned, so that options such as -v (noHit) still see them.
 * For mapped records, invalid coordinates are fatal: an error is printed and
 * the process exits with status 1.
 *
 * @param keyList optional list; when non-NULL its key is set to the record
 *                being returned.
 * @return the new record, or NULL as described above.
 */
Record *FileRecordMgr::getNextRecord(RecordKeyList *keyList)
{
    if (!_fileReader->isOpen() || !_fileReader->readEntry()) {
        return NULL;
    }

    Record *rec = _recordMgr->allocateRecord();
    if (!rec->initFromFile(_fileReader)) {
        _recordMgr->deleteRecord(rec);
        return NULL;
    }

    const bool mapped = !rec->isUnmapped();
    if (mapped) {
        if (!rec->coordsValid()) {
            cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl
                 << *rec << endl;
            exit(1);
        }

        // Test for sorted order, if necessary.
        if (_isSortedInput) {
            testInputSortOrder(rec);
        }
    }

    assignChromId(rec);
    _totalRecordLength += static_cast<unsigned long>(rec->getEndPos() - rec->getStartPos());

    if (keyList != NULL) {
        keyList->setKey(rec);
    }
    return rec;
}
/**
 * Returns the next merged record: a start record extended over every
 * following record on the same chromosome that begins within _maxDistance
 * of the running end position and satisfies the strand requirement.
 *
 * If recList is non-NULL it is first emptied (via deleteMergedRecord when it
 * is not already clear), then filled with the start record and every record
 * merged into it; its key is set to the record that is returned.
 *
 * Records read past the end of the merge (different chromosome or out of
 * range) are put into storage for a later call, unless they have an unknown
 * strand during a stranded merge, in which case they are deleted.
 *
 * NOTE(review): when recList is NULL, merged-in records are neither stored
 * nor deleted here -- confirm whether callers rely on that.
 *
 * @param recList optional output list of the records that were merged.
 * @return the start record itself when nothing was merged, a newly allocated
 *         copy of it with an extended end position when something was, or
 *         NULL once the input is exhausted.
 */
Record *FileRecordMergeMgr::getNextRecord(RecordKeyVector *recList)
{
    // Clear the recList if there is one, and if it has records in it.
    if (recList != NULL && !recList->allClear()) {
        deleteMergedRecord(*recList);
    }

    _mustBeForward = (_desiredStrand == SAME_STRAND_FORWARD);
    _mustBeReverse = (_desiredStrand == SAME_STRAND_REVERSE);

    // Prefer a previously stored record; otherwise read until one matches
    // the strand criteria.
    Record *first = tryToTakeFromStorage();
    while (first == NULL) {
        first = FileRecordMgr::getNextRecord();
        if (first == NULL) {
            // Hit EOF.
            return NULL;
        }

        // Reject a record on the wrong strand, or one with an unknown strand
        // when the user asked for a stranded merge.
        const bool wrongStrand =
            (_mustBeForward && (first->getStrandVal() != Record::FORWARD)) ||
            (_mustBeReverse && (first->getStrandVal() != Record::REVERSE));
        if (wrongStrand ||
            (first->getStrandVal() == Record::UNKNOWN && _desiredStrand != ANY_STRAND)) {
            deleteRecord(first);
            first = NULL;
        }
    }

    // We have a start record. Re-evaluate the strand requirements for the
    // records that may be merged into it.
    _mustBeForward = _desiredStrand == SAME_STRAND_FORWARD ||
                     (_desiredStrand == SAME_STRAND_EITHER && (first->getStrandVal() == Record::FORWARD));
    _mustBeReverse = _desiredStrand == SAME_STRAND_REVERSE ||
                     (_desiredStrand == SAME_STRAND_EITHER && (first->getStrandVal() == Record::REVERSE));
    const bool keepOppositeStrand = (_desiredStrand == SAME_STRAND_EITHER);

    const QuickString &mergeChrom = first->getChrName();
    _foundChroms.insert(mergeChrom);

    if (recList != NULL) {
        recList->push_back(first);
        // The key stays the start record unless more records get merged.
        recList->setKey(first);
    }

    Record::strandType firstStrand = first->getStrandVal();
    const bool strandMatters = (_desiredStrand != ANY_STRAND);

    int mergedEnd = first->getEndPos();
    bool mergedAny = false;

    // Look for more records to merge with this one. Stop when they are out
    // of range, on another chromosome, or at EOF. Skip those that do not
    // comply with the strand requirement.
    for (;;) {
        bool fromStorage = false;
        Record *candidate = strandMatters ? tryToTakeFromStorage(firstStrand)
                                          : tryToTakeFromStorage();
        if (candidate == NULL) {
            candidate = FileRecordMgr::getNextRecord();
        } else {
            fromStorage = true;
        }
        if (candidate == NULL) {
            // EOF hit.
            break;
        }

        // A record with an unknown strand is discarded in a stranded merge,
        // but only after checking whether it ends the scan.
        const bool discardUnknown =
            (strandMatters && candidate->getStrandVal() == Record::UNKNOWN);

        // Stop scanning on a chromosome change (sort order is already
        // enforced by the base class) or when the candidate is out of range.
        if (candidate->getChrName() != mergeChrom ||
            candidate->getStartPos() > mergedEnd + _maxDistance) {
            if (discardUnknown) {
                deleteRecord(candidate);
            } else {
                addToStorage(candidate);
            }
            break;
        }

        // Same chromosome and in range: drop unknown-strand records but keep
        // scanning.
        if (discardUnknown) {
            deleteRecord(candidate);
            continue;
        }

        // If taken from the file and on the wrong strand, store or delete.
        if (!fromStorage &&
            ((_mustBeForward && (candidate->getStrandVal() != Record::FORWARD)) ||
             (_mustBeReverse && (candidate->getStrandVal() != Record::REVERSE)))) {
            if (keepOppositeStrand) {
                addToStorage(candidate);
            } else {
                deleteRecord(candidate);
            }
            continue;
        }

        // Same chromosome, in range, acceptable strand: merge it.
        if (recList != NULL) {
            recList->push_back(candidate);
        }
        mergedAny = true;
        int candidateEnd = candidate->getEndPos();
        if (candidateEnd > mergedEnd) {
            mergedEnd = candidateEnd;
        }
    }

    if (!mergedAny) {
        _totalMergedRecordLength += mergedEnd - first->getStartPos();
        return first;
    }

    // Build the composite key: a copy of the start record stretched to the
    // furthest end position seen.
    Record *compositeKey = _recordMgr->allocateRecord();
    (*compositeKey) = (*first);
    compositeKey->setEndPos(mergedEnd);
    if (recList != NULL) {
        recList->setKey(compositeKey);
    }
    _totalMergedRecordLength += mergedEnd - compositeKey->getStartPos();
    return compositeKey;
}
bool FileRecordMgr::allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand, int maxDistance) { if (!recList.allClear()) { deleteMergedRecord(recList); } _mustBeForward = desiredStrand == SAME_STRAND_FORWARD; _mustBeReverse = desiredStrand == SAME_STRAND_REVERSE; Record *startRecord = tryToTakeFromStorage(); // if we couldn't use a previously stored record for starters, //then begin with a new one that matches strand criteria. while (startRecord == NULL) { startRecord = allocateAndGetNextRecord(); if (startRecord == NULL) { //hit EOF!! return false; } if (_mustBeForward && !startRecord->getStrand()) { //record is reverse, wanted forward. addToStorage(startRecord); startRecord = NULL; } else if (_mustBeReverse && startRecord->getStrand()) { //record is forward, wanted reverse addToStorage(startRecord); startRecord = NULL; } } // OK!! We have a start record! _mustBeForward = desiredStrand == SAME_STRAND_FORWARD || (desiredStrand == SAME_STRAND_EITHER && startRecord->getStrand()); _mustBeReverse = desiredStrand == SAME_STRAND_REVERSE || (desiredStrand == SAME_STRAND_EITHER && !startRecord->getStrand()); const QuickString &currChrom = startRecord->getChrName(); _foundChroms.insert(currChrom); bool madeComposite = false; recList.push_back(startRecord); recList.setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more. bool currStrand = startRecord->getStrand(); bool mustMatchStrand = desiredStrand != ANY_STRAND; int currEnd = startRecord->getEndPos(); //now look for more records to merge with this one. //stop when they're out of range, not on the same chromosome, or we hit EOF. //ignore if they don't comply with strand. Record *nextRecord = NULL; while (nextRecord == NULL) { bool takenFromStorage = false; nextRecord = mustMatchStrand ? 
tryToTakeFromStorage(currStrand) : tryToTakeFromStorage(); if (nextRecord == NULL) { nextRecord = allocateAndGetNextRecord(); } else { takenFromStorage = true; } if (nextRecord == NULL) { // EOF hit break; } const QuickString &newChrom = nextRecord->getChrName(); if (newChrom != currChrom) { //hit a different chromosome. if (_foundChroms.find(newChrom) == _foundChroms.end() || takenFromStorage) { //haven't seen this chromosome before. addToStorage(nextRecord); break; } else { //different strand, but we've already seen this chrom. File is not sorted. fprintf(stderr, "ERROR: Input file %s is not sorted by chromosome, startPos.\n", _context->getInputFileName(_contextFileIdx).c_str()); deleteRecord(nextRecord); deleteMergedRecord(recList); exit(1); } } int nextStart = nextRecord->getStartPos(); //is the record out of range? if (nextStart > currEnd + maxDistance) { //yes, it's out of range. addToStorage(nextRecord); break; } //ok, they're on the same chrom and in range. Are we happy with the strand? if (mustMatchStrand && nextRecord->getStrand() != currStrand) { //no, we're not. addToStorage(nextRecord); nextRecord = NULL; continue; } //everything's good! do a merge. recList.push_back(nextRecord); madeComposite = true; int nextEnd = nextRecord->getEndPos(); if (nextEnd > currEnd) { currEnd = nextEnd; } nextRecord = NULL; } if (madeComposite) { Record *newKey = _recordMgr->allocateRecord(); (*newKey) = (*startRecord); newKey->setEndPos(currEnd); recList.setKey(newKey); } _totalMergedRecordLength += (unsigned long)(recList.getKey()->getEndPos() - recList.getKey()->getStartPos()); return true; }
/**
 * Replaces the contents of hits with what remains of the key (query) record
 * after removing the bases covered by the hits.
 *
 * Outcomes, in the order they are checked:
 *  - No hits: the key itself is pushed into hits (reported as if it were a
 *    self-intersection) and the function returns.
 *  - Remove-all mode with a subtract fraction of 0.0: _dontReport is set.
 *  - No hit qualified for removal: hits is reset to contain just the key.
 *  - Remove-all mode and some hit qualified: _dontReport is set.
 *  - Remove-sum mode: _dontReport is set if the fraction of the key covered
 *    by all hits together exceeds the subtract fraction; otherwise hits is
 *    reset to contain just the key.
 *  - Otherwise: hits is replaced with newly allocated block records, one per
 *    contiguous run of uncovered bases, and _deleteTmpBlocks is set.
 *
 * @param hits in/out: on entry the key and its overlapping records; on exit
 *             the records to report, as described above.
 */
void SubtractFile::subtractHits(RecordKeyVector &hits)
{
    if (hits.empty()) {
        // No intersection, nothing to subtract. Just copy the key to hits as
        // if it were a self-intersection. This is just for reporting purposes.
        hits.push_back(hits.getKey());
        return;
    }

    if (upCast(_context)->getRemoveAll() && upCast(_context)->getSubtractFraction() == 0.0) {
        // Hits aren't empty, meaning there is intersection, so we do not
        // want to report the key.
        _dontReport = true;
        return;
    }

    // Loop through the hits, tracking which bases in the query were covered.
    Record *keyRec = hits.getKey();
    int keyStart = keyRec->getStartPos();
    int keyEnd = keyRec->getEndPos();
    const int keyLen = keyEnd - keyStart;

    // This vector of bools represents the bases of the query. For each base,
    // true means uncovered and false means covered. They begin all uncovered.
    vector<bool> keyBases(keyLen, true);

    // Cover the corresponding query bases for each hit by setting them to false.
    bool basesRemoved = false;
    for (RecordKeyVector::iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
        Record *hitRec = *iter;
        int hitStart = hitRec->getStartPos();
        int hitEnd = hitRec->getEndPos();

        // Offsets, relative to keyStart, of the part of the hit inside the key.
        int startIdx = max(keyStart, hitStart) - keyStart;
        int endIdx = min(keyEnd, hitEnd) - keyStart;

        int coveredLen = endIdx - startIdx;
        float coveragePct = (float)coveredLen / (float)keyLen;

        // For each base in the hit, set the base in the query to false. This
        // effectively "erases" the covered bits. Only do so in remove-sum
        // mode, or when this hit alone covers at least the subtract fraction
        // of the query.
        if (upCast(_context)->getRemoveSum() || coveragePct >= upCast(_context)->getSubtractFraction()) {
            std::fill(keyBases.begin() + startIdx, keyBases.begin() + endIdx, false);
            basesRemoved = true;
        }
    }

    if (!basesRemoved) {
        // Treat as if there were no intersection.
        hits.clearVector();
        hits.push_back(hits.getKey());
        return;
    } else if (upCast(_context)->getRemoveAll()) {
        _dontReport = true;
        return;
    }

    // If the -N option is used (removeSum), do not report if the percentage
    // of uniquely covered bases exceeds the overlap fraction.
    if (upCast(_context)->getRemoveSum()) {
        // Determine how many bases are left uncovered (each true counts as 1).
        int numBasesUncovered = std::accumulate(keyBases.begin(), keyBases.end(), 0);
        // Determine the fraction that is covered.
        float pctCovered = 1.0 - (float)numBasesUncovered / (float)(keyEnd - keyStart);
        if (pctCovered > upCast(_context)->getSubtractFraction()) {
            _dontReport = true;
            return;
        } else {
            hits.clearVector();
            hits.push_back(hits.getKey());
        }
        return;
    }

    // Now make "blocks" out of the query's remaining stretches of uncovered
    // bases.
    hits.clearVector();
    const int numBases = (int)keyBases.size();
    for (int i = 0; i < numBases; i++) {
        if (keyBases[i] == true) {
            int blockStart = keyStart + i;
            // Advance to the end of this uncovered run. The bound is checked
            // before indexing so that a run reaching the end of the query
            // never reads past the end of keyBases.
            while (i < numBases && keyBases[i] == true) {
                i++;
            }
            int blockEnd = min(keyStart + i, keyEnd);
            hits.push_back(_tmpBlocksMgr->allocateAndAssignRecord(keyRec, blockStart, blockEnd));
        }
    }
    _deleteTmpBlocks = true;
}