// Computes the FIRST set of an arbitrary string of grammar symbols
// str = Y1 Y2 ... Yn, using the per-symbol sets already stored in `first`.
//
// The empty string "" stands for epsilon. The symbol "$" is not looked up
// in `first`; its FIRST set is taken to be { "$" }.
//
// Returns the collected set as a vector<string>. Epsilon is included only
// when the input is the single symbol "" or when the scan runs off the end
// of str (which includes the case of an empty str).
vector<string> CFG::computeFirst(const vector<string>& str)
{
    vector<string> result;

    // A string that is just epsilon has FIRST = { epsilon }.
    if(str.size() == 1 && str[0] == "")
    {
        result = {""};
        return result;
    }

    size_t pos = 0;
    while(pos < str.size())
    {
        vector<string> symbolFirst;
        if(str[pos] == "$")
        {
            symbolFirst = {"$"};
        }
        else
        {
            // Work on a copy so the stored FIRST set is left untouched.
            symbolFirst = first[getFirstIndex(str[pos])].second;
        }

        // Strip epsilon before merging. del()'s result is used as the
        // "this symbol can derive epsilon" flag (presumably true when the
        // element was present and removed -- confirm against del()).
        const bool nullable = del(symbolFirst, string(""));
        add(result, symbolFirst);

        if(!nullable)
        {
            break;
        }
        ++pos;
    }

    // Every symbol was nullable (or str was empty): epsilon belongs too.
    if(pos == str.size())
    {
        add(result, {""});
    }

    return result;
}
// When a record is read, check if it is a duplicate or // store for future checking. void Dedup_LowMem::checkDups(SamRecord& record, uint32_t recordCount) { // Only inside this method if the record is mapped. // Get the key for this record. static DupKey key; key.initKey(record, getLibraryID(record)); int flag = record.getFlag(); bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag); int sumBaseQual = getBaseQuality(record); int32_t chromID = record.getReferenceID(); int32_t mateChromID = record.getMateReferenceID(); // If we are one-chrom and the mate is not on the same chromosome, // mark it as not paired. if(myOneChrom && (chromID != mateChromID)) { recordPaired = false; } // Look in the fragment map to see if an entry for this key exists. FragmentMapInsertReturn ireturn = myFragmentMap.insert(std::make_pair(key, FragData())); FragData* fragData = &(ireturn.first->second); // Enter the new record in the fragData if (any of the below): // 1) there is no previous entry for this key (ireturn.second == true) // or // 2) the previous entry is not paired // AND // a) the new record is paired // or // b) the new record has higher quality if((ireturn.second == true) || ((fragData->paired == false) && (recordPaired || (sumBaseQual > fragData->sumBaseQual)))) { // Check if this is a new key. if(ireturn.second == true) { // New entry, so build the recalibration table now. if(myDoRecab) { myRecab.processReadBuildTable(record); } } else if(fragData->paired == false) { // There was a previous record and it is not paired, // so mark it as a duplicate. // Duplicate checking/marking for pairs is handled below. handleDuplicate(fragData->recordIndex); } // Store this record for later duplicate checking. fragData->sumBaseQual = sumBaseQual; fragData->recordIndex = recordCount; fragData->paired = recordPaired; } else { // Leave the old record in fragData. // If the new record is not paired, handle it as a duplicate. 
if(recordPaired == false) { // This record is a duplicate, so mark it and release it. handleDuplicate(recordCount); } } // Only paired processing is left, so return if not paired. if(recordPaired == false) { // Not paired, no more operations required, so return. return; } // This is a paired record, so check for its mate. uint64_t readPos = SamHelper::combineChromPos(chromID, record.get0BasedPosition()); uint64_t matePos = SamHelper::combineChromPos(mateChromID, record.get0BasedMatePosition()); int mateIndex = -1; MateData* mateData = NULL; // Check to see if the mate is prior to this record. if(matePos <= readPos) { // The mate map is stored by the mate position, so look for this // record's position. // The mate should be in the mate map, so find it. std::pair<MateMap::iterator,MateMap::iterator> matches = myMateMap.equal_range(readPos); // Loop through the elements that matched the pos looking for the mate. for(MateMap::iterator iter = matches.first; iter != matches.second; iter++) { if(strcmp((*iter).second.readName.c_str(), record.getReadName()) == 0) { // Found the match. mateData = &((*iter).second); // Update the quality and track the mate record and index. sumBaseQual += mateData->sumBaseQual; mateIndex = mateData->recordIndex; // Remove the entry from the map. myMateMap.erase(iter); break; } } } if(mateData == NULL) { if(matePos >= readPos) { // Haven't gotten to the mate yet, so store this record. MateMap::iterator mateIter = myMateMap.insert(std::make_pair(matePos, MateData())); mateIter->second.sumBaseQual = sumBaseQual; mateIter->second.recordIndex = recordCount; mateIter->second.key.copy(key); mateIter->second.readName = record.getReadName(); } else { // Passed the mate, but it was not found. handleMissingMate(record.getReferenceID(), record.getMateReferenceID()); } return; } // Make the paired key. PairedKey pkey(key, mateData->key); // Check to see if this pair is a duplicate. 
PairedMapInsertReturn pairedReturn = myPairedMap.insert(std::make_pair(pkey,PairedData())); PairedData* storedPair = &(pairedReturn.first->second); // Get the index for "record 1" - the one with the earlier coordinate. int record1Index = getFirstIndex(key, recordCount, mateData->key, mateIndex); // Check if we have already found a duplicate pair. // If there is no duplicate found, there is nothing more to do. if(pairedReturn.second == false) { // Duplicate found. bool keepStored = true; if(pairedReturn.first->second.sumBaseQual < sumBaseQual) { // The new pair has higher quality, so keep that. keepStored = false; } else if(pairedReturn.first->second.sumBaseQual == sumBaseQual) { // Same quality, so keep the one with the earlier record1Index. if(record1Index < storedPair->record1Index) { // The new pair has an earlier lower coordinate read, // so keep that. keepStored = false; } } // Check to see which one should be kept by checking qualities. if(keepStored) { // The old pair had higher quality so mark the new pair as a // duplicate and release them. handleDuplicate(mateIndex); handleDuplicate(recordCount); } else { // The new pair has higher quality, so keep that. // First mark the previous one as duplicates and release them. handleDuplicate(storedPair->record1Index); handleDuplicate(storedPair->record2Index); // Store this pair's information. if(record1Index == mateIndex) { // Mate has a lower coordinate, so make mate // record1. storedPair->sumBaseQual = sumBaseQual; storedPair->record1Index = mateIndex; storedPair->record2Index = recordCount; } else { // This record has a lower coordinate, so make it // record1. storedPair->sumBaseQual = sumBaseQual; storedPair->record1Index = recordCount; storedPair->record2Index = mateIndex; } } } else { // Store this pair's information. storedPair->sumBaseQual = sumBaseQual; if(record1Index == mateIndex) { // Mate has a lower coordinate, so make mate // record1. 
storedPair->record1Index = mateIndex; storedPair->record2Index = recordCount; } else { // This record has a lower coordinate, so make it // record1. storedPair->record1Index = recordCount; storedPair->record2Index = mateIndex; } } }
void CFG::computeFirst() // elements are terminals // for all grammar symbols { first.clear(); for(size_t i = 0; i < t.size(); ++i) { first.push_back(make_pair(t[i], vector<string>({t[i]}))); } for(size_t i = 0; i < v.size(); ++i) { first.push_back(make_pair(v[i], vector<string>())); } bool updated = false; do { updated = false; for(size_t i = 0; i < v.size(); ++i) { int xFirstIndex = getFirstIndex(v[i]); vector<string> firstX = first[xFirstIndex].second; for(size_t j = 0; j < p.size(); ++j) { if(p[j].left == v[i]) // X -> Y1 ... { if(p[j].right.size() == 1 && p[j].right[0] == "") { if(add(firstX, {""})) { updated = true; } } else { size_t k = 0; for(; k < p[j].right.size(); ) { vector<string> firstY = first[getFirstIndex(p[j].right[k])].second; del(firstY, string("")); if(add(firstX, firstY)) { updated = true; } if(in(string(""), firstY)) { ++k; } else { break; } } if(k == p[j].right.size()) { if(add(firstX, {""})); } } } } first[xFirstIndex].second = firstX; } }while(updated); }
// When a record is read, check if it is a duplicate or // store for future checking. void Dedup::checkDups(SamRecord& record, uint32_t recordCount) { // Only inside this method if the record is mapped. // Get the key for this record. static DupKey key; static DupKey mateKey; key.updateKey(record, getLibraryID(record)); int flag = record.getFlag(); bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag); int sumBaseQual = getBaseQuality(record); int32_t chromID = record.getReferenceID(); int32_t mateChromID = record.getMateReferenceID(); // If we are one-chrom and the mate is not on the same chromosome, // mark it as not paired. if(myOneChrom && (chromID != mateChromID)) { recordPaired = false; } // Look in the map to see if an entry for this key exists. FragmentMapInsertReturn ireturn = myFragmentMap.insert(std::make_pair(key, ReadData())); ReadData* readData = &(ireturn.first->second); // Mark this record's data in the fragment record if this is the first // entry or if it is a duplicate and the old record is not paired and // the new record is paired or the has a higher quality. if((ireturn.second == true) || ((readData->paired == false) && (recordPaired || (sumBaseQual > readData->sumBaseQual)))) { // If there was a previous record, mark it duplicate and release // the old record if(ireturn.second == false) { // Mark the old record as a DUPLICATE! handleDuplicate(readData->recordIndex, readData->recordPtr); } // Store this record for later duplicate checking. readData->sumBaseQual = sumBaseQual; readData->recordIndex = recordCount; readData->paired = recordPaired; if(recordPaired) { readData->recordPtr = NULL; } else { readData->recordPtr = &record; } } else { // The old record is not a duplicate so the new record is // a duplicate if it is not paired. if(recordPaired == false) { // This record is a duplicate, so mark it and release it. handleDuplicate(recordCount, &record); } } // Only paired processing is left, so return if not paired. 
if(recordPaired == false) { // Not paired, no more operations required, so return. return; } // This is a paired record, so check for its mate. uint64_t readPos = SamHelper::combineChromPos(chromID, record.get0BasedPosition()); uint64_t matePos = SamHelper::combineChromPos(mateChromID, record.get0BasedMatePosition()); SamRecord* mateRecord = NULL; int mateIndex = 0; // Check to see if the mate is prior to this record. if(matePos <= readPos) { // The mate map is stored by the mate position, so look for this // record's position. // The mate should be in the mate map, so find it. std::pair<MateMap::iterator,MateMap::iterator> matches = myMateMap.equal_range(readPos); // Loop through the elements that matched the pos looking for the mate. for(MateMap::iterator iter = matches.first; iter != matches.second; iter++) { if(strcmp((*iter).second.recordPtr->getReadName(), record.getReadName()) == 0) { // Found the match. ReadData* mateData = &((*iter).second); // Update the quality and track the mate record and index. sumBaseQual += mateData->sumBaseQual; mateIndex = mateData->recordIndex; mateRecord = mateData->recordPtr; // Remove the entry from the map. myMateMap.erase(iter); break; } } } if((mateRecord == NULL) && (matePos >= readPos)) { // Haven't gotten to the mate yet, so store this record. MateMap::iterator mateIter = myMateMap.insert(std::make_pair(matePos, ReadData())); mateIter->second.sumBaseQual = sumBaseQual; mateIter->second.recordPtr = &record; mateIter->second.recordIndex = recordCount; // No more processing for this record is necessary. return; } if(mateRecord == NULL) { // Passed the mate, but it was not found. handleMissingMate(&record); return; } // Make the paired key. mateKey.updateKey(*mateRecord, getLibraryID(*mateRecord)); PairedKey pkey(key, mateKey); // Check to see if this pair is a duplicate. 
PairedMapInsertReturn pairedReturn = myPairedMap.insert(std::make_pair(pkey,PairedData())); PairedData* storedPair = &(pairedReturn.first->second); // Get the index for "record 1" - the one with the earlier coordinate. int record1Index = getFirstIndex(key, recordCount, mateKey, mateIndex); // Check if we have already found a duplicate pair. // If there is no duplicate found, there is nothing more to do. if(pairedReturn.second == false) { // Duplicate found. bool keepStored = true; if(pairedReturn.first->second.sumBaseQual < sumBaseQual) { // The new pair has higher quality, so keep that. keepStored = false; } else if(pairedReturn.first->second.sumBaseQual == sumBaseQual) { // Same quality, so keep the one with the earlier record1Index. if(record1Index < storedPair->record1Index) { // The new pair has an earlier lower coordinate read, // so keep that. keepStored = false; } } // Check to see which one should be kept by checking qualities. if(keepStored) { // The old pair had higher quality so mark the new pair as a // duplicate and release them. handleDuplicate(mateIndex, mateRecord); handleDuplicate(recordCount, &record); } else { // The new pair has higher quality, so keep that. // First mark the previous one as duplicates and release them. handleDuplicate(storedPair->record1Index, storedPair->record1Ptr); handleDuplicate(storedPair->record2Index, storedPair->record2Ptr); // Store this pair's information. if(record1Index == mateIndex) { // Mate has a lower coordinate, so make mate // record1. storedPair->sumBaseQual = sumBaseQual; storedPair->record1Ptr = mateRecord; storedPair->record2Ptr = &record; storedPair->record1Index = mateIndex; storedPair->record2Index = recordCount; } else { // This record has a lower coordinate, so make it // record1. 
storedPair->sumBaseQual = sumBaseQual; storedPair->record1Ptr = &record; storedPair->record2Ptr = mateRecord; storedPair->record1Index = recordCount; storedPair->record2Index = mateIndex; } } } else { // Store this pair's information. storedPair->sumBaseQual = sumBaseQual; if(record1Index == mateIndex) { // Mate has a lower coordinate, so make mate // record1. storedPair->record1Ptr = mateRecord; storedPair->record2Ptr = &record; storedPair->record1Index = mateIndex; storedPair->record2Index = recordCount; } else { // This record has a lower coordinate, so make it // record1. storedPair->record1Ptr = &record; storedPair->record2Ptr = mateRecord; storedPair->record1Index = recordCount; storedPair->record2Index = mateIndex; } } }