Ejemplo n.º 1
0
vector<string> CFG::computeFirst(const vector<string>& str)
// FIRST for any string of grammar symbols, str = Y1 Y2 ... Yn.
//
// Uses the per-symbol table `first`, which this function only reads.
//
// str     : the symbol string; the single-element string {""} denotes epsilon.
// returns : the collected FIRST set; it contains "" when every symbol of str
//           can contribute "" (or str is empty / is the epsilon string).
//
// NOTE(review): del() is taken to report whether the element was present and
// add() to merge its second argument into the first -- confirm against their
// definitions, which are outside this view.
{
	vector<string> firstStr;

	// The epsilon string derives only epsilon.
	if(str.size() == 1 && str[0] == "")
	{
		firstStr = {""};
		return firstStr;
	}

	size_t i = 0;
	while(i < str.size())
	{
		vector<string> firstY;
		if(str[i] == "$")
		{
			// "$" is not looked up in the table; it stands for itself.
			firstY = {"$"};
		}
		else
		{
			// Copy the entry, since "" is stripped from it below.
			firstY = first[getFirstIndex(str[i])].second;
		}

		// Everything except "" from FIRST(Yi) goes into the result.
		bool epsilon = del(firstY, string(""));
		add(firstStr, firstY);

		// Only look at the next symbol if this one had "" in its set.
		if(!epsilon)
		{
			break;
		}
		++i;
	}

	// Ran past the last symbol: every Yi had "", so the whole string does too.
	if(i == str.size())
	{
		add(firstStr, {""});
	}

	return firstStr;
}
Ejemplo n.º 2
0
// When a record is read, check if it is a duplicate or
// store for future checking.
//
// record      - the record to examine (callers are expected to pass only
//               mapped records).
// recordCount - index of this record; it is what gets stored in the maps and
//               what is later handed to handleDuplicate().
//
// Fragment handling: the record's key is looked up in myFragmentMap and the
// record replaces the stored entry when the key is new, or when the stored
// entry is unpaired and this record is paired or has a higher base quality.
// Pair handling: the first read of a pair is parked in myMateMap under its
// mate's position; when the second read arrives the two are combined and
// checked against myPairedMap.
void Dedup_LowMem::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Get the key for this record.
    static DupKey key;
    key.initKey(record, getLibraryID(record));

    int flag = record.getFlag();
    bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int sumBaseQual = getBaseQuality(record);

    int32_t chromID = record.getReferenceID();
    int32_t mateChromID = record.getMateReferenceID();

    // If we are one-chrom and the mate is not on the same chromosome,
    // treat the record as not paired.
    if(myOneChrom && (chromID != mateChromID))
    {
        recordPaired = false;
    }

    // Insert-or-find the fragment entry for this key.
    FragmentMapInsertReturn ireturn =
        myFragmentMap.insert(std::make_pair(key, FragData()));

    FragData* fragData = &(ireturn.first->second);

    // Enter the new record in the fragData if (any of the below):
    // 1) there is no previous entry for this key (ireturn.second == true)
    // or
    // 2) the previous entry is not paired
    //    AND
    //     a) the new record is paired
    //     or
    //     b) the new record has higher quality
    if((ireturn.second == true) ||
            ((fragData->paired == false) &&
             (recordPaired || (sumBaseQual > fragData->sumBaseQual))))
    {
        if(ireturn.second == true)
        {
            // New entry, so build the recalibration table now.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(record);
            }
        }
        else if(fragData->paired == false)
        {
            // There was a previous record and it is not paired,
            // so mark it as a duplicate.
            // Duplicate checking/marking for pairs is handled below.
            handleDuplicate(fragData->recordIndex);
        }

        // Store this record for later duplicate checking.
        fragData->sumBaseQual = sumBaseQual;
        fragData->recordIndex = recordCount;
        fragData->paired = recordPaired;
    }
    else if(recordPaired == false)
    {
        // The old record stays in fragData; an unpaired new record is
        // therefore the duplicate.
        handleDuplicate(recordCount);
    }

    // Only paired processing is left, so return if not paired.
    if(recordPaired == false)
    {
        return;
    }

    // This is a paired record, so check for its mate.
    uint64_t readPos =
        SamHelper::combineChromPos(chromID,
                                   record.get0BasedPosition());
    uint64_t matePos =
        SamHelper::combineChromPos(mateChromID,
                                   record.get0BasedMatePosition());
    int mateIndex = -1;
    // Iterator to the mate's entry; end() means "not found".
    MateMap::iterator foundMate = myMateMap.end();

    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate map is keyed by the mate position, so a mate that was
        // already seen is stored under this record's position.
        std::pair<MateMap::iterator,MateMap::iterator> matches =
            myMateMap.equal_range(readPos);
        // Loop through the elements that matched the pos looking for the mate.
        for(MateMap::iterator iter = matches.first;
                iter != matches.second; ++iter)
        {
            if(strcmp(iter->second.readName.c_str(),
                      record.getReadName()) == 0)
            {
                // Found the match.
                foundMate = iter;
                // Update the quality and track the mate index.
                sumBaseQual += iter->second.sumBaseQual;
                mateIndex = iter->second.recordIndex;
                // The entry is NOT erased here: its key is still needed
                // below, and erasing would destroy it.
                break;
            }
        }
    }

    if(foundMate == myMateMap.end())
    {
        if(matePos >= readPos)
        {
            // Haven't gotten to the mate yet, so store this record.
            MateMap::iterator mateIter =
                myMateMap.insert(std::make_pair(matePos, MateData()));
            mateIter->second.sumBaseQual = sumBaseQual;
            mateIter->second.recordIndex = recordCount;
            mateIter->second.key.copy(key);
            mateIter->second.readName = record.getReadName();
        }
        else
        {
            // Passed the mate, but it was not found.
            handleMissingMate(record.getReferenceID(), record.getMateReferenceID());
        }
        return;
    }

    MateData& mateData = foundMate->second;

    // Make the paired key.
    PairedKey pkey(key, mateData.key);

    // Check to see if this pair is a duplicate.
    PairedMapInsertReturn pairedReturn =
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData* storedPair = &(pairedReturn.first->second);

    // Get the index for "record 1" - the one with the earlier coordinate.
    int record1Index = getFirstIndex(key, recordCount,
                                     mateData.key, mateIndex);

    // Last use of the mate's stored key is above, so the entry can now be
    // removed from the mate map without leaving a dangling reference.
    myMateMap.erase(foundMate);

    if(pairedReturn.second == false)
    {
        // A pair with this key is already stored: decide which one to keep.
        bool keepStored = true;
        if(storedPair->sumBaseQual < sumBaseQual)
        {
            // The new pair has higher quality, so keep that.
            keepStored = false;
        }
        else if(storedPair->sumBaseQual == sumBaseQual)
        {
            // Same quality, so keep the one with the earlier record1Index.
            if(record1Index < storedPair->record1Index)
            {
                keepStored = false;
            }
        }

        if(keepStored)
        {
            // The stored pair wins, so the new pair is the duplicate.
            handleDuplicate(mateIndex);
            handleDuplicate(recordCount);
            return;
        }

        // The new pair wins, so the previously stored pair is the duplicate.
        handleDuplicate(storedPair->record1Index);
        handleDuplicate(storedPair->record2Index);
    }

    // Store this pair's information (new key, or replacement of the old pair).
    storedPair->sumBaseQual = sumBaseQual;
    if(record1Index == mateIndex)
    {
        // Mate has a lower coordinate, so make mate record1.
        storedPair->record1Index = mateIndex;
        storedPair->record2Index = recordCount;
    }
    else
    {
        // This record has a lower coordinate, so make it record1.
        storedPair->record1Index = recordCount;
        storedPair->record2Index = mateIndex;
    }
}
Ejemplo n.º 3
0
void CFG::computeFirst()
// FIRST for all grammar symbols.
//
// Rebuilds the member table `first`: one (symbol, set) entry per element of
// `t` followed by one per element of `v`.  An element of `t` gets the set
// containing just itself; the sets for elements of `v` start empty and are
// grown from the productions in `p` until a full pass changes nothing.
//
// NOTE(review): del() is taken to report whether the element was present,
// as the computeFirst(str) overload relies on -- confirm against its
// definition, which is outside this view.
{
	first.clear();

	// elements are terminals: FIRST(a) = {a}
	for(size_t i = 0; i < t.size(); ++i)
	{
		first.push_back(make_pair(t[i], vector<string>({t[i]})));
	}

	for(size_t i = 0; i < v.size(); ++i)
	{
		first.push_back(make_pair(v[i], vector<string>()));
	}

	bool updated = false;
	do
	{
		updated = false;

		for(size_t i = 0; i < v.size(); ++i)
		{
			int xFirstIndex = getFirstIndex(v[i]);
			// Work on a copy and write it back after all productions of X.
			vector<string> firstX = first[xFirstIndex].second;

			for(size_t j = 0; j < p.size(); ++j)
			{
				if(p[j].left != v[i])
				{
					continue;
				}

				// X -> Y1 ...
				if(p[j].right.size() == 1 && p[j].right[0] == "")
				{
					// X -> epsilon
					if(add(firstX, {""}))
					{
						updated = true;
					}
					continue;
				}

				size_t k = 0;
				while(k < p[j].right.size())
				{
					vector<string> firstY = first[getFirstIndex(p[j].right[k])].second;

					// Remember whether Yk had "" BEFORE it is stripped;
					// testing firstY afterwards would never find it.
					bool epsilon = del(firstY, string(""));
					if(add(firstX, firstY))
					{
						updated = true;
					}

					// Move on to Yk+1 only when Yk had "" in its set.
					if(!epsilon)
					{
						break;
					}
					++k;
				}

				// Every Yk had "", so X gets it too.  This is a change like
				// any other and must trigger another pass.
				if(k == p[j].right.size())
				{
					if(add(firstX, {""}))
					{
						updated = true;
					}
				}
			}

			first[xFirstIndex].second = firstX;
		}
	}while(updated);
}
Ejemplo n.º 4
0
// Examine a newly read record: either flag it (or a previously stored
// record) as a duplicate, or remember it for comparison with later records.
//
// record      - the record to examine (callers are expected to pass only
//               mapped records); its address may be stored in the maps.
// recordCount - index of this record, stored alongside the pointer.
void Dedup::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Keys are function-local statics that are refilled on every call.
    static DupKey key;
    static DupKey mateKey;
    key.updateKey(record, getLibraryID(record));

    const int flag = record.getFlag();
    bool isPairedRead = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int qualSum = getBaseQuality(record);

    const int32_t chromID = record.getReferenceID();
    const int32_t mateChromID = record.getMateReferenceID();

    // In one-chrom mode a mate on another chromosome makes this read
    // count as unpaired.
    if(myOneChrom && (chromID != mateChromID))
    {
        isPairedRead = false;
    }

    // ---- Fragment (single read) check ----
    FragmentMapInsertReturn fragInsert =
        myFragmentMap.insert(std::make_pair(key, ReadData()));
    const bool isNewFragment = fragInsert.second;
    ReadData& storedRead = fragInsert.first->second;

    // The new record takes the slot when the key is new, or when the stored
    // record is unpaired and the new one is paired or has higher quality.
    const bool takeSlot =
        isNewFragment ||
        (!storedRead.paired &&
         (isPairedRead || (qualSum > storedRead.sumBaseQual)));

    if(takeSlot)
    {
        if(!isNewFragment)
        {
            // The record being displaced is the duplicate.
            handleDuplicate(storedRead.recordIndex, storedRead.recordPtr);
        }
        storedRead.sumBaseQual = qualSum;
        storedRead.recordIndex = recordCount;
        storedRead.paired = isPairedRead;
        // The pointer is kept only for unpaired records.
        if(!isPairedRead)
        {
            storedRead.recordPtr = &record;
        }
        else
        {
            storedRead.recordPtr = NULL;
        }
    }
    else if(!isPairedRead)
    {
        // Stored record stays; an unpaired newcomer is the duplicate.
        handleDuplicate(recordCount, &record);
    }

    // Everything below concerns pairs only.
    if(!isPairedRead)
    {
        return;
    }

    // ---- Locate the mate ----
    const uint64_t readPos =
        SamHelper::combineChromPos(chromID, record.get0BasedPosition());
    const uint64_t matePos =
        SamHelper::combineChromPos(mateChromID,
                                   record.get0BasedMatePosition());
    SamRecord* mateRecord = NULL;
    int mateIndex = 0;

    if(matePos <= readPos)
    {
        // Entries are filed under the position of the read still to come,
        // so an earlier mate sits under this record's own position.
        std::pair<MateMap::iterator,MateMap::iterator> candidates =
            myMateMap.equal_range(readPos);
        for(MateMap::iterator cur = candidates.first;
            cur != candidates.second; ++cur)
        {
            ReadData& candidate = cur->second;
            if(strcmp(candidate.recordPtr->getReadName(),
                      record.getReadName()) != 0)
            {
                // Same position, different read name: not the mate.
                continue;
            }
            // Mate found: fold in its quality, take its index and pointer,
            // then drop it from the map.
            qualSum += candidate.sumBaseQual;
            mateIndex = candidate.recordIndex;
            mateRecord = candidate.recordPtr;
            myMateMap.erase(cur);
            break;
        }
    }

    if(mateRecord == NULL)
    {
        if(matePos >= readPos)
        {
            // The mate is still ahead: park this record until it shows up.
            MateMap::iterator parked =
                myMateMap.insert(std::make_pair(matePos, ReadData()));
            parked->second.sumBaseQual = qualSum;
            parked->second.recordPtr = &record;
            parked->second.recordIndex = recordCount;
        }
        else
        {
            // The mate should already have been seen, but was not found.
            handleMissingMate(&record);
        }
        return;
    }

    // ---- Pair check ----
    mateKey.updateKey(*mateRecord, getLibraryID(*mateRecord));
    PairedKey pkey(key, mateKey);

    PairedMapInsertReturn pairInsert =
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData& storedPair = pairInsert.first->second;

    // Index of "record 1", as chosen by getFirstIndex().
    const int record1Index = getFirstIndex(key, recordCount,
                                           mateKey, mateIndex);

    if(!pairInsert.second)
    {
        // A pair with this key already exists.  The new pair wins on higher
        // quality, or on equal quality with a smaller record1 index.
        bool newPairWins = (storedPair.sumBaseQual < qualSum);
        if(!newPairWins && (storedPair.sumBaseQual == qualSum))
        {
            newPairWins = (record1Index < storedPair.record1Index);
        }

        if(!newPairWins)
        {
            // Stored pair stays, so both reads of the new pair are duplicates.
            handleDuplicate(mateIndex, mateRecord);
            handleDuplicate(recordCount, &record);
            return;
        }

        // Stored pair loses: mark both of its reads before overwriting it.
        handleDuplicate(storedPair.record1Index, storedPair.record1Ptr);
        handleDuplicate(storedPair.record2Index, storedPair.record2Ptr);
    }

    // Record this pair, either as a brand-new entry or as the replacement.
    storedPair.sumBaseQual = qualSum;
    if(record1Index == mateIndex)
    {
        // The mate is record 1.
        storedPair.record1Ptr = mateRecord;
        storedPair.record2Ptr = &record;
        storedPair.record1Index = mateIndex;
        storedPair.record2Index = recordCount;
    }
    else
    {
        // This record is record 1.
        storedPair.record1Ptr = &record;
        storedPair.record2Ptr = mateRecord;
        storedPair.record1Index = recordCount;
        storedPair.record2Index = mateIndex;
    }
}