Ejemplo n.º 1
0
// make a 64-bit genomic coordinate [24bit-chr][32bit-pos][8bit-orientation]
uint64_t getGenomicCoordinate(SamRecord& r) {
  // 64bit string consisting of
  // 24bit refID, 32bit pos, 8bit orientation
  if ( ( r.getReferenceID() < 0 ) || ( r.get0BasedPosition() < 0 ) ) {
    return UNMAPPED_GENOMIC_COORDINATE;
  }
  else {
    return ( ( static_cast<uint64_t>(r.getReferenceID()) << 40 ) | ( static_cast<uint64_t>(r.get0BasedPosition()) << 8 ) | static_cast<uint64_t>( r.getFlag() & 0x0010 ) );
  }
}
Ejemplo n.º 2
0
// determine whether the record's position is different from the previous record
bool Dedup_LowMem::hasPositionChanged(SamRecord& record)
{
    if (lastReference != record.getReferenceID() ||
            lastCoordinate < record.get0BasedPosition())
    {
        if (lastReference != record.getReferenceID())
        {
            lastReference = record.getReferenceID();
            Logger::gLogger->writeLog("Reading ReferenceID %d\n", lastReference);
        }
        lastCoordinate = record.get0BasedPosition();
        return true;
    }
    return false;
}
Ejemplo n.º 3
0
bool SamFile::checkRecordInSection(SamRecord& record)
{
    bool recordFound = true;
    if(myRefID == BamIndex::REF_ID_ALL)
    {
        return(true);
    }
    // Check to see if it is in the correct reference/position.
    if(record.getReferenceID() != myRefID)
    {
        // Incorrect reference ID, return no more records.
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }
   
    // Found a record.
    recordFound = true;

    // If start/end position are set, verify that the alignment falls
    // within those.
    // If the alignment start is greater than the end of the region,
    // return NO_MORE_RECS.
    // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside
    // of the region.
    if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos))
    {
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }
        
    // We know the start is less than the end position, so the alignment
    // overlaps the region if the alignment end position is greater than the
    // start of the region.
    if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos))
    {
        // If it does not overlap the region, so go to the next
        // record...set recordFound back to false.
        recordFound = false;
    }

    if(!myOverlapSection)
    {
        // Needs to be fully contained.  Not fully contained if
        // 1) the record start position is < the region start position.
        // or
        // 2) the end position is specified and the record end position
        //    is greater than or equal to the region end position.
        //    (equal to since the region is exclusive.
        if((record.get0BasedPosition() < myStartPos) ||
           ((myEndPos != -1) && 
            (record.get0BasedAlignmentEnd() >= myEndPos)))
        {
            // This record is not fully contained, so move on to the next
            // record.
            recordFound = false;
        }
    }

    return(recordFound);
}
Ejemplo n.º 4
0
bool ClipOverlap::flushOutputBuffer(MateMapByCoord& mateMap,
                                    SamCoordOutput& outputBuffer,
                                    int32_t prevChrom,
                                    int32_t prevPos)
{
    // We will flush the output buffer up to the first record left in the
    // mateMap.  If there are no records left in the mate map, then we
    // flush everything up to the previous chrom/pos that was processed since
    // any new records will have a higher coordinate.
    SamRecord* firstRec = mateMap.first();
    if(firstRec != NULL)
    {
        return(outputBuffer.flush(firstRec->getReferenceID(), 
                                  firstRec->get0BasedPosition()));
    }
    // Otherwise, flush based on the previous 
    return(outputBuffer.flush(prevChrom, prevPos));
}
Ejemplo n.º 5
0
// When a record is read, check if it is a duplicate or
// store for future checking.
void Dedup_LowMem::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Only inside this method if the record is mapped.

    // Get the key for this record.
    static DupKey key;
    key.initKey(record, getLibraryID(record));

    int flag = record.getFlag();
    bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int sumBaseQual = getBaseQuality(record);

    int32_t chromID = record.getReferenceID();
    int32_t mateChromID = record.getMateReferenceID();

    // If we are one-chrom and the mate is not on the same chromosome,
    // mark it as not paired.
    if(myOneChrom && (chromID != mateChromID))
    {
        recordPaired = false;
    }

    // Look in the fragment map to see if an entry for this key exists.
    FragmentMapInsertReturn ireturn =
        myFragmentMap.insert(std::make_pair(key, FragData()));

    FragData* fragData = &(ireturn.first->second);

    // Enter the new record in the fragData if (any of the below):
    // 1) there is no previous entry for this key (ireturn.second == true)
    // or
    // 2) the previous entry is not paired
    //    AND
    //     a) the new record is paired
    //     or
    //     b) the new record has higher quality
    if((ireturn.second == true) ||
            ((fragData->paired == false) &&
             (recordPaired || (sumBaseQual > fragData->sumBaseQual))))
    {
        // Check if this is a new key.
        if(ireturn.second == true)
        {
            // New entry, so build the recalibration table now.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(record);
            }
        }
        else if(fragData->paired == false)
        {
            // There was a previous record and it is not paired,
            // so mark it as a duplicate.
            // Duplicate checking/marking for pairs is handled below.
            handleDuplicate(fragData->recordIndex);
        }

        // Store this record for later duplicate checking.
        fragData->sumBaseQual = sumBaseQual;
        fragData->recordIndex = recordCount;
        fragData->paired = recordPaired;
    }
    else
    {
        // Leave the old record in fragData.
        // If the new record is not paired, handle it as a duplicate.
        if(recordPaired == false)
        {
            // This record is a duplicate, so mark it and release it.
            handleDuplicate(recordCount);
        }
    }

    // Only paired processing is left, so return if not paired.
    if(recordPaired == false)
    {
        // Not paired, no more operations required, so return.
        return;
    }

    // This is a paired record, so check for its mate.
    uint64_t readPos =
        SamHelper::combineChromPos(chromID,
                                   record.get0BasedPosition());
    uint64_t matePos =
        SamHelper::combineChromPos(mateChromID,
                                   record.get0BasedMatePosition());
    int mateIndex = -1;
    MateData* mateData = NULL;

    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate map is stored by the mate position, so look for this
        // record's position.
        // The mate should be in the mate map, so find it.
        std::pair<MateMap::iterator,MateMap::iterator> matches =
            myMateMap.equal_range(readPos);
        // Loop through the elements that matched the pos looking for the mate.
        for(MateMap::iterator iter = matches.first;
                iter != matches.second; iter++)
        {
            if(strcmp((*iter).second.readName.c_str(),
                      record.getReadName()) == 0)
            {
                // Found the match.
                mateData = &((*iter).second);
                // Update the quality and track the mate record and index.
                sumBaseQual += mateData->sumBaseQual;
                mateIndex = mateData->recordIndex;
                // Remove the entry from the map.
                myMateMap.erase(iter);
                break;
            }
        }
    }
    if(mateData == NULL)
    {
        if(matePos >= readPos)
        {
            // Haven't gotten to the mate yet, so store this record.
            MateMap::iterator mateIter =
                myMateMap.insert(std::make_pair(matePos, MateData()));
            mateIter->second.sumBaseQual = sumBaseQual;
            mateIter->second.recordIndex = recordCount;
            mateIter->second.key.copy(key);
            mateIter->second.readName = record.getReadName();
        }
        else
        {
            // Passed the mate, but it was not found.
            handleMissingMate(record.getReferenceID(), record.getMateReferenceID());
        }
        return;
    }

    // Make the paired key.
    PairedKey pkey(key, mateData->key);

    // Check to see if this pair is a duplicate.
    PairedMapInsertReturn pairedReturn =
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData* storedPair = &(pairedReturn.first->second);

    // Get the index for "record 1" - the one with the earlier coordinate.
    int record1Index = getFirstIndex(key, recordCount,
                                     mateData->key, mateIndex);

    // Check if we have already found a duplicate pair.
    // If there is no duplicate found, there is nothing more to do.
    if(pairedReturn.second == false)
    {
        // Duplicate found.
        bool keepStored = true;
        if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
        {
            // The new pair has higher quality, so keep that.
            keepStored = false;
        }
        else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
        {
            // Same quality, so keep the one with the earlier record1Index.
            if(record1Index < storedPair->record1Index)
            {
                // The new pair has an earlier lower coordinate read,
                // so keep that.
                keepStored = false;
            }
        }
        // Check to see which one should be kept by checking qualities.
        if(keepStored)
        {
            // The old pair had higher quality so mark the new pair as a
            // duplicate and release them.
            handleDuplicate(mateIndex);
            handleDuplicate(recordCount);
        }
        else
        {
            // The new pair has higher quality, so keep that.
            // First mark the previous one as duplicates and release them.
            handleDuplicate(storedPair->record1Index);
            handleDuplicate(storedPair->record2Index);
            // Store this pair's information.
            if(record1Index == mateIndex)
            {
                // Mate has a lower coordinate, so make mate
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Index = mateIndex;
                storedPair->record2Index = recordCount;
            }
            else
            {
                // This record has a lower coordinate, so make it
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Index = recordCount;
                storedPair->record2Index = mateIndex;
            }
        }
    }
    else
    {
        // Store this pair's information.
        storedPair->sumBaseQual = sumBaseQual;

        if(record1Index == mateIndex)
        {
            // Mate has a lower coordinate, so make mate
            // record1.
            storedPair->record1Index = mateIndex;
            storedPair->record2Index = recordCount;
        }
        else
        {
            // This record has a lower coordinate, so make it
            // record1.
            storedPair->record1Index = recordCount;
            storedPair->record2Index = mateIndex;
        }
    }
}
Ejemplo n.º 6
0
void Bam2FastQ::handlePairedCoord(SamRecord& samRec)
{
    static uint64_t readPos;
    static uint64_t matePos;
    static SamRecord* mateRec;

    // This is a paired record, so check for its mate.
    readPos = SamHelper::combineChromPos(samRec.getReferenceID(),
                                         samRec.get0BasedPosition());
    matePos = SamHelper::combineChromPos(samRec.getMateReferenceID(), 
                                         samRec.get0BasedMatePosition());
 
    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate is prior to this position, so look for this record.
        mateRec = myMateMap.getMate(samRec);
        if(mateRec == NULL)
        {
            // If they are the same position, add it to the map.
            if(matePos == readPos)
            {
                myMateMap.add(samRec);

                // Check to see if the mate map can be cleaned up prior
                // to this position.
                cleanUpMateMap(readPos);
            }
            else
            {
                // Paired Read, but could not find mate.
                std::cerr << "Paired Read, " << samRec.getReadName()
                          << " but couldn't find mate, so writing as "
                          << "unpaired (single-ended)\n";
                ++myNumMateFailures;
                writeFastQ(samRec, myUnpairedFile, myUnpairedFileNameExt);
            }
        }
        else
        {
            // Found the mate.
            ++myNumPairs;
            // Check which is the first in the pair.
            if(SamFlag::isFirstFragment(samRec.getFlag()))
            {
                if(SamFlag::isFirstFragment(mateRec->getFlag()))
                {
                    std::cerr << "Both reads of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(samRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(*mateRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
            else
            {
                if(!SamFlag::isFirstFragment(mateRec->getFlag()))
                {
                    std::cerr << "Neither read of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(*mateRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(samRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
        }
    }
    else
    {
        // Haven't gotten to the mate yet, so store it.
        myMateMap.add(samRec);

        // Check to see if the mate map can be cleaned up.
        cleanUpMateMap(readPos);
    }
}
Ejemplo n.º 7
0
///////////////////////////////////////////////////////////////////
// Methods to handle flushing records from the mate map and/or
// the output buffer.
bool ClipOverlap::forceRecordFlush(MateMapByCoord& mateMap,
                                   SamCoordOutput* outputBufferPtr)
{
    // The previous standard flush did not free up any records, so pop
    // the first record off of the mate map if there is one and process
    // it without waiting for its mate.
    SamRecord* firstRec = mateMap.first();
    int32_t flushChrom = -1;
    int32_t flushPos = -1;
    bool updated = false;

    // Increment number of pool failures.
    ++myNumPoolFail;

    if(firstRec != NULL)
    {
        // Flush up to & including this record's position.
        flushChrom = firstRec->getReferenceID();
        flushPos = firstRec->get0BasedPosition();

        // Remove this record.
        mateMap.popFirst();

        // Ran out of records, so can't wait until the mate's position.
        if(!myPoolSkipOverlap)
        {
            // Handle the single entry of the pair.
            updated = 
                myOverlapHandler->handleOverlapWithoutMate(*firstRec);
        }

        if(!updated)
        {
            ++myNumPoolFailNoHandle;
        }
        else
        {
            ++myNumPoolFailHandled;
        }

        // Add the record to the output buffer.
        if((outputBufferPtr != NULL) && (updated || !myOverlapsOnly))
        {
            outputBufferPtr->add(firstRec);
        }
        else
        {
            myPool.releaseRecord(firstRec);
        }
    }
    else
    {
        ++myNumOutOfOrder;
    }

    if(outputBufferPtr != NULL)
    {
        return(outputBufferPtr->flush(flushChrom, flushPos));
    }
    // No output buffer, return true.
    return(true);
}
Ejemplo n.º 8
0
SamStatus::Status ClipOverlap::handleSortedByCoord(SamFile& samIn, 
                                                   SamCoordOutput* outputBufferPtr)
{
    MateMapByCoord mateMap;

    // Get record & track success/fail
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    
    OverlapHandler::OverlapInfo overlapInfo = OverlapHandler::UNKNOWN_OVERLAP;
    SamRecord* matePtr = NULL;

    // Get the first read.
    SamRecord* recordPtr = myPool.getRecord();
    if(recordPtr == NULL)
    {
        // No records in the pool, can't process.
        std::cerr <<
            "ERROR: No records in the pool, can't process by coordinate.\n";
        return(SamStatus::FAIL_MEM);
    }

    // Track the original chrom/position of the record being processed
    // for flushing.
    int32_t chrom = -1;
    int32_t position = -1;

    bool overlap = false;
    while(returnStatus == SamStatus::SUCCESS)
    {
        // Get the next Record.
        returnStatus = readCoordRecord(samIn, &recordPtr, 
                                       mateMap, outputBufferPtr);
        if(returnStatus != SamStatus::SUCCESS)
        {
            break;
        }

        chrom = recordPtr->getReferenceID();
        position = recordPtr->get0BasedPosition();

        // Cleanup the mate map based on the newly read record.
        cleanupMateMap(mateMap, outputBufferPtr, chrom, position);

        // Check the read for overlaps.
        overlapInfo = myOverlapHandler->getOverlapInfo(*recordPtr, myIntExcludeFlags);
        // Handle the types of overlaps.
        switch(overlapInfo)
        {
            case OverlapHandler::OVERLAP:
                overlap = true;
                // 1st read, so store it in the mate map.
                mateMap.add(*recordPtr);
                // Clear the pointer so a new one is used next time.
                recordPtr = NULL;
                break;
            case OverlapHandler::NO_OVERLAP_WRONG_ORIENT:
                overlap = true;
                myOverlapHandler->handleNoOverlapWrongOrientation(*recordPtr);
                break;
            case OverlapHandler::SAME_START:
                overlap = true;
                // First check the mate map for the mate.
                matePtr = mateMap.getMate(*recordPtr);
                if(matePtr != NULL)
                {
                    // Mate was found, so handle the overlap.
                    myOverlapHandler->handleOverlapPair(*matePtr, *recordPtr);
                }
                else
                {
                    // Mate not found, so store this one.
                    mateMap.add(*recordPtr);
                    // Clear the pointer so a new one is used next time.
                    recordPtr = NULL;
                }
                break;
            case OverlapHandler::UNKNOWN_OVERLAP:
                matePtr = mateMap.getMate(*recordPtr);
                if(matePtr != NULL)
                {
                    // Mate was found, there is an overlap.
                    overlap = true;
                    myOverlapHandler->handleOverlapPair(*matePtr, *recordPtr);
                }
                else
                {
                    // No overlap if mate not found.
                    overlap = false;
                }
                break;
            case OverlapHandler::UNKNOWN_OVERLAP_WRONG_ORIENT:
                overlap = true;
                matePtr = mateMap.getMate(*recordPtr);
                if(matePtr != NULL)
                {
                    // Mate was found, there is an overlap..
                    myOverlapHandler->handleOverlapPair(*matePtr, *recordPtr);
                }
                else
                {
                    // Mate not found, so handle wrong orientation.
                    // Don't update stats since this is the 2nd in the pair
                    myOverlapHandler->handleNoOverlapWrongOrientation(*recordPtr,
                                                                      false);
                }
                break;
            case OverlapHandler::NO_OVERLAP:
            default:
                // No overlap
                overlap = false;
                break;
        }

        // Handle writing the record if necessary.
        if(outputBufferPtr != NULL)
        {
            // Writing step.
            if((matePtr != NULL) && (!myOverlapsOnly || overlap))
            {
                // Add the mate to the output buffer.
                outputBufferPtr->add(matePtr);
                matePtr = NULL;
            }
            if((recordPtr != NULL) && (!myOverlapsOnly || overlap))
            {
                // Add this record to the output buffer.
                outputBufferPtr->add(recordPtr);
                recordPtr = NULL;
            }

            // Flush the output buffer
            if(!flushOutputBuffer(mateMap, *outputBufferPtr,
                                  chrom, position))
            {
                std::cerr << "ERROR: Failed to flush the output buffer\n";
                returnStatus = SamStatus::FAIL_IO;
            }
        }
        else
        {
            // Not writing.
            // Release the mate if it is set.
            if(matePtr != NULL)
            {
                myPool.releaseRecord(matePtr);
            }
        }
    }

    // Done with the file, cleanup the mate map.
    // The calling method will cleanup the output buffer.
    cleanupMateMap(mateMap, outputBufferPtr);

    if(returnStatus != SamStatus::NO_MORE_RECS)
    {
        // Failure.
        std::cerr << "ERROR reading file, exiting.\n";
        return(returnStatus);
    }
    return(SamStatus::SUCCESS);
}
Ejemplo n.º 9
0
// Validate that the record is sorted compared to the previously read record
// if there is one, according to the specified sort order.
// If the sort order is UNSORTED, true is returned.
bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
{
    if(myRefPtr != NULL)
    {
        record.setReference(myRefPtr);
    }
    record.setSequenceTranslation(myReadTranslation);

    bool status = false;
    if(mySortedType == UNSORTED)
    {
        // Unsorted, so nothing to validate, just return true.
        status = true;
    }
    else 
    {
        // Check to see if mySortedType is based on the header.
        if(mySortedType == FLAG)
        {
            // Determine the sorted type from what was read out of the header.
            mySortedType = getSortOrderFromHeader(header);
        }

        if(mySortedType == QUERY_NAME)
        {
            // Validate that it is sorted by query name.
            // Get the query name from the record.
            const char* readName = record.getReadName();

            // Check if it is sorted either in samtools way or picard's way.
            if((myPrevReadName.Compare(readName) > 0) &&
               (strcmp(myPrevReadName.c_str(), readName) > 0))
            {
                // return false.
                String errorMessage = "ERROR: File is not sorted by read name at record ";
                errorMessage += myRecordCount;
                errorMessage += "\n\tPrevious record was ";
                errorMessage += myPrevReadName;
                errorMessage += ", but this record is ";
                errorMessage += readName;
                myStatus.setStatus(SamStatus::INVALID_SORT, 
                                   errorMessage.c_str());
                status = false;
            }
            else
            {
                myPrevReadName = readName;
                status = true;
            }
        }
        else 
        {
            // Validate that it is sorted by COORDINATES.
            // Get the leftmost coordinate and the reference index.
            int32_t refID = record.getReferenceID();
            int32_t coord = record.get0BasedPosition();
            // The unmapped reference id is at the end of a sorted file.
            if(refID == BamIndex::REF_ID_UNMAPPED)
            {
                // A new reference ID that is for the unmapped reads
                // is always valid.
                status = true;
                myPrevRefID = refID;
                myPrevCoord = coord;
            }
            else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
            {
                // Previous reference ID was for unmapped reads, but the
                // current one is not, so this is not sorted.
                String errorMessage = "ERROR: File is not coordinate sorted at record ";
                errorMessage += myRecordCount;
                errorMessage += "\n\tPrevious record was unmapped, but this record is ";
                errorMessage += header.getReferenceLabel(refID) + ":" + coord;
                myStatus.setStatus(SamStatus::INVALID_SORT, 
                                   errorMessage.c_str());
                status = false;
            }
            else if(refID < myPrevRefID)
            {
                // Current reference id is less than the previous one, 
                //meaning that it is not sorted.
                String errorMessage = "ERROR: File is not coordinate sorted at record ";
                errorMessage += myRecordCount;
                errorMessage += "\n\tPrevious record was ";
                errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord;
                errorMessage += ", but this record is ";
                errorMessage += header.getReferenceLabel(refID) + ":" + coord;
                myStatus.setStatus(SamStatus::INVALID_SORT, 
                                   errorMessage.c_str());
                status = false;
            }
            else
            {
                // The reference IDs are in the correct order.
                if(refID > myPrevRefID)
                {
                    // New reference id, so set the previous coordinate to -1
                    myPrevCoord = -1;
                }
            
                // Check the coordinates.
                if(coord < myPrevCoord)
                {
                    // New Coord is less than the previous position.
                    String errorMessage = "ERROR: File is not coordinate sorted at record ";
                    errorMessage += myRecordCount;
                    errorMessage += "\n\tPreviousRecord was ";
                    errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord;
                    errorMessage += ", but this record is ";
                    errorMessage += header.getReferenceLabel(refID) + ":" + coord;
                    myStatus.setStatus(SamStatus::INVALID_SORT, 
                                       errorMessage.c_str());
                    status = false;
                }
                else
                {
                    myPrevRefID = refID;
                    myPrevCoord = coord;
                    status = true;
                }
            }
        }
    }

    return(status);
}
Ejemplo n.º 10
0
void validateRead1ModQuality(SamRecord& samRecord)
{
    //////////////////////////////////////////
    // Validate Record 1
    // Create record structure for validating.
    int expectedBlockSize = 89;
    const char* expectedReferenceName = "1";
    const char* expectedMateReferenceName = "1";
    const char* expectedMateReferenceNameOrEqual = "=";

    bamRecordStruct* expectedRecordPtr =
        (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int));

    char tag[3];
    char type;
    void* value;
    bamRecordStruct* bufferPtr;
    unsigned char* varPtr;

    expectedRecordPtr->myBlockSize = expectedBlockSize;
    expectedRecordPtr->myReferenceID = 0;
    expectedRecordPtr->myPosition = 1010;
    expectedRecordPtr->myReadNameLength = 23;
    expectedRecordPtr->myMapQuality = 0;
    expectedRecordPtr->myBin = 4681;
    expectedRecordPtr->myCigarLength = 2;
    expectedRecordPtr->myFlag = 73;
    expectedRecordPtr->myReadLength = 5;
    expectedRecordPtr->myMateReferenceID = 0;
    expectedRecordPtr->myMatePosition = 1010;
    expectedRecordPtr->myInsertSize = 0;
   
    // Check the alignment end
    assert(samRecord.get0BasedAlignmentEnd() == 1016);
    assert(samRecord.get1BasedAlignmentEnd() == 1017);
    assert(samRecord.getAlignmentLength() == 7);
    assert(samRecord.get0BasedUnclippedStart() == 1010);
    assert(samRecord.get1BasedUnclippedStart() == 1011);
    assert(samRecord.get0BasedUnclippedEnd() == 1016);
    assert(samRecord.get1BasedUnclippedEnd() == 1017);

    // Check the accessors.
    assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize);
    assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID);
    assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0);
    assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1);
    assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition);
    assert(samRecord.getReadNameLength() == 
           expectedRecordPtr->myReadNameLength);
    assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality);
    assert(samRecord.getBin() == expectedRecordPtr->myBin);
    assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength);
    assert(samRecord.getFlag() == expectedRecordPtr->myFlag);
    assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength);
    assert(samRecord.getMateReferenceID() ==
           expectedRecordPtr->myMateReferenceID);
    assert(strcmp(samRecord.getMateReferenceName(), 
                  expectedMateReferenceName) == 0);
    assert(strcmp(samRecord.getMateReferenceNameOrEqual(), 
                  expectedMateReferenceNameOrEqual) == 0);
    assert(samRecord.get1BasedMatePosition() == 
           expectedRecordPtr->myMatePosition + 1);
    assert(samRecord.get0BasedMatePosition() ==
           expectedRecordPtr->myMatePosition);
    assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize);
    assert(strcmp(samRecord.getReadName(), "1:1011:F:255+17M15D20M") == 0);
    assert(strcmp(samRecord.getCigar(), "5M2D") == 0);
    assert(strcmp(samRecord.getSequence(), "CCGAA") == 0);
    assert(strcmp(samRecord.getQuality(), "ABCDE") == 0);
    assert(samRecord.getNumOverlaps(1010, 1017) == 5);
    assert(samRecord.getNumOverlaps(1010, 1016) == 5);
    assert(samRecord.getNumOverlaps(1012, 1017) == 3);
    assert(samRecord.getNumOverlaps(1015, 1017) == 0);
    assert(samRecord.getNumOverlaps(1017, 1010) == 0);
    assert(samRecord.getNumOverlaps(1013, 1011) == 0);
    assert(samRecord.getNumOverlaps(-1, 1017) == 5);

    // Reset the tag iter, since the tags have already been read.
    samRecord.resetTagIter();

    // Check the tags.
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'A');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'M');
    assert(tag[1] == 'D');
    assert(type == 'Z');
    assert(*(String*)value == "37");
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'N');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'X');
    assert(tag[1] == 'T');
    assert(type == 'A');
    assert(*(char*)value == 'R');
    // No more tags, should return false.
    assert(samRecord.getNextSamTag(tag, type, &value) == false);
    assert(samRecord.getNextSamTag(tag, type, &value) == false);

    // Get the record ptr.   
    bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer();
    // Validate the buffers match.
    assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize);
    assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID);
    assert(bufferPtr->myPosition == expectedRecordPtr->myPosition);
    assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength);
    assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality);
    assert(bufferPtr->myBin == expectedRecordPtr->myBin);
    assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength);
    assert(bufferPtr->myFlag == expectedRecordPtr->myFlag);
    assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength);
    assert(bufferPtr->myMateReferenceID ==
           expectedRecordPtr->myMateReferenceID);
    assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition);
    assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize);

    // Validate the variable length fields in the buffer.
    // Set the pointer to the start of the variable fields.
    varPtr = (unsigned char*)(&(bufferPtr->myData[0]));

    // Validate the readname.
    for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++)
    {
        assert(*varPtr == samRecord.getReadName()[i]);
        varPtr++;
    }

    // Validate the cigar.
    // The First cigar is 5M which is 5 << 4 | 0 = 80
    assert(*(unsigned int*)varPtr == 80);
    // Increment the varptr the size of an int.
    varPtr += 4;
    // The 2nd cigar is 2D which is 2 << 4 | 2 = 34
    assert(*(unsigned int*)varPtr == 34);
    // Increment the varptr the size of an int.
    varPtr += 4;
   
    // Validate the sequence.
    // CC = 0x22
    assert(*varPtr == 0x22);
    varPtr++;
    // GA = 0x41
    assert(*varPtr == 0x41);
    varPtr++;
    // A  = 0x10
    assert(*varPtr == 0x10);
    varPtr++;
  
    // Validate the Quality
    for(int i = 0; i < expectedRecordPtr->myReadLength; i++)
    {
        assert(*varPtr == samRecord.getQuality()[i] - 33);
        varPtr++;
    }

    // Validate the tags.  
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'D');
    varPtr++;
    assert(*varPtr == 'Z');
    varPtr++;
    assert(*varPtr == '3');
    varPtr++;
    assert(*varPtr == '7');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'N');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'X');
    varPtr++;
    assert(*varPtr == 'T');
    varPtr++;
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'R');
    varPtr++;
}
Ejemplo n.º 11
0
int GapInfo::processFile(const char* inputFileName, const char* outputFileName,
                         const char* refFile, bool detailed,
                         bool checkFirst, bool checkStrand)
{
    // Open the file for reading.
    SamFile samIn;
    samIn.OpenForRead(inputFileName);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    SamRecord samRecord;

    GenomeSequence* refPtr = NULL;
    if(strcmp(refFile, "") != 0)
    {
        refPtr = new GenomeSequence(refFile);
    }

    IFILE outFile = ifopen(outputFileName, "w");

    // Map for summary.
    std::map<int, int> gapInfoMap;


    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        uint16_t samFlags = samRecord.getFlag();

        if((!SamFlag::isMapped(samFlags)) || 
           (!SamFlag::isMateMapped(samFlags)) ||
           (!SamFlag::isPaired(samFlags)) ||
           (samFlags & SamFlag::SECONDARY_ALIGNMENT) || 
           (SamFlag::isDuplicate(samFlags)) ||
           (SamFlag::isQCFailure(samFlags)))
        {
            // unmapped, mate unmapped, not paired,
            // not the primary alignment,
            // duplicate, fails vendor quality check 
            continue;
        }

        // No gap info if the chromosome names are different or
        // are unknown.
        int32_t refID = samRecord.getReferenceID();
        if((refID != samRecord.getMateReferenceID()) || (refID == -1))
        {
            continue;
        }

        int32_t readStart = samRecord.get0BasedPosition();
        int32_t mateStart = samRecord.get0BasedMatePosition();

        // If the mate starts first, then the pair was processed by
        // the mate.
        if(mateStart < readStart)
        {
            continue;
        }
        if((mateStart == readStart) && (SamFlag::isReverse(samFlags)))
        {
            // read and mate start at the same position, so 
            // only process the forward strand.
            continue;
        }

        // Process this read pair.
        int32_t readEnd = samRecord.get0BasedAlignmentEnd();
        
        int32_t gapSize = mateStart - readEnd - 1;

        if(detailed)
        {
            // Output the gap info.
            ifprintf(outFile, "%s\t%d\t%d", 
                     samRecord.getReferenceName(), readEnd+1, gapSize);
            
            // Check if it is not the first or if it is not the forward strand.
            if(checkFirst && !SamFlag::isFirstFragment(samFlags))
            {
                ifprintf(outFile, "\tNotFirst");
            }
            if(checkStrand && SamFlag::isReverse(samFlags))
            {
                ifprintf(outFile, "\tReverse");
            }
            ifprintf(outFile, "\n");
        }
        else
        {
            // Summary.
            // Skip reads that are not the forward strand.
            if(SamFlag::isReverse(samFlags))
            {
                // continue
                continue;
            }

            // Forward.
            // Check the reference for 'N's.
            if(refPtr != NULL)
            {
                genomeIndex_t chromStartIndex = 
                    refPtr->getGenomePosition(samRecord.getReferenceName());
                if(chromStartIndex == INVALID_GENOME_INDEX)
                {
                    // Invalid position, so continue to the next one.
                    continue;
                }
                bool skipRead = false;
                for(int i = readEnd + 1; i < mateStart; i++)
                {
                    if((*refPtr)[i] == 'N')
                    {
                        // 'N' in the reference, so continue to the next read.
                        skipRead = true;
                        break;
                    }
                }
                if(skipRead)
                {
                    continue;
                }
            }
            
            // Update the gapInfo.
            gapInfoMap[gapSize]++;
        }
    }

    if(!detailed)
    {
        // Output the summary.
        ifprintf(outFile, "GapSize\tNumPairs\n");
        for(std::map<int,int>::iterator iter = gapInfoMap.begin(); 
            iter != gapInfoMap.end(); iter++)
        {
            ifprintf(outFile, "%d\t%d\n", (*iter).first, (*iter).second);
        }
    }
    

    SamStatus::Status returnStatus = samIn.GetStatus();
    if(returnStatus == SamStatus::NO_MORE_RECS)
    {
        return(SamStatus::SUCCESS);
    }
    return(returnStatus);
}
Ejemplo n.º 12
0
// When a record is read, check if it is a duplicate or
// store for future checking.
void Dedup::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Only inside this method if the record is mapped.

    // Get the key for this record.
    static DupKey key;
    static DupKey mateKey;
    key.updateKey(record, getLibraryID(record));

    int flag = record.getFlag(); 
    bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int sumBaseQual = getBaseQuality(record);

    int32_t chromID = record.getReferenceID();
    int32_t mateChromID = record.getMateReferenceID();

    // If we are one-chrom and the mate is not on the same chromosome, 
    // mark it as not paired.
    if(myOneChrom && (chromID != mateChromID))
    {
        recordPaired = false;
    }
    
    // Look in the map to see if an entry for this key exists.
    FragmentMapInsertReturn ireturn = 
        myFragmentMap.insert(std::make_pair(key, ReadData()));

    ReadData* readData = &(ireturn.first->second);

    // Mark this record's data in the fragment record if this is the first
    // entry or if it is a duplicate and the old record is not paired and 
    // the new record is paired or the has a higher quality.
    if((ireturn.second == true) ||
       ((readData->paired == false) && 
        (recordPaired || (sumBaseQual > readData->sumBaseQual))))
    {
        // If there was a previous record, mark it duplicate and release
        // the old record
        if(ireturn.second == false)
        {
            // Mark the old record as a DUPLICATE!
            handleDuplicate(readData->recordIndex, readData->recordPtr);
        }
        // Store this record for later duplicate checking.
        readData->sumBaseQual = sumBaseQual;
        readData->recordIndex = recordCount;
        readData->paired = recordPaired;
        if(recordPaired)
        {
            readData->recordPtr = NULL;
        }
        else
        {
            readData->recordPtr = &record;
        }
    }
    else
    {
        // The old record is not a duplicate so the new record is
        // a duplicate if it is not paired.
        if(recordPaired == false)
        {
            // This record is a duplicate, so mark it and release it.
            handleDuplicate(recordCount, &record);
        }
    }

    // Only paired processing is left, so return if not paired.
    if(recordPaired == false)
    {
        // Not paired, no more operations required, so return.
        return;
    }
    
    // This is a paired record, so check for its mate.
    uint64_t readPos = 
        SamHelper::combineChromPos(chromID,
                                   record.get0BasedPosition());
    uint64_t matePos =
        SamHelper::combineChromPos(mateChromID, 
                                   record.get0BasedMatePosition());
    SamRecord* mateRecord = NULL;
    int mateIndex = 0;
    
    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate map is stored by the mate position, so look for this 
        // record's position.
        // The mate should be in the mate map, so find it.
        std::pair<MateMap::iterator,MateMap::iterator> matches =
            myMateMap.equal_range(readPos);
        // Loop through the elements that matched the pos looking for the mate.
        for(MateMap::iterator iter = matches.first; 
            iter != matches.second; iter++)
        {
            if(strcmp((*iter).second.recordPtr->getReadName(), 
                      record.getReadName()) == 0)
            {
                // Found the match.
                ReadData* mateData = &((*iter).second);
                // Update the quality and track the mate record and index.
                sumBaseQual += mateData->sumBaseQual;
                mateIndex = mateData->recordIndex;
                mateRecord = mateData->recordPtr;
                // Remove the entry from the map.
                myMateMap.erase(iter);
                break;
            }
        }
    }
    if((mateRecord == NULL) && (matePos >= readPos))
    {
        // Haven't gotten to the mate yet, so store this record.
        MateMap::iterator mateIter = 
            myMateMap.insert(std::make_pair(matePos, ReadData()));
        mateIter->second.sumBaseQual = sumBaseQual;
        mateIter->second.recordPtr = &record;
        mateIter->second.recordIndex = recordCount;
        // No more processing for this record is necessary.
        return;
    }

    if(mateRecord == NULL)
    {
        // Passed the mate, but it was not found.
        handleMissingMate(&record);
        return;
    }

    // Make the paired key.
    mateKey.updateKey(*mateRecord, getLibraryID(*mateRecord));
    PairedKey pkey(key, mateKey);

    // Check to see if this pair is a duplicate.
    PairedMapInsertReturn pairedReturn = 
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData* storedPair = &(pairedReturn.first->second);

    // Get the index for "record 1" - the one with the earlier coordinate.
    int record1Index = getFirstIndex(key, recordCount,
                                          mateKey, mateIndex);

    // Check if we have already found a duplicate pair.
    // If there is no duplicate found, there is nothing more to do.
    if(pairedReturn.second == false)
    {
        // Duplicate found.
        bool keepStored = true;
        if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
        {
            // The new pair has higher quality, so keep that.
            keepStored = false;
        }
        else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
        {
            // Same quality, so keep the one with the earlier record1Index.
            if(record1Index < storedPair->record1Index)
            {
                // The new pair has an earlier lower coordinate read,
                // so keep that.
                keepStored = false;
            }
        }
        // Check to see which one should be kept by checking qualities.
        if(keepStored)
        {
            // The old pair had higher quality so mark the new pair as a
            // duplicate and release them.
            handleDuplicate(mateIndex, mateRecord);
            handleDuplicate(recordCount, &record);
        }
        else
        {
            // The new pair has higher quality, so keep that.
            // First mark the previous one as duplicates and release them.
            handleDuplicate(storedPair->record1Index, storedPair->record1Ptr);
            handleDuplicate(storedPair->record2Index, storedPair->record2Ptr);
            // Store this pair's information.
            if(record1Index == mateIndex)
            {
                // Mate has a lower coordinate, so make mate
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Ptr = mateRecord;
                storedPair->record2Ptr = &record;
                storedPair->record1Index = mateIndex;
                storedPair->record2Index = recordCount;
            }
            else
            {
                // This record has a lower coordinate, so make it
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Ptr = &record;
                storedPair->record2Ptr = mateRecord;
                storedPair->record1Index = recordCount;
                storedPair->record2Index = mateIndex;
            }
        }
    }
    else
    {
        // Store this pair's information.
        storedPair->sumBaseQual = sumBaseQual;

        if(record1Index == mateIndex)
        {
            // Mate has a lower coordinate, so make mate
            // record1.
            storedPair->record1Ptr = mateRecord;
            storedPair->record2Ptr = &record;
            storedPair->record1Index = mateIndex;
            storedPair->record2Index = recordCount;
        }
        else
        {
            // This record has a lower coordinate, so make it
            // record1.
            storedPair->record1Ptr = &record;
            storedPair->record2Ptr = mateRecord;
            storedPair->record1Index = recordCount;
            storedPair->record2Index = mateIndex;
        }
    }
}
Ejemplo n.º 13
0
// Dump the reference information from specified SAM/BAM file.
int DumpRefInfo::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    bool noeof = false;
    bool printRecordRefs = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("printRecordRefs", &printRecordRefs)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the input file for reading.
    SamFile samIn;
    samIn.OpenForRead(inFile);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
    int numReferences = refInfo.getNumEntries();
    
    for(int i = 0; i < numReferences; i++)
    {
        std::cout << "Reference Index " << i;
        std::cout << "; Name: " << refInfo.getReferenceName(i)
                  << std::endl;
    }
    if(numReferences == 0)
    {
        // There is no reference info.
        std::cerr << "The header contains no reference information.\n";
    }

    // If we are to print the references as found in the records, loop
    // through reading the records.
    if(printRecordRefs)
    {
        SamRecord samRecord;

        // Track the prev name/id.
        std::string prevName = "";
        int prevID = -2;
        int recCount = 0; // track the num records in a ref.
        // Keep reading records until ReadRecord returns false.
        while(samIn.ReadRecord(samHeader, samRecord))
        {
            const char* name = samRecord.getReferenceName();
            int id = samRecord.getReferenceID();
            if((strcmp(name, prevName.c_str()) != 0) || (id != prevID))
            {
                if(prevID != -2)
                {
                    std::cout << "\tRef ID: " << prevID
                              << "\tRef Name: " << prevName 
                              << "\tNumRecs: " << recCount
                              << std::endl;
                }
                recCount = 0;
                prevID = id;
                prevName = name;
            }
            ++recCount;
        }
        // Print the last index.
        if(prevID != -2)
        {
            std::cout << "\tRef ID: " << prevID
                      << "\tRef Name: " << prevName 
                      << "\tNumRecs: " << recCount
                      << std::endl;
        }
    }
    return(SamStatus::SUCCESS);
}