Ejemplo n.º 1
0
bool SamFile::checkRecordInSection(SamRecord& record)
{
    bool recordFound = true;
    if(myRefID == BamIndex::REF_ID_ALL)
    {
        return(true);
    }
    // Check to see if it is in the correct reference/position.
    if(record.getReferenceID() != myRefID)
    {
        // Incorrect reference ID, return no more records.
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }
   
    // Found a record.
    recordFound = true;

    // If start/end position are set, verify that the alignment falls
    // within those.
    // If the alignment start is greater than the end of the region,
    // return NO_MORE_RECS.
    // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside
    // of the region.
    if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos))
    {
        myStatus = SamStatus::NO_MORE_RECS;
        return(false);
    }
        
    // We know the start is less than the end position, so the alignment
    // overlaps the region if the alignment end position is greater than the
    // start of the region.
    if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos))
    {
        // If it does not overlap the region, so go to the next
        // record...set recordFound back to false.
        recordFound = false;
    }

    if(!myOverlapSection)
    {
        // Needs to be fully contained.  Not fully contained if
        // 1) the record start position is < the region start position.
        // or
        // 2) the end position is specified and the record end position
        //    is greater than or equal to the region end position.
        //    (equal to since the region is exclusive.
        if((record.get0BasedPosition() < myStartPos) ||
           ((myEndPos != -1) && 
            (record.get0BasedAlignmentEnd() >= myEndPos)))
        {
            // This record is not fully contained, so move on to the next
            // record.
            recordFound = false;
        }
    }

    return(recordFound);
}
Ejemplo n.º 2
0
void validateRead1ModQuality(SamRecord& samRecord)
{
    //////////////////////////////////////////
    // Validate Record 1
    // Create record structure for validating.
    int expectedBlockSize = 89;
    const char* expectedReferenceName = "1";
    const char* expectedMateReferenceName = "1";
    const char* expectedMateReferenceNameOrEqual = "=";

    bamRecordStruct* expectedRecordPtr =
        (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int));

    char tag[3];
    char type;
    void* value;
    bamRecordStruct* bufferPtr;
    unsigned char* varPtr;

    expectedRecordPtr->myBlockSize = expectedBlockSize;
    expectedRecordPtr->myReferenceID = 0;
    expectedRecordPtr->myPosition = 1010;
    expectedRecordPtr->myReadNameLength = 23;
    expectedRecordPtr->myMapQuality = 0;
    expectedRecordPtr->myBin = 4681;
    expectedRecordPtr->myCigarLength = 2;
    expectedRecordPtr->myFlag = 73;
    expectedRecordPtr->myReadLength = 5;
    expectedRecordPtr->myMateReferenceID = 0;
    expectedRecordPtr->myMatePosition = 1010;
    expectedRecordPtr->myInsertSize = 0;
   
    // Check the alignment end
    assert(samRecord.get0BasedAlignmentEnd() == 1016);
    assert(samRecord.get1BasedAlignmentEnd() == 1017);
    assert(samRecord.getAlignmentLength() == 7);
    assert(samRecord.get0BasedUnclippedStart() == 1010);
    assert(samRecord.get1BasedUnclippedStart() == 1011);
    assert(samRecord.get0BasedUnclippedEnd() == 1016);
    assert(samRecord.get1BasedUnclippedEnd() == 1017);

    // Check the accessors.
    assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize);
    assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID);
    assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0);
    assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1);
    assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition);
    assert(samRecord.getReadNameLength() == 
           expectedRecordPtr->myReadNameLength);
    assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality);
    assert(samRecord.getBin() == expectedRecordPtr->myBin);
    assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength);
    assert(samRecord.getFlag() == expectedRecordPtr->myFlag);
    assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength);
    assert(samRecord.getMateReferenceID() ==
           expectedRecordPtr->myMateReferenceID);
    assert(strcmp(samRecord.getMateReferenceName(), 
                  expectedMateReferenceName) == 0);
    assert(strcmp(samRecord.getMateReferenceNameOrEqual(), 
                  expectedMateReferenceNameOrEqual) == 0);
    assert(samRecord.get1BasedMatePosition() == 
           expectedRecordPtr->myMatePosition + 1);
    assert(samRecord.get0BasedMatePosition() ==
           expectedRecordPtr->myMatePosition);
    assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize);
    assert(strcmp(samRecord.getReadName(), "1:1011:F:255+17M15D20M") == 0);
    assert(strcmp(samRecord.getCigar(), "5M2D") == 0);
    assert(strcmp(samRecord.getSequence(), "CCGAA") == 0);
    assert(strcmp(samRecord.getQuality(), "ABCDE") == 0);
    assert(samRecord.getNumOverlaps(1010, 1017) == 5);
    assert(samRecord.getNumOverlaps(1010, 1016) == 5);
    assert(samRecord.getNumOverlaps(1012, 1017) == 3);
    assert(samRecord.getNumOverlaps(1015, 1017) == 0);
    assert(samRecord.getNumOverlaps(1017, 1010) == 0);
    assert(samRecord.getNumOverlaps(1013, 1011) == 0);
    assert(samRecord.getNumOverlaps(-1, 1017) == 5);

    // Reset the tag iter, since the tags have already been read.
    samRecord.resetTagIter();

    // Check the tags.
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'A');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'M');
    assert(tag[1] == 'D');
    assert(type == 'Z');
    assert(*(String*)value == "37");
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'N');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'X');
    assert(tag[1] == 'T');
    assert(type == 'A');
    assert(*(char*)value == 'R');
    // No more tags, should return false.
    assert(samRecord.getNextSamTag(tag, type, &value) == false);
    assert(samRecord.getNextSamTag(tag, type, &value) == false);

    // Get the record ptr.   
    bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer();
    // Validate the buffers match.
    assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize);
    assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID);
    assert(bufferPtr->myPosition == expectedRecordPtr->myPosition);
    assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength);
    assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality);
    assert(bufferPtr->myBin == expectedRecordPtr->myBin);
    assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength);
    assert(bufferPtr->myFlag == expectedRecordPtr->myFlag);
    assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength);
    assert(bufferPtr->myMateReferenceID ==
           expectedRecordPtr->myMateReferenceID);
    assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition);
    assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize);

    // Validate the variable length fields in the buffer.
    // Set the pointer to the start of the variable fields.
    varPtr = (unsigned char*)(&(bufferPtr->myData[0]));

    // Validate the readname.
    for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++)
    {
        assert(*varPtr == samRecord.getReadName()[i]);
        varPtr++;
    }

    // Validate the cigar.
    // The First cigar is 5M which is 5 << 4 | 0 = 80
    assert(*(unsigned int*)varPtr == 80);
    // Increment the varptr the size of an int.
    varPtr += 4;
    // The 2nd cigar is 2D which is 2 << 4 | 2 = 34
    assert(*(unsigned int*)varPtr == 34);
    // Increment the varptr the size of an int.
    varPtr += 4;
   
    // Validate the sequence.
    // CC = 0x22
    assert(*varPtr == 0x22);
    varPtr++;
    // GA = 0x41
    assert(*varPtr == 0x41);
    varPtr++;
    // A  = 0x10
    assert(*varPtr == 0x10);
    varPtr++;
  
    // Validate the Quality
    for(int i = 0; i < expectedRecordPtr->myReadLength; i++)
    {
        assert(*varPtr == samRecord.getQuality()[i] - 33);
        varPtr++;
    }

    // Validate the tags.  
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'D');
    varPtr++;
    assert(*varPtr == 'Z');
    varPtr++;
    assert(*varPtr == '3');
    varPtr++;
    assert(*varPtr == '7');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'N');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'X');
    varPtr++;
    assert(*varPtr == 'T');
    varPtr++;
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'R');
    varPtr++;
}
Ejemplo n.º 3
0
int GapInfo::processFile(const char* inputFileName, const char* outputFileName,
                         const char* refFile, bool detailed,
                         bool checkFirst, bool checkStrand)
{
    // Open the file for reading.
    SamFile samIn;
    samIn.OpenForRead(inputFileName);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    SamRecord samRecord;

    GenomeSequence* refPtr = NULL;
    if(strcmp(refFile, "") != 0)
    {
        refPtr = new GenomeSequence(refFile);
    }

    IFILE outFile = ifopen(outputFileName, "w");

    // Map for summary.
    std::map<int, int> gapInfoMap;


    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        uint16_t samFlags = samRecord.getFlag();

        if((!SamFlag::isMapped(samFlags)) || 
           (!SamFlag::isMateMapped(samFlags)) ||
           (!SamFlag::isPaired(samFlags)) ||
           (samFlags & SamFlag::SECONDARY_ALIGNMENT) || 
           (SamFlag::isDuplicate(samFlags)) ||
           (SamFlag::isQCFailure(samFlags)))
        {
            // unmapped, mate unmapped, not paired,
            // not the primary alignment,
            // duplicate, fails vendor quality check 
            continue;
        }

        // No gap info if the chromosome names are different or
        // are unknown.
        int32_t refID = samRecord.getReferenceID();
        if((refID != samRecord.getMateReferenceID()) || (refID == -1))
        {
            continue;
        }

        int32_t readStart = samRecord.get0BasedPosition();
        int32_t mateStart = samRecord.get0BasedMatePosition();

        // If the mate starts first, then the pair was processed by
        // the mate.
        if(mateStart < readStart)
        {
            continue;
        }
        if((mateStart == readStart) && (SamFlag::isReverse(samFlags)))
        {
            // read and mate start at the same position, so 
            // only process the forward strand.
            continue;
        }

        // Process this read pair.
        int32_t readEnd = samRecord.get0BasedAlignmentEnd();
        
        int32_t gapSize = mateStart - readEnd - 1;

        if(detailed)
        {
            // Output the gap info.
            ifprintf(outFile, "%s\t%d\t%d", 
                     samRecord.getReferenceName(), readEnd+1, gapSize);
            
            // Check if it is not the first or if it is not the forward strand.
            if(checkFirst && !SamFlag::isFirstFragment(samFlags))
            {
                ifprintf(outFile, "\tNotFirst");
            }
            if(checkStrand && SamFlag::isReverse(samFlags))
            {
                ifprintf(outFile, "\tReverse");
            }
            ifprintf(outFile, "\n");
        }
        else
        {
            // Summary.
            // Skip reads that are not the forward strand.
            if(SamFlag::isReverse(samFlags))
            {
                // continue
                continue;
            }

            // Forward.
            // Check the reference for 'N's.
            if(refPtr != NULL)
            {
                genomeIndex_t chromStartIndex = 
                    refPtr->getGenomePosition(samRecord.getReferenceName());
                if(chromStartIndex == INVALID_GENOME_INDEX)
                {
                    // Invalid position, so continue to the next one.
                    continue;
                }
                bool skipRead = false;
                for(int i = readEnd + 1; i < mateStart; i++)
                {
                    if((*refPtr)[i] == 'N')
                    {
                        // 'N' in the reference, so continue to the next read.
                        skipRead = true;
                        break;
                    }
                }
                if(skipRead)
                {
                    continue;
                }
            }
            
            // Update the gapInfo.
            gapInfoMap[gapSize]++;
        }
    }

    if(!detailed)
    {
        // Output the summary.
        ifprintf(outFile, "GapSize\tNumPairs\n");
        for(std::map<int,int>::iterator iter = gapInfoMap.begin(); 
            iter != gapInfoMap.end(); iter++)
        {
            ifprintf(outFile, "%d\t%d\n", (*iter).first, (*iter).second);
        }
    }
    

    SamStatus::Status returnStatus = samIn.GetStatus();
    if(returnStatus == SamStatus::NO_MORE_RECS)
    {
        return(SamStatus::SUCCESS);
    }
    return(returnStatus);
}
Ejemplo n.º 4
0
// Soft Clip from the end of the read at the specified reference position.
int32_t CigarHelper::softClipEndByRefPos(SamRecord& record, 
                                         int32_t refPosition0Based,
                                         CigarRoller& newCigar)
{
    newCigar.clear();
    Cigar* cigar = record.getCigarInfo();
    if(cigar == NULL)
    {
        // Failed to get the cigar.
        ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
        return(NO_CLIP);
    }

    // No cigar or position in the record, so return no clip.
    if((cigar->size() == 0) || (record.get0BasedPosition() == -1))
    {
        return(NO_CLIP);
    }

    // Check to see if the reference position occurs after the record ends,
    // if so, do no clipping.
    if(refPosition0Based > record.get0BasedAlignmentEnd())
    {
        // Not within this read, so nothing to clip.
        newCigar.Set(record.getCigar());
        return(NO_CLIP);
    }

    // The position falls before the read ends, so loop through until the
    // position is found.
    int32_t currentRefPosition = record.get0BasedPosition();
    int32_t readClipPosition = 0;
    for(int i = 0; i < cigar->size(); i++)
    {
        const Cigar::CigarOperator* op = &(cigar->getOperator(i));

        // If the operation is found in the reference, increase the
        // reference position.
        if(Cigar::foundInReference(*op))
        {
            // match, mismatch, deletion, skip
            // increment the current reference position to just past
            // this operation.
            currentRefPosition += op->count;
        }
         
        // Check to see if we have hit our clip position.
        if(refPosition0Based < currentRefPosition)
        {
            // If this read is also in the query (match/mismatch), 
            // write the partial op to the new cigar.
            int32_t numKeep = 0;
            if(Cigar::foundInQuery(*op))
            {
                numKeep = op->count - (currentRefPosition - refPosition0Based);
                if(numKeep > 0)
                {
                    newCigar.Add(op->operation, numKeep);
                    readClipPosition += numKeep;
                }
            }
            else if(Cigar::isClip(*op))
            {
                // This is a hard clip, so write it.
                newCigar.Add(op->operation, op->count);
            }
            else
            {

                // Not found in the query (skip/deletion),
                // so don't write any of the operation.
            }
            // Found the clip point, so break.
            break;
        }
        else if(refPosition0Based == currentRefPosition)
        {
            newCigar += *op;
            if(Cigar::foundInQuery(*op))
            {
                readClipPosition += op->count;
            }
        }
        else
        {
            // Not yet to the clip position, so add this operation/size to
            // the new cigar.
            newCigar += *op;
            if(Cigar::foundInQuery(*op))
            {
                // Found in the query, so update the read clip position.
                readClipPosition += op->count;
            }
        }
    } // End loop through cigar.

    // Before adding the softclip, read from the end of the cigar checking to
    // see if the operations are in the query, removing operations that are
    // not (pad/delete/skip) until a hardclip or an operation in the query is
    // found. We do not want a pad/delete/skip right before a softclip.
    for(int j = newCigar.size() - 1; j >= 0; j--)
    {
        const Cigar::CigarOperator* op = &(newCigar.getOperator(j));
        if(!Cigar::foundInQuery(*op) && !Cigar::isClip(*op))
        {
            // pad/delete/skip
            newCigar.Remove(j);
        }
        else if(Cigar::foundInQuery(*op) & Cigar::isClip(*op))
        {
            // Soft clip, so increment the clip position for the return value.
            // Remove the softclip since the readClipPosition is used to
            // calculate teh size of the soft clip added.
            readClipPosition -= op->count;
            newCigar.Remove(j);
        }
        else
        {
            // Found a cigar operation that should not be deleted, so stop deleting.
            break;
        }
    } 

    // Determine the number of soft clips.
    int32_t numSoftClips = record.getReadLength() - readClipPosition;
    // NOTE that if the previous operation is a softclip, the CigarRoller logic
    // will merge this with that one.
    newCigar.Add(Cigar::softClip, numSoftClips);

    // Check if an ending hard clip needs to be added.
    if(cigar->size() != 0)
    {
        const Cigar::CigarOperator* lastOp = 
            &(cigar->getOperator(cigar->size() - 1));
        if(lastOp->operation == Cigar::hardClip)
        {
            newCigar += *lastOp;
        }
    }

    return(readClipPosition);
}
Ejemplo n.º 5
0
int WriteRegion::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String indexFile = "";
    String readName = "";
    String bed = "";
    myStart = UNSPECIFIED_INT;
    myEnd = UNSPECIFIED_INT;
    myPrevStart = UNSPECIFIED_INT;
    myPrevEnd = UNSPECIFIED_INT;
    myRefID = UNSET_REF;
    myRefName.Clear();
    myPrevRefName.Clear();
    myBedRefID = SamReferenceInfo::NO_REF_ID;
    bool lshift = false;
    bool noeof = false;
    bool params = false;
    myWithinReg = false;
    myWroteReg = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Region Parameters")        
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("refName", &myRefName)
        LONG_INTPARAMETER("refID", &myRefID)
        LONG_INTPARAMETER("start", &myStart)
        LONG_INTPARAMETER("end", &myEnd)
        LONG_STRINGPARAMETER("bed", &bed)
        LONG_PARAMETER("withinReg", &myWithinReg)
        LONG_STRINGPARAMETER("readName", &readName)
        LONG_PARAMETER_GROUP("Optional Other Parameters")
        LONG_PARAMETER("lshift", &lshift)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --in" << std::endl;
        return(-1);
    }
    if(outFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --out" << std::endl;
        return(-1);
    }
    
    if(indexFile == "")
    {
        // In file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }

    if(myRefID != UNSET_REF && myRefName.Length() != 0)
    {
        std::cerr << "Can't specify both refID and refName" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefID != UNSET_REF && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refID and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefName.Length() != 0 && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refName and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }

    if(!bed.IsEmpty())
    {
        myBedFile = ifopen(bed, "r");
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.   
    mySamIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);

    // Open the bam index file for reading if a region was specified.
    if((myRefName.Length() != 0) || (myRefID != UNSET_REF) || (myBedFile != NULL))
    {
        mySamIn.ReadBamIndex(indexFile);
    }

    // Read & write the sam header.
    mySamIn.ReadHeader(mySamHeader);
    samOut.WriteHeader(mySamHeader);

    // Read the sam records.
    SamRecord samRecord;
    // Track the status.
    int numSectionRecords = 0;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
        
    while(getNextSection())
    {
        // Keep reading records until they aren't anymore.
        while(mySamIn.ReadRecord(mySamHeader, samRecord))
        {
            if(!readName.IsEmpty())
            {
                // Check for readname.
                if(strcmp(samRecord.getReadName(), readName.c_str()) != 0)
                {
                    // not a matching read name, so continue to the next record.
                    continue;
                }
            }
            
            // Check to see if the read has already been processed.
            if(myPrevEnd != UNSPECIFIED_INT)
            {
                // Because we already know that the bed was sorted, 
                // we know that the previous section started before
                // this one, so if the previous end is greater than
                // this record's end position we know that it
                // was already written in the previous section.
                // Note: can't be equal to the previous end since
                // the end range was exclusive, while
                // get0BasedAlignmentEnd is inclusive.
                // myPrevEnd is reset by getNextSection when a new
                // chromosome is hit.
                if(samRecord.get0BasedAlignmentEnd() < myPrevEnd)
                {
                    // This record was already written.
                    continue;
                }
            }

            // Shift left if applicable.
            if(lshift)
            {
                samRecord.shiftIndelsLeft();
            }

            // Successfully read a record from the file, so write it.
            samOut.WriteRecord(mySamHeader, samRecord);
            ++numSectionRecords;
        }
        myWroteReg = true;
    }

    if(myBedFile != NULL)
    {
        ifclose(myBedFile);
    }
    std::cerr << "Wrote " << outFile << " with " << numSectionRecords
              << " records.\n";
    return(returnStatus);
}