Пример #1
0
// Soft clips the given record at its front and/or its back.
//
// The clipping itself is delegated to the cigar-level softClip overload,
// which reports through its return status whether anything changed and
// fills in the adjusted start position and the rewritten cigar.
//
//   record        - the read to clip; updated or filtered in place.
//   numFrontClips - number of clips requested at the front of the read.
//   numBackClips  - number of clips requested at the back of the read.
//
// Returns the status produced by the cigar-level softClip:
//   FILTERED - the record was handed to filterRead() instead of being clipped,
//   CLIPPED  - the record's cigar and 0-based position were replaced,
//   otherwise the record is left untouched.
SamFilter::FilterStatus SamFilter::softClip(SamRecord& record,
                                            int32_t numFrontClips,
                                            int32_t numBackClips)
{
    // Gather the inputs for the cigar-level overload.  The position and
    // the roller are in/out arguments that receive the clipped values.
    Cigar* const currentCigar = record.getCigarInfo();
    int32_t clippedStart = record.get0BasedPosition();
    CigarRoller clippedCigar;

    const FilterStatus outcome = softClip(*currentCigar,
                                          numFrontClips,
                                          numBackClips,
                                          clippedStart,
                                          clippedCigar);

    if(outcome == CLIPPED)
    {
        // Only part of the read was clipped: store the new cigar and
        // the new starting position back into the record.
        record.setCigar(clippedCigar);
        record.set0BasedPosition(clippedStart);
    }
    else if(outcome == FILTERED)
    {
        // The whole read would have been clipped, so filter it out
        // rather than clipping it.
        filterRead(record);
    }

    return(outcome);
}
Пример #2
0
// Runs a single sample through the distortion selected by `type`.
//
//   input - the sample to process.
//   type  - one of distTypeTube, distTypeSoftClip or distTypeExpClip.
//
// Returns the output of the matching clip function, or `input` unchanged
// when `type` is none of the known distortion types.
float MultiDistortion::processSample(float input, int type)
{
    if (type == distTypeTube)
        return tubeClip(input);

    if (type == distTypeSoftClip)
        return softClip(input);

    if (type == distTypeExpClip)
        return expClip(input);

    // Unrecognised type: pass the sample through untouched.
    return input;
}
Пример #3
0
// Preprocess a single read in place.
//
// Steps, in order: optional adapter removal, read/base statistics,
// optional random resolution of ambiguity codes, ACGT-only check,
// quality conversion and validation, hard clip, quality trim, and the
// quality, dust, GC, primer and minimum-length filters.
//
// record - the read to process.  Its seq/qual are rewritten by adapter
//          removal and, if all filters up to the length check pass, by the
//          fully processed sequence and quality strings.
//
// Returns true if the read should be kept, false if it should be dropped.
// Terminates the program via exit(EXIT_FAILURE) when a quality value is
// out of range.
bool processRead(SeqRecord& record)
{
    // Adapter removal is done before any filtering, and only when the
    // user supplied a forward adapter.
    if(!opt::adapterF.empty())
    {
        std::string bases(record.seq.toString());

        // Try the forward adapter first and fall back to the reverse one.
        size_t adapterPos = bases.find(opt::adapterF);
        int adapterLen = opt::adapterF.length();
        if(adapterPos == std::string::npos)
        {
            adapterPos = bases.find(opt::adapterR);
            adapterLen = opt::adapterR.length();
        }

        if(adapterPos != std::string::npos)
        {
            // Cut the adapter out of the sequence...
            bases.erase(adapterPos, adapterLen);
            record.seq = bases;

            // ...and cut the same span out of the qualities, if present.
            if(!record.qual.empty())
            {
                std::string quals = record.qual;
                quals.erase(adapterPos, adapterLen);
                record.qual = quals;
            }
        }
    }

    // Work on local copies; the record is only updated at the very end.
    std::string sequence = record.seq.toString();
    std::string qualities = record.qual;

    // Bookkeeping of everything that was read, kept or not.
    s_numBasesRead += sequence.size();
    ++s_numReadsRead;

    // When ambiguous bases are to be kept, replace each ambiguity code
    // with one randomly chosen base out of the set it stands for.
    if(!opt::bDiscardAmbiguous)
    {
        for(size_t idx = 0; idx < sequence.size(); ++idx)
        {
            char& symbol = sequence[idx];

            // '.' is treated as 'N'
            if(symbol == '.')
                symbol = 'N';

            if(IUPAC::isAmbiguous(symbol))
            {
                const std::string candidates = IUPAC::getPossibleSymbols(symbol);
                const int pick = rand() % candidates.size();
                symbol = candidates[pick];
            }
        }
    }

    // Anything other than A, C, G or T at this point rejects the read.
    if(sequence.find_first_not_of("ACGT") != std::string::npos)
        return false;

    // Convert the qualities to phred33 if needed and validate them.
    if(!qualities.empty())
    {
        bool sawInvalid = false;
        for(size_t idx = 0; idx < qualities.size(); ++idx)
        {
            if(opt::qualityScale == QS_PHRED64)
                qualities[idx] = Quality::phred64toPhred33(qualities[idx]);

            if(!Quality::isValidPhred33(qualities[idx]))
                sawInvalid = true;
        }

        if(sawInvalid)
        {
            std::cerr << "Error: read " << record.id << " has out of range quality values.\n";
            std::cerr << "Expected phred" << (opt::qualityScale == QS_SANGER ? "33" : "64") << ".\n";
            std::cerr << "Quality string: "  << qualities << "\n";
            std::cerr << "Check your data and re-run preprocess with the correct quality scaling flag.\n";
            exit(EXIT_FAILURE);
        }
    }

    // Hard clip: keep only the first opt::hardClip symbols.
    if(opt::hardClip > 0)
    {
        sequence = sequence.substr(0, opt::hardClip);
        if(!qualities.empty())
            qualities = qualities.substr(0, opt::hardClip);
    }

    // Quality trim (needs qualities to work with).
    if(!qualities.empty() && opt::qualityTrim > 0)
        softClip(opt::qualityTrim, sequence, qualities);

    // Quality filter: reject reads with too many low quality bases.
    if(!qualities.empty() && opt::qualityFilter >= 0)
    {
        int lowQualityCount = countLowQuality(sequence, qualities);
        if(lowQualityCount > opt::qualityFilter)
            return false;
    }

    // Dust filter: reject reads whose dust score is not below the threshold.
    if(opt::bDustFilter)
    {
        double dustScore = calculateDustScore(sequence);
        if(!(dustScore < opt::dustThreshold))
        {
            s_numFailedDust += 1;
            if(opt::verbose >= 1)
            {
                printf("Failed dust: %s %s %lf\n", record.id.c_str(),
                                                   sequence.c_str(),
                                                   dustScore);
            }
            return false;
        }
    }

    // GC filter: reject reads whose GC content is outside [minGC, maxGC].
    if(opt::bFilterGC)
    {
        double gcContent = calcGC(sequence);
        if(gcContent < opt::minGC || gcContent > opt::maxGC)
            return false;
    }

    // Primer screen: reject (and count) reads that contain a primer.
    if(!opt::bDisablePrimerCheck)
    {
        if(PrimerScreen::containsPrimer(sequence))
        {
            ++s_numReadsPrimer;
            return false;
        }
    }

    // Store the processed data back into the record.
    record.seq = sequence;
    record.qual = qualities;

    // Keep the read only if it is non-empty and at least opt::minLength long.
    const bool longEnough = record.seq.length() != 0 &&
                            !(record.seq.length() < opt::minLength);
    return longEnough;
}
Пример #4
0
// Soft clip the ends of a record whose bases mismatch the reference too often.
//
// The read is walked from the front and from the back at the same time.
// In each direction a running count of bases and of mismatches is kept.
// Whenever a mismatch pushes the running mismatch fraction for that
// direction above mismatchThreshold, the clip position for that direction
// is moved to that base and both counters for that direction are reset.
// Each direction stops when its iterator is exhausted or when it reaches
// the clip position established by the other direction.
//
// Parameters:
//   record            - the read to examine; it is passed on to
//                       softClip(record, numFront, numBack) at the end.
//   refSequence       - the reference handed to the match/mismatch iterators.
//   mismatchThreshold - limit for (mismatches / bases).  Note that this is
//                       compared against a fraction, not a 0-100 percentage,
//                       despite the local name mismatchPercent.
//
// Returns the FilterStatus produced by softClip(record, numFront, numBack).
SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record, 
                                                           GenomeSequence& refSequence,
                                                           double mismatchThreshold)
{
    // One iterator per direction: from the front (left) and from the
    // back (right).
    // NOTE(review): the third constructor argument presumably selects the
    // direction (true = start at the front) - confirm against
    // SamQuerySeqWithRefIter.
    SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
    SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);

    // Scratch object shared by both directions; it is overwritten by every
    // getNextMatchMismatch() call.
    SamSingleBaseMatchInfo baseMatchInfo;

    int32_t readLength = record.getReadLength();
    // Init last front clip to be prior to the first index (0), meaning
    // "nothing clipped from the front yet".
    const int32_t initialLastFrontClipPos = -1;
    int32_t lastFrontClipPos = initialLastFrontClipPos;
    // Init first back clip to be past the last index (readLength), meaning
    // "nothing clipped from the back yet".
    int32_t firstBackClipPos = readLength;

    // Per-direction state: whether that direction is finished, and the
    // bases/mismatches seen since that direction's last clip.
    bool fromFrontComplete = false;
    bool fromBackComplete = false;
    int32_t numBasesFromFront = 0;
    int32_t numBasesFromBack = 0;
    int32_t numMismatchFromFront = 0;
    int32_t numMismatchFromBack = 0;

    //////////////////////////////////////////////////////////
    // Determining the clip positions.
    // Alternate between the two directions until both are finished.
    while(!fromFrontComplete || !fromBackComplete)
    {
        // Read from the front (left to right) of the read until
        // more have been read from that direction than the opposite
        // direction, or without that limit once the back is finished.
        while(!fromFrontComplete && 
              ((numBasesFromFront <= numBasesFromBack) ||
               (fromBackComplete)))
        {
            if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
            {
                // Nothing more to read in this direction.
                fromFrontComplete = true;
                break;
            }
            // Got a base.  Check to see if it is at or past the first
            // base clipped from the back.
            if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
            {
                // This base is already being clipped from the back, so we
                // are done reading in this direction.
                fromFrontComplete = true;
                break;
            }
            // This is an actual base read from the left to the
            // right, so up the counter and determine if it was a mismatch.
            ++numBasesFromFront;

            if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
            {
                // Mismatch
                ++numMismatchFromFront;
                // Check to see if the mismatch fraction since the last
                // front clip is over the threshold.
                double mismatchPercent = 
                    (double)numMismatchFromFront / numBasesFromFront;
                if(mismatchPercent > mismatchThreshold)
                {
                    // Need to clip everything up to and including this base.
                    lastFrontClipPos = baseMatchInfo.getQueryIndex();
                    // Reset the counters so the fraction is measured
                    // from the new clip position onwards.
                    numBasesFromFront = 0;
                    numMismatchFromFront = 0;
                }
            }
        }

        // Now, read from right to left until more have been read
        // from the back than from the front, or without that limit once
        // the front is finished.
        while(!fromBackComplete && 
              ((numBasesFromBack <= numBasesFromFront) ||
               (fromFrontComplete)))
        {
            if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
            {
                // Nothing more to read in this direction.
                fromBackComplete = true;
                break;
            }
            // Got a base.  Check to see if it is at or before the last
            // base clipped from the front.
            if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
            {
                // This base is already being clipped from the front, so we
                // are done reading in this direction.
                fromBackComplete = true;
                break;
            }
            // This is an actual base read from the right to the
            // left, so up the counter and determine if it was a mismatch.
            ++numBasesFromBack;

            if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
            {
                // Mismatch
                ++numMismatchFromBack;
                // Check to see if the mismatch fraction since the last
                // back clip is over the threshold.
                double mismatchPercent = 
                    (double)numMismatchFromBack / numBasesFromBack;
                if(mismatchPercent > mismatchThreshold)
                {
                    // Need to clip this base and everything after it.
                    firstBackClipPos = baseMatchInfo.getQueryIndex();
                    // Reset the counters so the fraction is measured
                    // from the new clip position onwards.
                    numBasesFromBack = 0;
                    numMismatchFromBack = 0;
                }
            }
        }
    }

    //////////////////////////////////////////////////////////
    // Done determining the clip positions, so clip.
    // To determine the number of clips from the front, add 1 to the
    // lastFrontClipPos since the index starts at 0.
    // To determine the number of clips from the back, subtract the
    // firstBackClipPos from the readLength.
    // Example:
    // Pos:  012345
    // Read: AAAAAA
    // Read Length = 6.  If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
    // With the initial values (-1 and readLength) both counts are 0.
    return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
}