Exemple #1
0
// Slide a window over seq, advancing one position at a time, and return the
// highest dust score produced by any scored window.
// Each window is windowSize characters long, except near the end of seq where
// it is truncated to the characters that remain. Windows shorter than
// minWindow are not scored. The running maximum starts at 0.0, so that is the
// result when no window is scored or no score exceeds it.
double maxDustWindow(const std::string& seq, size_t windowSize, size_t minWindow)
{
    const size_t len = seq.size();
    double best = 0.0;

    for(size_t start = 0; start < len; ++start)
    {
        // Clamp the window so it never runs past the end of the sequence
        size_t span = len - start;
        if(span > windowSize)
            span = windowSize;

        // Skip windows that are too small to give a meaningful score
        if(span < minWindow)
            continue;

        const double score = calculateDustScore(seq.substr(start, span));
        if(score > best)
            best = score;
    }

    return best;
}
// Process a single read: optional adapter removal, ambiguity resolution,
// quality validation/conversion, hard clipping, quality trimming and a chain
// of filters (low-quality count, dust, GC content, primer screen, length).
//
// Returns true if the read should be kept, false if any filter rejected it.
// Adapter removal is written back to record immediately. The remaining
// transformations are written back to record.seq / record.qual only once the
// read has passed every filter that precedes the final length check.
// Calls exit(EXIT_FAILURE) if the quality string holds out-of-range values.
bool processRead(SeqRecord& record)
{
    // Adapter removal happens first, before any filtering, and only when
    // the user supplied a forward adapter.
    if(!opt::adapterF.empty())
    {
        std::string trimmedSeq(record.seq.toString());

        // Look for the forward adapter; fall back to the reverse adapter
        // when the forward one is absent.
        size_t hit = trimmedSeq.find(opt::adapterF);
        int adapterLen = opt::adapterF.length();
        if(hit == std::string::npos)
        {
            hit = trimmedSeq.find(opt::adapterR);
            adapterLen = opt::adapterR.length();
        }

        if(hit != std::string::npos)
        {
            // Cut the first adapter occurrence out of the sequence
            trimmedSeq.erase(hit, adapterLen);
            record.seq = trimmedSeq;

            // ...and drop the quality values at the same positions
            if(!record.qual.empty())
            {
                std::string trimmedQual = record.qual;
                trimmedQual.erase(hit, adapterLen);
                record.qual = trimmedQual;
            }
        }
    }

    // Work on local copies; record is only updated near the end
    std::string bases = record.seq.toString();
    std::string quals = record.qual;

    // Input statistics are taken after adapter removal
    ++s_numReadsRead;
    s_numBasesRead += bases.size();

    // When ambiguous reads are not being discarded, replace every
    // ambiguity code with a randomly chosen member of the set of
    // bases it stands for.
    if(!opt::bDiscardAmbiguous)
    {
        for(size_t idx = 0; idx < bases.size(); ++idx)
        {
            // '.' is treated the same as 'N'
            if(bases[idx] == '.')
                bases[idx] = 'N';

            if(IUPAC::isAmbiguous(bases[idx]))
            {
                // Candidate symbols for this ambiguity code
                std::string candidates = IUPAC::getPossibleSymbols(bases[idx]);

                // Pick one of the candidates at random
                int choice = rand() % candidates.size();
                bases[idx] = candidates[choice];
            }
        }
    }

    // Reject the read unless every symbol is one of A, C, G, T
    if(bases.find_first_not_of("ACGT") != std::string::npos)
        return false;

    // If qualities are present, convert them to phred33 when the input
    // scale is phred64 and verify that every value is in range.
    if(!quals.empty())
    {
        bool inRange = true;
        for(size_t idx = 0; idx < quals.size(); ++idx)
        {
            if(opt::qualityScale == QS_PHRED64)
                quals[idx] = Quality::phred64toPhred33(quals[idx]);

            if(!Quality::isValidPhred33(quals[idx]))
                inRange = false;
        }

        if(!inRange)
        {
            // Bad quality values are fatal
            std::cerr << "Error: read " << record.id << " has out of range quality values.\n"
                      << "Expected phred" << (opt::qualityScale == QS_SANGER ? "33" : "64") << ".\n"
                      << "Quality string: "  << quals << "\n"
                      << "Check your data and re-run preprocess with the correct quality scaling flag.\n";
            exit(EXIT_FAILURE);
        }
    }

    // Hard clip: keep at most the first opt::hardClip symbols
    if(opt::hardClip > 0)
    {
        bases = bases.substr(0, opt::hardClip);
        if(!quals.empty())
            quals = quals.substr(0, opt::hardClip);
    }

    // Quality trim (needs quality values)
    if(opt::qualityTrim > 0 && !quals.empty())
        softClip(opt::qualityTrim, bases, quals);

    // Quality filter: reject reads with too many low-quality positions
    if(opt::qualityFilter >= 0 && !quals.empty())
    {
        int lowQualityCount = countLowQuality(bases, quals);
        if(lowQualityCount > opt::qualityFilter)
            return false;
    }

    // Dust filter: the read passes only when its score is below the threshold
    if(opt::bDustFilter)
    {
        double dustScore = calculateDustScore(bases);
        bool failedDust = !(dustScore < opt::dustThreshold);

        if(failedDust)
        {
            s_numFailedDust += 1;
            if(opt::verbose >= 1)
            {
                printf("Failed dust: %s %s %lf\n", record.id.c_str(),
                                                   bases.c_str(),
                                                   dustScore);
            }
            return false;
        }
    }

    // GC filter: reject reads whose GC value lies outside [minGC, maxGC]
    if(opt::bFilterGC)
    {
        double gcValue = calcGC(bases);
        if(gcValue < opt::minGC || gcValue > opt::maxGC)
            return false;
    }

    // Primer screen: reject (and count) reads that contain a primer
    if(!opt::bDisablePrimerCheck && PrimerScreen::containsPrimer(bases))
    {
        ++s_numReadsPrimer;
        return false;
    }

    // The read survived the filters above; commit the processed strings
    record.seq = bases;
    record.qual = quals;

    // Finally, drop reads that ended up empty or below the minimum length
    bool tooShort = record.seq.length() == 0 || record.seq.length() < opt::minLength;
    return !tooShort;
}