// Returns the window over seq with the highest dust score double maxDustWindow(const std::string& seq, size_t windowSize, size_t minWindow) { double maxScore = 0.0f; for(size_t i = 0; i < seq.size(); i += 1) { size_t r = seq.size() - i; size_t w = r < windowSize ? r : windowSize; if(w >= minWindow) // don't calculate score for small windows { double s = calculateDustScore(seq.substr(i, w)); if(s > maxScore) maxScore = s; } } return maxScore; }
// Process a single read by quality trimming, filtering // returns true if the read should be kept bool processRead(SeqRecord& record) { // let's remove the adapter if the user has requested so // before doing any filtering if(!opt::adapterF.empty()) { std::string _tmp(record.seq.toString()); size_t found = _tmp.find(opt::adapterF); int _length; if(found != std::string::npos) { _length = opt::adapterF.length(); } else { // Couldn't find the fwd adapter; Try the reverse version found = _tmp.find(opt::adapterR); _length = opt::adapterR.length(); } if(found != std::string::npos) // found the adapter { _tmp.erase(found, _length); record.seq = _tmp; // We have to remove the qualities of the adapter if(!record.qual.empty()) { _tmp = record.qual; _tmp.erase(found, _length); record.qual = _tmp; } } } // Check if the sequence has uncalled bases std::string seqStr = record.seq.toString(); std::string qualStr = record.qual; ++s_numReadsRead; s_numBasesRead += seqStr.size(); // If ambiguity codes are present in the sequence // and the user wants to keep them, we randomly // select one of the DNA symbols from the set of // possible bases if(!opt::bDiscardAmbiguous) { for(size_t i = 0; i < seqStr.size(); ++i) { // Convert '.' to 'N' if(seqStr[i] == '.') seqStr[i] = 'N'; if(!IUPAC::isAmbiguous(seqStr[i])) continue; // Get the string of possible bases for this ambiguity code std::string possibles = IUPAC::getPossibleSymbols(seqStr[i]); // select one of the bases at random int j = rand() % possibles.size(); seqStr[i] = possibles[j]; } } // Ensure sequence is entirely ACGT size_t pos = seqStr.find_first_not_of("ACGT"); if(pos != std::string::npos) return false; // Validate the quality string (if present) and // perform any necessary transformations if(!qualStr.empty()) { // Calculate the range of phred scores for validation bool allValid = true; for(size_t i = 0; i < qualStr.size(); ++i) { if(opt::qualityScale == QS_PHRED64) qualStr[i] = Quality::phred64toPhred33(qualStr[i]); allValid = Quality::isValidPhred33(qualStr[i]) && allValid; } if(!allValid) { std::cerr << "Error: read " << record.id << " has out of range quality values.\n"; std::cerr << "Expected phred" << (opt::qualityScale == QS_SANGER ? "33" : "64") << ".\n"; std::cerr << "Quality string: " << qualStr << "\n"; std::cerr << "Check your data and re-run preprocess with the correct quality scaling flag.\n"; exit(EXIT_FAILURE); } } // Hard clip if(opt::hardClip > 0) { seqStr = seqStr.substr(0, opt::hardClip); if(!qualStr.empty()) qualStr = qualStr.substr(0, opt::hardClip); } // Quality trim if(opt::qualityTrim > 0 && !qualStr.empty()) softClip(opt::qualityTrim, seqStr, qualStr); // Quality filter if(opt::qualityFilter >= 0 && !qualStr.empty()) { int numLowQuality = countLowQuality(seqStr, qualStr); if(numLowQuality > opt::qualityFilter) return false; } // Dust filter if(opt::bDustFilter) { double dustScore = calculateDustScore(seqStr); bool bAcceptDust = dustScore < opt::dustThreshold; if(!bAcceptDust) { s_numFailedDust += 1; if(opt::verbose >= 1) { printf("Failed dust: %s %s %lf\n", record.id.c_str(), seqStr.c_str(), dustScore); } return false; } } // Filter by GC content if(opt::bFilterGC) { double gc = calcGC(seqStr); if(gc < opt::minGC || gc > opt::maxGC) return false; } // Primer screen if(!opt::bDisablePrimerCheck) { bool containsPrimer = PrimerScreen::containsPrimer(seqStr); if(containsPrimer) { ++s_numReadsPrimer; return false; } } record.seq = seqStr; record.qual = qualStr; if(record.seq.length() == 0 || record.seq.length() < opt::minLength) return false; return true; }