// Soft clip the record from the front and/or the back. SamFilter::FilterStatus SamFilter::softClip(SamRecord& record, int32_t numFrontClips, int32_t numBackClips) { ////////////////////////////////////////////////////////// Cigar* cigar = record.getCigarInfo(); FilterStatus status = NONE; int32_t startPos = record.get0BasedPosition(); CigarRoller updatedCigar; status = softClip(*cigar, numFrontClips, numBackClips, startPos, updatedCigar); if(status == FILTERED) { ///////////////////////////// // The entire read is clipped, so rather than clipping it, // filter it out. filterRead(record); return(FILTERED); } else if(status == CLIPPED) { // Part of the read was clipped, and now that we have // an updated cigar, update the read. record.setCigar(updatedCigar); // Update the starting position. record.set0BasedPosition(startPos); } return(status); }
float MultiDistortion::processSample(float input, int type) { // process input sample based on type switch (type) { case distTypeTube: return tubeClip(input); break; case distTypeSoftClip: return softClip(input); break; case distTypeExpClip: return expClip(input); default: return input; break; } }
// Process a single read by quality trimming, filtering // returns true if the read should be kept bool processRead(SeqRecord& record) { // let's remove the adapter if the user has requested so // before doing any filtering if(!opt::adapterF.empty()) { std::string _tmp(record.seq.toString()); size_t found = _tmp.find(opt::adapterF); int _length; if(found != std::string::npos) { _length = opt::adapterF.length(); } else { // Couldn't find the fwd adapter; Try the reverse version found = _tmp.find(opt::adapterR); _length = opt::adapterR.length(); } if(found != std::string::npos) // found the adapter { _tmp.erase(found, _length); record.seq = _tmp; // We have to remove the qualities of the adapter if(!record.qual.empty()) { _tmp = record.qual; _tmp.erase(found, _length); record.qual = _tmp; } } } // Check if the sequence has uncalled bases std::string seqStr = record.seq.toString(); std::string qualStr = record.qual; ++s_numReadsRead; s_numBasesRead += seqStr.size(); // If ambiguity codes are present in the sequence // and the user wants to keep them, we randomly // select one of the DNA symbols from the set of // possible bases if(!opt::bDiscardAmbiguous) { for(size_t i = 0; i < seqStr.size(); ++i) { // Convert '.' 
to 'N' if(seqStr[i] == '.') seqStr[i] = 'N'; if(!IUPAC::isAmbiguous(seqStr[i])) continue; // Get the string of possible bases for this ambiguity code std::string possibles = IUPAC::getPossibleSymbols(seqStr[i]); // select one of the bases at random int j = rand() % possibles.size(); seqStr[i] = possibles[j]; } } // Ensure sequence is entirely ACGT size_t pos = seqStr.find_first_not_of("ACGT"); if(pos != std::string::npos) return false; // Validate the quality string (if present) and // perform any necessary transformations if(!qualStr.empty()) { // Calculate the range of phred scores for validation bool allValid = true; for(size_t i = 0; i < qualStr.size(); ++i) { if(opt::qualityScale == QS_PHRED64) qualStr[i] = Quality::phred64toPhred33(qualStr[i]); allValid = Quality::isValidPhred33(qualStr[i]) && allValid; } if(!allValid) { std::cerr << "Error: read " << record.id << " has out of range quality values.\n"; std::cerr << "Expected phred" << (opt::qualityScale == QS_SANGER ? "33" : "64") << ".\n"; std::cerr << "Quality string: " << qualStr << "\n"; std::cerr << "Check your data and re-run preprocess with the correct quality scaling flag.\n"; exit(EXIT_FAILURE); } } // Hard clip if(opt::hardClip > 0) { seqStr = seqStr.substr(0, opt::hardClip); if(!qualStr.empty()) qualStr = qualStr.substr(0, opt::hardClip); } // Quality trim if(opt::qualityTrim > 0 && !qualStr.empty()) softClip(opt::qualityTrim, seqStr, qualStr); // Quality filter if(opt::qualityFilter >= 0 && !qualStr.empty()) { int numLowQuality = countLowQuality(seqStr, qualStr); if(numLowQuality > opt::qualityFilter) return false; } // Dust filter if(opt::bDustFilter) { double dustScore = calculateDustScore(seqStr); bool bAcceptDust = dustScore < opt::dustThreshold; if(!bAcceptDust) { s_numFailedDust += 1; if(opt::verbose >= 1) { printf("Failed dust: %s %s %lf\n", record.id.c_str(), seqStr.c_str(), dustScore); } return false; } } // Filter by GC content if(opt::bFilterGC) { double gc = calcGC(seqStr); if(gc < 
opt::minGC || gc > opt::maxGC) return false; } // Primer screen if(!opt::bDisablePrimerCheck) { bool containsPrimer = PrimerScreen::containsPrimer(seqStr); if(containsPrimer) { ++s_numReadsPrimer; return false; } } record.seq = seqStr; record.qual = qualStr; if(record.seq.length() == 0 || record.seq.length() < opt::minLength) return false; return true; }
// Soft clip the ends of a record wherever the running mismatch rate,
// measured inward from that end, exceeds mismatchThreshold.
//
// The read is walked from both ends at once, alternating direction so that
// neither side gets ahead of the other by more than one base. Each side
// keeps a count of bases and mismatches seen since its last clip point;
// when a mismatch pushes mismatches/bases above the threshold, the clip
// point moves to that base and that side's counters restart. A side stops
// when its iterator is exhausted or it reaches the other side's clip point.
// The resulting clip counts are passed to softClip(record, front, back) and
// its status is returned.
SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record,
                                                           GenomeSequence& refSequence,
                                                           double mismatchThreshold)
{
    // One iterator per direction: left-to-right and right-to-left.
    SamQuerySeqWithRefIter frontIter(record, refSequence, true);
    SamQuerySeqWithRefIter backIter(record, refSequence, false);

    SamSingleBaseMatchInfo matchInfo;

    int32_t readLength = record.getReadLength();

    // Index of the last base clipped from the front; -1 means no base
    // (it sits just before index 0).
    int32_t lastFrontClipPos = -1;
    // Index of the first base clipped from the back; readLength means no
    // base (it sits just past the last index).
    int32_t firstBackClipPos = readLength;

    bool frontDone = false;
    bool backDone = false;

    // Per-direction counters since that direction's most recent clip.
    int32_t frontBases = 0;
    int32_t backBases = 0;
    int32_t frontMismatches = 0;
    int32_t backMismatches = 0;

    //////////////////////////////////////////////////////////
    // Determine the clip positions.
    while(!frontDone || !backDone)
    {
        // Advance from the front until it has read more bases than the
        // back has (or unconditionally once the back is finished).
        while(!frontDone && ((frontBases <= backBases) || backDone))
        {
            // Stop when there is nothing left in this direction, or the
            // base is at/past the start of the back clip.
            if(!frontIter.getNextMatchMismatch(matchInfo) ||
               (matchInfo.getQueryIndex() >= firstBackClipPos))
            {
                frontDone = true;
                break;
            }

            // A real base was read from the left.
            ++frontBases;

            if(matchInfo.getType() != SamSingleBaseMatchInfo::MISMATCH)
            {
                // Only mismatches can trigger a clip.
                continue;
            }

            ++frontMismatches;
            double frontRate = static_cast<double>(frontMismatches) / frontBases;
            if(frontRate > mismatchThreshold)
            {
                // Over the threshold: clip through this base and restart
                // the counters.
                lastFrontClipPos = matchInfo.getQueryIndex();
                frontBases = 0;
                frontMismatches = 0;
            }
        }

        // Advance from the back until it has read more bases than the
        // front has (or unconditionally once the front is finished).
        while(!backDone && ((backBases <= frontBases) || frontDone))
        {
            // Stop when there is nothing left in this direction, or the
            // base is at/before the end of the front clip.
            if(!backIter.getNextMatchMismatch(matchInfo) ||
               (matchInfo.getQueryIndex() <= lastFrontClipPos))
            {
                backDone = true;
                break;
            }

            // A real base was read from the right.
            ++backBases;

            if(matchInfo.getType() != SamSingleBaseMatchInfo::MISMATCH)
            {
                // Only mismatches can trigger a clip.
                continue;
            }

            ++backMismatches;
            double backRate = static_cast<double>(backMismatches) / backBases;
            if(backRate > mismatchThreshold)
            {
                // Over the threshold: clip from this base onward and
                // restart the counters.
                firstBackClipPos = matchInfo.getQueryIndex();
                backBases = 0;
                backMismatches = 0;
            }
        }
    }

    //////////////////////////////////////////////////////////
    // Clip positions are known; convert them to clip counts.
    //   front count = lastFrontClipPos + 1 (indices start at 0)
    //   back count  = readLength - firstBackClipPos
    // Example:
    //   Pos:  012345
    //   Read: AAAAAA
    //   Read length = 6, lastFrontClipPos = 2, firstBackClipPos = 4
    //   => 3 clipped from the front, 2 clipped from the back.
    int32_t numFrontClips = lastFrontClipPos + 1;
    int32_t numBackClips = readLength - firstBackClipPos;

    return(softClip(record, numFrontClips, numBackClips));
}