// Process a single read by quality trimming, filtering // returns true if the read should be kept bool processRead(SeqRecord& record) { // let's remove the adapter if the user has requested so // before doing any filtering if(!opt::adapterF.empty()) { std::string _tmp(record.seq.toString()); size_t found = _tmp.find(opt::adapterF); int _length; if(found != std::string::npos) { _length = opt::adapterF.length(); } else { // Couldn't find the fwd adapter; Try the reverse version found = _tmp.find(opt::adapterR); _length = opt::adapterR.length(); } if(found != std::string::npos) // found the adapter { _tmp.erase(found, _length); record.seq = _tmp; // We have to remove the qualities of the adapter if(!record.qual.empty()) { _tmp = record.qual; _tmp.erase(found, _length); record.qual = _tmp; } } } // Check if the sequence has uncalled bases std::string seqStr = record.seq.toString(); std::string qualStr = record.qual; ++s_numReadsRead; s_numBasesRead += seqStr.size(); // If ambiguity codes are present in the sequence // and the user wants to keep them, we randomly // select one of the DNA symbols from the set of // possible bases if(!opt::bDiscardAmbiguous) { for(size_t i = 0; i < seqStr.size(); ++i) { // Convert '.' to 'N' if(seqStr[i] == '.') seqStr[i] = 'N'; if(!IUPAC::isAmbiguous(seqStr[i])) continue; // Get the string of possible bases for this ambiguity code std::string possibles = IUPAC::getPossibleSymbols(seqStr[i]); // select one of the bases at random int j = rand() % possibles.size(); seqStr[i] = possibles[j]; } } // Ensure sequence is entirely ACGT size_t pos = seqStr.find_first_not_of("ACGT"); if(pos != std::string::npos) return false; // Validate the quality string (if present) and // perform any necessary transformations if(!qualStr.empty()) { // Calculate the range of phred scores for validation bool allValid = true; for(size_t i = 0; i < qualStr.size(); ++i) { if(opt::qualityScale == QS_PHRED64) qualStr[i] = Quality::phred64toPhred33(qualStr[i]); allValid = Quality::isValidPhred33(qualStr[i]) && allValid; } if(!allValid) { std::cerr << "Error: read " << record.id << " has out of range quality values.\n"; std::cerr << "Expected phred" << (opt::qualityScale == QS_SANGER ? "33" : "64") << ".\n"; std::cerr << "Quality string: " << qualStr << "\n"; std::cerr << "Check your data and re-run preprocess with the correct quality scaling flag.\n"; exit(EXIT_FAILURE); } } // Hard clip if(opt::hardClip > 0) { seqStr = seqStr.substr(0, opt::hardClip); if(!qualStr.empty()) qualStr = qualStr.substr(0, opt::hardClip); } // Quality trim if(opt::qualityTrim > 0 && !qualStr.empty()) softClip(opt::qualityTrim, seqStr, qualStr); // Quality filter if(opt::qualityFilter >= 0 && !qualStr.empty()) { int numLowQuality = countLowQuality(seqStr, qualStr); if(numLowQuality > opt::qualityFilter) return false; } // Dust filter if(opt::bDustFilter) { double dustScore = calculateDustScore(seqStr); bool bAcceptDust = dustScore < opt::dustThreshold; if(!bAcceptDust) { s_numFailedDust += 1; if(opt::verbose >= 1) { printf("Failed dust: %s %s %lf\n", record.id.c_str(), seqStr.c_str(), dustScore); } return false; } } // Filter by GC content if(opt::bFilterGC) { double gc = calcGC(seqStr); if(gc < opt::minGC || gc > opt::maxGC) return false; } // Primer screen if(!opt::bDisablePrimerCheck) { bool containsPrimer = PrimerScreen::containsPrimer(seqStr); if(containsPrimer) { ++s_numReadsPrimer; return false; } } record.seq = seqStr; record.qual = qualStr; if(record.seq.length() == 0 || record.seq.length() < opt::minLength) return false; return true; }
int scanBam() { std::vector< std::map<std::string, ScanResults> > resultlist; // std::ostream* pWriter; // pWriter = &std::cout; bool isExome = opt::exomebedfile.size()==0? false: true; std::cout << opt::bamlist << "\n"; std::cout << opt::bamlist.size() << " BAMs" << std::endl; for(std::size_t i=0; i<opt::bamlist.size(); i++) { // storing results for each read group (RG tag). use // read group ID as key. std::map<std::string, ScanResults> resultmap; // store where the overlap was last found in the case of exome seq std::map<std::string, std::vector<range>::iterator> lastfound; std::vector<range>::iterator searchhint; std::cerr << "Start analysing BAM " << opt::bamlist[i] << "\n"; // Open the bam files for reading/writing BamTools::BamReader* pBamReader = new BamTools::BamReader; pBamReader->Open(opt::bamlist[i]); // get bam headers const BamTools::SamHeader header = pBamReader ->GetHeader(); // for(BamTools::SamSequenceConstIterator it = header.Sequences.Begin(); // it != header.Sequences.End();++it){ // std::cout << "Assembly ID:" << it->AssemblyID << ", Name:" << it->Name << std::endl; // } // exit(0); bool rggroups=false; if(opt::ignorerg){ // ignore read groups std::cerr << "Treat all reads in BAM as if they were from a same sample" << std::endl; ScanResults results; results.sample = opt::unknown; resultmap[opt::unknown]=results; }else{ std::map <std::string, std::string> readgroups; std::map <std::string, std::string> readlibs; rggroups = header.HasReadGroups(); if(rggroups){ for(BamTools::SamReadGroupConstIterator it = header.ReadGroups.Begin(); it != header.ReadGroups.End();++it){ readgroups[it->ID]= it->Sample; if(it->HasLibrary()){ readlibs[it->ID] = it -> Library; }else{ readlibs[it->ID] = opt::unknown; } } std::cerr<<"Specified BAM has "<< readgroups.size()<< " read groups" << std::endl; for(std::map<std::string, std::string>::iterator it = readgroups.begin(); it != readgroups.end(); ++it){ ScanResults results; std::string rgid = it -> first; results.sample = it -> second; results.lib = readlibs[rgid]; resultmap[rgid]=results; //results are identified by RG tag. } }else{ std::cerr << "Warning: can't find RG tag in the BAM header" << std::endl; std::cerr << "Warning: treat all reads in BAM as if they were from a same sample" << std::endl; ScanResults results; results.sample = opt::unknown; results.lib = opt::unknown; resultmap[opt::unknown]=results; } } BamTools::BamAlignment record1; bool done = false; int nprocessed=0; // number of reads analyzed int ntotal=0; // number of reads scanned in bam (we skip some reads, see below) while(!done) { ntotal ++; done = !pBamReader -> GetNextAlignment(record1); std::string tag = opt::unknown; if(rggroups){ // skip reads that do not have read group tag if(record1.HasTag("RG")){ record1.GetTag("RG", tag); // std::cerr << c << " reads:{" << record1.QueryBases << "} tag:{" << tag << "}\n"; }else{ std::cerr << "can't find RG tag for read at position {" << record1.RefID << ":" << record1.Position << "}" << std::endl; std::cerr << "skip this read" << std::endl; continue; } } // skip reads with readgroup not defined in BAM header if(resultmap.find(tag) == resultmap.end()){ std::cerr << "RG tag {" << tag << "} for read at position "; std::cerr << "{" << record1.RefID << ":" << record1.Position << "} doesn't exist in BAM header."; continue; } // for exome, exclude reads mapped to the exome regions. if(isExome){ range rg; rg.first = record1.Position; rg.second = record1.Position + record1.Length; std::string chrm = refID2Name(record1.RefID); if(chrm != "-1"){ // check if overlap exome when the read is mapped to chr1-22, X, Y // std::cerr << "read: " << chrm << " " << rg << "\n" << std::endl; std::map<std::string, std::vector<range> >::iterator chrmit = opt::exomebed.find(chrm); if(chrmit == opt::exomebed.end()) { // std::cerr<<"chromosome or reference sequence: " << chrm << " is not present in the specified exome bed file." <<std::endl; // std::cerr<<"please check sequence name encoding, i.e. for chromosome one, is it chr1 or 1" << std::endl; // unmapped reads can have chr names as a star (*). We also don't consider MT reads. resultmap[tag].n_exreadsChrUnmatched +=1; }else{ std::vector<range>::iterator itend = opt::exomebed[chrm].end(); std::map<std::string, std::vector<range>::iterator>::iterator lastfoundchrmit = lastfound.find(chrm); if(lastfoundchrmit == lastfound.end()){ // first entry to this chrm lastfound[chrm] = chrmit->second.begin();// start from begining } // set the hint to where the previous found is searchhint = lastfound[chrm]; std::vector<range>::iterator itsearch = searchRange(searchhint, itend, rg); // if found if(itsearch != itend){// if found searchhint = itsearch; resultmap[tag].n_exreadsExcluded +=1; lastfound[chrm] = searchhint; // update search hint continue; } } } } resultmap[tag].numTotal +=1; if(record1.IsMapped()) { resultmap[tag].numMapped += 1; } if(record1.IsDuplicate()){ resultmap[tag].numDuplicates +=1; } double gc = calcGC(record1.QueryBases); int ptn_count = countMotif(record1.QueryBases, opt::PATTERN, opt::PATTERN_REV); // when the read length exceeds 100bp, number of patterns might exceed the boundary if (ptn_count > ScanParameters::TEL_MOTIF_N-1){ continue; } resultmap[tag].telcounts[ptn_count]+=1; if(gc >= ScanParameters::GC_LOWERBOUND && gc <= ScanParameters::GC_UPPERBOUND){ // get index for GC bin. int idx = floor((gc-ScanParameters::GC_LOWERBOUND)/ScanParameters::GC_BINSIZE); assert(idx >=0 && idx <= ScanParameters::GC_BIN_N-1); // std::cerr << c << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC idx{" << idx << "}\n"; if(idx > ScanParameters::GC_BIN_N-1){ std::cerr << nprocessed << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC bin index out of bound:" << idx << "\n"; exit(EXIT_FAILURE); } resultmap[tag].gccounts[idx]+=1; } // if(resultmap[tag].n_exreadsChrUnmatched > 1000){ // std::cerr<<"too many reads found with unmatched chromosome ID between BAM and exome BED. \n" << std::endl; // } nprocessed++; if( nprocessed%10000000 == 0){ std::cerr << "[scan] processed " << nprocessed << " reads \n" ; } } pBamReader->Close(); delete pBamReader; // consider each BAM separately resultlist.push_back(resultmap); std::cerr << "[scan] total reads in BAM scanned " << ntotal << std::endl; std::cerr << "Completed scanning BAM\n"; } if(opt::onebam){ merge_results_by_readgroup(resultlist); } outputresults(resultlist); if(isExome){ printlog(resultlist); } std::cerr << "Completed writing results\n"; return 0; }