示例#1
0
// Process a single read: optional adapter removal, ambiguity-code
// resolution, quality validation/conversion, hard clipping, quality
// trimming and the configured filters (quality, dust, GC, primer, length).
//
// record - the read to process. Its seq/qual fields are rewritten in place:
//          adapter removal is applied immediately, the remaining
//          transformations are stored only if the read passes every filter
//          that precedes the final length check.
//
// Returns true if the read should be kept, false if it was filtered out.
//
// Side effects: increments s_numReadsRead and s_numBasesRead for every read,
// s_numFailedDust / s_numReadsPrimer when those filters reject the read, and
// terminates the process if the quality string is out of range.
bool processRead(SeqRecord& record)
{
    // Remove the adapter first (if the user requested it) so that the
    // statistics and every filter below see the adapter-free read
    if(!opt::adapterF.empty())
    {
        std::string trimmedSeq(record.seq.toString());
        size_t found = trimmedSeq.find(opt::adapterF);
        size_t adapterLength = opt::adapterF.length();

        // Couldn't find the fwd adapter; try the reverse version.
        // An empty reverse adapter is skipped: find("") always reports
        // a match at offset 0, which would only trigger a no-op erase.
        if(found == std::string::npos && !opt::adapterR.empty())
        {
            found = trimmedSeq.find(opt::adapterR);
            adapterLength = opt::adapterR.length();
        }

        if(found != std::string::npos) // found the adapter
        {
            trimmedSeq.erase(found, adapterLength);
            record.seq = trimmedSeq;

            // We have to remove the qualities of the adapter as well
            if(!record.qual.empty())
            {
                std::string trimmedQual = record.qual;

                // std::string::erase throws std::out_of_range when the
                // offset is past the end, which can happen if the quality
                // string is shorter than the sequence. In that case the
                // quality string is left untouched.
                if(found < trimmedQual.size())
                {
                    trimmedQual.erase(found, adapterLength);
                    record.qual = trimmedQual;
                }
            }
        }
    }

    // Work on local copies; the record is only updated at the very end
    std::string seqStr = record.seq.toString();
    std::string qualStr = record.qual;

    ++s_numReadsRead;
    s_numBasesRead += seqStr.size();

    // If ambiguity codes are present in the sequence
    // and the user wants to keep them, we randomly
    // select one of the DNA symbols from the set of
    // possible bases
    if(!opt::bDiscardAmbiguous)
    {
        for(size_t i = 0; i < seqStr.size(); ++i)
        {
            // Convert '.' to 'N'
            if(seqStr[i] == '.')
                seqStr[i] = 'N';

            if(!IUPAC::isAmbiguous(seqStr[i]))
                continue;

            // Get the string of possible bases for this ambiguity code
            std::string possibles = IUPAC::getPossibleSymbols(seqStr[i]);

            // Nothing to choose from: leave the symbol as it is for the
            // ACGT check below rather than computing a modulo by zero
            if(possibles.empty())
                continue;

            // select one of the bases at random
            size_t j = rand() % possibles.size();
            seqStr[i] = possibles[j];
        }
    }

    // Ensure sequence is entirely ACGT
    if(seqStr.find_first_not_of("ACGT") != std::string::npos)
        return false;

    // Validate the quality string (if present) and
    // perform any necessary transformations
    if(!qualStr.empty())
    {
        // Convert to phred33 when needed and check that every
        // resulting value is in range
        bool allValid = true;
        for(size_t i = 0; i < qualStr.size(); ++i)
        {
            if(opt::qualityScale == QS_PHRED64)
                qualStr[i] = Quality::phred64toPhred33(qualStr[i]);
            allValid = Quality::isValidPhred33(qualStr[i]) && allValid;
        }

        if(!allValid)
        {
            std::cerr << "Error: read " << record.id << " has out of range quality values.\n";
            std::cerr << "Expected phred" << (opt::qualityScale == QS_SANGER ? "33" : "64") << ".\n";
            std::cerr << "Quality string: "  << qualStr << "\n";
            std::cerr << "Check your data and re-run preprocess with the correct quality scaling flag.\n";
            exit(EXIT_FAILURE);
        }
    }

    // Hard clip: keep at most the first opt::hardClip bases
    if(opt::hardClip > 0)
    {
        seqStr = seqStr.substr(0, opt::hardClip);
        if(!qualStr.empty())
            qualStr = qualStr.substr(0, opt::hardClip);
    }

    // Quality trim
    if(opt::qualityTrim > 0 && !qualStr.empty())
        softClip(opt::qualityTrim, seqStr, qualStr);

    // Quality filter
    if(opt::qualityFilter >= 0 && !qualStr.empty())
    {
        int numLowQuality = countLowQuality(seqStr, qualStr);
        if(numLowQuality > opt::qualityFilter)
            return false;
    }

    // Dust filter: reject reads whose dust score reaches the threshold
    if(opt::bDustFilter)
    {
        double dustScore = calculateDustScore(seqStr);
        bool bAcceptDust = dustScore < opt::dustThreshold;

        if(!bAcceptDust)
        {
            s_numFailedDust += 1;
            if(opt::verbose >= 1)
            {
                printf("Failed dust: %s %s %lf\n", record.id.c_str(),
                                                   seqStr.c_str(),
                                                   dustScore);
            }
            return false;
        }
    }

    // Filter by GC content
    if(opt::bFilterGC)
    {
        double gc = calcGC(seqStr);
        if(gc < opt::minGC || gc > opt::maxGC)
            return false;
    }

    // Primer screen
    if(!opt::bDisablePrimerCheck)
    {
        bool containsPrimer = PrimerScreen::containsPrimer(seqStr);
        if(containsPrimer)
        {
            ++s_numReadsPrimer;
            return false;
        }
    }

    // Store the processed sequence/quality back into the record
    record.seq = seqStr;
    record.qual = qualStr;

    // Finally reject reads that ended up empty or shorter than the minimum
    if(record.seq.length() == 0 || record.seq.length() < opt::minLength)
        return false;

    return true;
}
示例#2
0
int scanBam()
{

	std::vector< std::map<std::string, ScanResults> > resultlist;
//	std::ostream* pWriter;
//	pWriter = &std::cout;
    bool isExome = opt::exomebedfile.size()==0? false: true;

    std::cout << opt::bamlist << "\n";
    std::cout << opt::bamlist.size() << " BAMs" <<  std::endl;

    for(std::size_t i=0; i<opt::bamlist.size(); i++) {

        // storing results for each read group (RG tag). use
        // read group ID as key.
        std::map<std::string, ScanResults> resultmap;
        // store where the overlap was last found in the case of exome seq
    	std::map<std::string, std::vector<range>::iterator> lastfound;
    	std::vector<range>::iterator searchhint;
        
        std::cerr << "Start analysing BAM " << opt::bamlist[i] << "\n";

        // Open the bam files for reading/writing
        BamTools::BamReader* pBamReader = new BamTools::BamReader;

        pBamReader->Open(opt::bamlist[i]);

        // get bam headers
        const BamTools::SamHeader header = pBamReader ->GetHeader();

//        for(BamTools::SamSequenceConstIterator it = header.Sequences.Begin();
//        						it != header.Sequences.End();++it){
//        	std::cout << "Assembly ID:" << it->AssemblyID << ", Name:" << it->Name << std::endl;
//        }
//        exit(0);

        bool rggroups=false;
        
        if(opt::ignorerg){ // ignore read groups
        	std::cerr << "Treat all reads in BAM as if they were from a same sample" << std::endl;
        	ScanResults results;
        	results.sample = opt::unknown;
        	resultmap[opt::unknown]=results;
        }else{
			std::map <std::string, std::string> readgroups;
			std::map <std::string, std::string> readlibs;

			rggroups = header.HasReadGroups();

			if(rggroups){
				for(BamTools::SamReadGroupConstIterator it = header.ReadGroups.Begin();
						it != header.ReadGroups.End();++it){
					readgroups[it->ID]= it->Sample;
					if(it->HasLibrary()){
						readlibs[it->ID] = it -> Library;
					}else{
						readlibs[it->ID] = opt::unknown;
					}
				}
				std::cerr<<"Specified BAM has "<< readgroups.size()<< " read groups" << std::endl;

				for(std::map<std::string, std::string>::iterator it = readgroups.begin(); it != readgroups.end(); ++it){
					ScanResults results;
					std::string rgid = it -> first;
					results.sample = it -> second;
					results.lib = readlibs[rgid];
					resultmap[rgid]=results; //results are identified by RG tag.
				}

			}else{
				std::cerr << "Warning: can't find RG tag in the BAM header" << std::endl;
				std::cerr << "Warning: treat all reads in BAM as if they were from a same sample" << std::endl;
				ScanResults results;
				results.sample = opt::unknown;
				results.lib = opt::unknown;
				resultmap[opt::unknown]=results;
			}
        }

        BamTools::BamAlignment record1;
        bool done = false;
        
        int nprocessed=0; // number of reads analyzed
        int ntotal=0; // number of reads scanned in bam (we skip some reads, see below)
        while(!done)
        {
            ntotal ++;
            done = !pBamReader -> GetNextAlignment(record1);
            std::string tag = opt::unknown;
            if(rggroups){
                
                // skip reads that do not have read group tag
            	if(record1.HasTag("RG")){
					record1.GetTag("RG", tag);
	//            	std::cerr << c << " reads:{" << record1.QueryBases << "} tag:{" << tag << "}\n";
				}else{
					std::cerr << "can't find RG tag for read at position {" << record1.RefID << ":" << record1.Position << "}" << std::endl;
					std::cerr << "skip this read" << std::endl;
					continue;
				}
            }
            
            // skip reads with readgroup not defined in BAM header
            if(resultmap.find(tag) == resultmap.end()){
				std::cerr << "RG tag {" << tag << "} for read at position ";
				std::cerr << "{" << record1.RefID << ":" << record1.Position << "} doesn't exist in BAM header.";
				continue;
            }

            // for exome, exclude reads mapped to the exome regions.
            if(isExome){
				range rg;
				rg.first = record1.Position;
				rg.second = record1.Position + record1.Length;
				std::string chrm =  refID2Name(record1.RefID);

				if(chrm != "-1"){ // check if overlap exome when the read is mapped to chr1-22, X, Y
					// std::cerr << "read: " << chrm << " " << rg << "\n" << std::endl;
					std::map<std::string, std::vector<range> >::iterator chrmit = opt::exomebed.find(chrm);
					if(chrmit == opt::exomebed.end())
					{
						// std::cerr<<"chromosome or reference sequence: " << chrm << " is not present in the specified exome bed file." <<std::endl;
						// std::cerr<<"please check sequence name encoding, i.e. for chromosome one, is it chr1 or 1" << std::endl;
                        // unmapped reads can have chr names as a star (*). We also don't consider MT reads. 
						resultmap[tag].n_exreadsChrUnmatched +=1; 
					}else{
						std::vector<range>::iterator itend = opt::exomebed[chrm].end();
						std::map<std::string, std::vector<range>::iterator>::iterator lastfoundchrmit = lastfound.find(chrm);
						if(lastfoundchrmit == lastfound.end()){ // first entry to this chrm
							lastfound[chrm] = chrmit->second.begin();// start from begining
						}

						// set the hint to where the previous found is
						searchhint = lastfound[chrm];
						std::vector<range>::iterator itsearch = searchRange(searchhint, itend, rg);
						// if found
						if(itsearch != itend){// if found
							searchhint = itsearch;
							resultmap[tag].n_exreadsExcluded +=1;
							lastfound[chrm] = searchhint; // update search hint
							continue;
						}
					}

				}
            }

            resultmap[tag].numTotal +=1;

            if(record1.IsMapped())
            {
            	resultmap[tag].numMapped += 1;
            }
            if(record1.IsDuplicate()){
            	resultmap[tag].numDuplicates +=1;
            }

            double gc = calcGC(record1.QueryBases);
            int ptn_count = countMotif(record1.QueryBases, opt::PATTERN, opt::PATTERN_REV);
            // when the read length exceeds 100bp, number of patterns might exceed the boundary
            if (ptn_count > ScanParameters::TEL_MOTIF_N-1){
                continue;
            }
            resultmap[tag].telcounts[ptn_count]+=1;

            if(gc >= ScanParameters::GC_LOWERBOUND && gc <= ScanParameters::GC_UPPERBOUND){
            	// get index for GC bin.
            	int idx = floor((gc-ScanParameters::GC_LOWERBOUND)/ScanParameters::GC_BINSIZE);
            	assert(idx >=0 && idx <= ScanParameters::GC_BIN_N-1);
//            	std::cerr << c << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC idx{" << idx << "}\n";
            	if(idx > ScanParameters::GC_BIN_N-1){
            		std::cerr << nprocessed << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC bin index out of bound:" << idx << "\n";
            		exit(EXIT_FAILURE);
            	}
            	resultmap[tag].gccounts[idx]+=1;
            }

            // if(resultmap[tag].n_exreadsChrUnmatched > 1000){
            // 	std::cerr<<"too many reads found with unmatched chromosome ID between BAM and exome BED. \n" << std::endl;
            // }

            nprocessed++;

            if( nprocessed%10000000 == 0){
            	std::cerr << "[scan] processed " << nprocessed << " reads \n" ;
            }
        }
        
		pBamReader->Close();
        delete pBamReader;
        
        // consider each BAM separately
        resultlist.push_back(resultmap);

        std::cerr << "[scan] total reads in BAM scanned " << ntotal << std::endl;
        std::cerr << "Completed scanning BAM\n";

    }

    if(opt::onebam){
        merge_results_by_readgroup(resultlist);
    }
    
    outputresults(resultlist);

    if(isExome){
    	printlog(resultlist);
    }
    
    std::cerr << "Completed writing results\n";

    return 0;
}