void ReadContainer::GetSampleInfo() {
  BamTools::SamHeader header = reader.GetHeader();
  if (!header.HasReadGroups()) {
    PrintMessageDieOnError("No read groups found", ERROR);
  }
  BamTools::SamReadGroupDictionary read_groups = header.ReadGroups;
  for (BamTools::SamReadGroupIterator it = read_groups.Begin();
       it != read_groups.End(); it++) {
    const BamTools::SamReadGroup& rg = *it;
    string rg_sample;
    if (!rg.HasSample()) {
      PrintMessageDieOnError("No sample in read group for " + rg.ID, WARNING);
      rg_sample = rg.ID;
    } else {
      rg_sample = rg.Sample;
    }
    if (my_verbose) {
      PrintMessageDieOnError("Adding sample " + rg_sample + " " + rg.ID, PROGRESS);
    }
    rg_id_to_sample.insert(pair<string,string>(rg.ID,rg_sample));
    if (find(samples_list.begin(), samples_list.end(), rg_sample) ==
    	samples_list.end()) {
      samples_list.push_back(rg_sample);
    }
  }
}
Exemple #2
0
void BamHeaderHelper::GetFlowOrder(BamTools::BamReader &bamReader){
  BamTools::SamHeader samHeader = bamReader.GetHeader();
  if (!samHeader.HasReadGroups()) {
    //bamReader.Close();
    cerr << "ERROR: there is no read group in " << "this file" << endl;
    //exit(1);
  }
    for (BamTools::SamReadGroupIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {
    if (itr->HasFlowOrder()) {
      flow_order_set.push_back(itr->FlowOrder);
      //flowKey = itr->KeySequence;
    }
  }
}
Exemple #3
0
int scanBam()
{

	std::vector< std::map<std::string, ScanResults> > resultlist;
//	std::ostream* pWriter;
//	pWriter = &std::cout;
    bool isExome = opt::exomebedfile.size()==0? false: true;

    std::cout << opt::bamlist << "\n";
    std::cout << opt::bamlist.size() << " BAMs" <<  std::endl;

    for(std::size_t i=0; i<opt::bamlist.size(); i++) {

        // storing results for each read group (RG tag). use
        // read group ID as key.
        std::map<std::string, ScanResults> resultmap;
        // store where the overlap was last found in the case of exome seq
    	std::map<std::string, std::vector<range>::iterator> lastfound;
    	std::vector<range>::iterator searchhint;
        
        std::cerr << "Start analysing BAM " << opt::bamlist[i] << "\n";

        // Open the bam files for reading/writing
        BamTools::BamReader* pBamReader = new BamTools::BamReader;

        pBamReader->Open(opt::bamlist[i]);

        // get bam headers
        const BamTools::SamHeader header = pBamReader ->GetHeader();

//        for(BamTools::SamSequenceConstIterator it = header.Sequences.Begin();
//        						it != header.Sequences.End();++it){
//        	std::cout << "Assembly ID:" << it->AssemblyID << ", Name:" << it->Name << std::endl;
//        }
//        exit(0);

        bool rggroups=false;
        
        if(opt::ignorerg){ // ignore read groups
        	std::cerr << "Treat all reads in BAM as if they were from a same sample" << std::endl;
        	ScanResults results;
        	results.sample = opt::unknown;
        	resultmap[opt::unknown]=results;
        }else{
			std::map <std::string, std::string> readgroups;
			std::map <std::string, std::string> readlibs;

			rggroups = header.HasReadGroups();

			if(rggroups){
				for(BamTools::SamReadGroupConstIterator it = header.ReadGroups.Begin();
						it != header.ReadGroups.End();++it){
					readgroups[it->ID]= it->Sample;
					if(it->HasLibrary()){
						readlibs[it->ID] = it -> Library;
					}else{
						readlibs[it->ID] = opt::unknown;
					}
				}
				std::cerr<<"Specified BAM has "<< readgroups.size()<< " read groups" << std::endl;

				for(std::map<std::string, std::string>::iterator it = readgroups.begin(); it != readgroups.end(); ++it){
					ScanResults results;
					std::string rgid = it -> first;
					results.sample = it -> second;
					results.lib = readlibs[rgid];
					resultmap[rgid]=results; //results are identified by RG tag.
				}

			}else{
				std::cerr << "Warning: can't find RG tag in the BAM header" << std::endl;
				std::cerr << "Warning: treat all reads in BAM as if they were from a same sample" << std::endl;
				ScanResults results;
				results.sample = opt::unknown;
				results.lib = opt::unknown;
				resultmap[opt::unknown]=results;
			}
        }

        BamTools::BamAlignment record1;
        bool done = false;
        
        int nprocessed=0; // number of reads analyzed
        int ntotal=0; // number of reads scanned in bam (we skip some reads, see below)
        while(!done)
        {
            ntotal ++;
            done = !pBamReader -> GetNextAlignment(record1);
            std::string tag = opt::unknown;
            if(rggroups){
                
                // skip reads that do not have read group tag
            	if(record1.HasTag("RG")){
					record1.GetTag("RG", tag);
	//            	std::cerr << c << " reads:{" << record1.QueryBases << "} tag:{" << tag << "}\n";
				}else{
					std::cerr << "can't find RG tag for read at position {" << record1.RefID << ":" << record1.Position << "}" << std::endl;
					std::cerr << "skip this read" << std::endl;
					continue;
				}
            }
            
            // skip reads with readgroup not defined in BAM header
            if(resultmap.find(tag) == resultmap.end()){
				std::cerr << "RG tag {" << tag << "} for read at position ";
				std::cerr << "{" << record1.RefID << ":" << record1.Position << "} doesn't exist in BAM header.";
				continue;
            }

            // for exome, exclude reads mapped to the exome regions.
            if(isExome){
				range rg;
				rg.first = record1.Position;
				rg.second = record1.Position + record1.Length;
				std::string chrm =  refID2Name(record1.RefID);

				if(chrm != "-1"){ // check if overlap exome when the read is mapped to chr1-22, X, Y
					// std::cerr << "read: " << chrm << " " << rg << "\n" << std::endl;
					std::map<std::string, std::vector<range> >::iterator chrmit = opt::exomebed.find(chrm);
					if(chrmit == opt::exomebed.end())
					{
						// std::cerr<<"chromosome or reference sequence: " << chrm << " is not present in the specified exome bed file." <<std::endl;
						// std::cerr<<"please check sequence name encoding, i.e. for chromosome one, is it chr1 or 1" << std::endl;
                        // unmapped reads can have chr names as a star (*). We also don't consider MT reads. 
						resultmap[tag].n_exreadsChrUnmatched +=1; 
					}else{
						std::vector<range>::iterator itend = opt::exomebed[chrm].end();
						std::map<std::string, std::vector<range>::iterator>::iterator lastfoundchrmit = lastfound.find(chrm);
						if(lastfoundchrmit == lastfound.end()){ // first entry to this chrm
							lastfound[chrm] = chrmit->second.begin();// start from begining
						}

						// set the hint to where the previous found is
						searchhint = lastfound[chrm];
						std::vector<range>::iterator itsearch = searchRange(searchhint, itend, rg);
						// if found
						if(itsearch != itend){// if found
							searchhint = itsearch;
							resultmap[tag].n_exreadsExcluded +=1;
							lastfound[chrm] = searchhint; // update search hint
							continue;
						}
					}

				}
            }

            resultmap[tag].numTotal +=1;

            if(record1.IsMapped())
            {
            	resultmap[tag].numMapped += 1;
            }
            if(record1.IsDuplicate()){
            	resultmap[tag].numDuplicates +=1;
            }

            double gc = calcGC(record1.QueryBases);
            int ptn_count = countMotif(record1.QueryBases, opt::PATTERN, opt::PATTERN_REV);
            // when the read length exceeds 100bp, number of patterns might exceed the boundary
            if (ptn_count > ScanParameters::TEL_MOTIF_N-1){
                continue;
            }
            resultmap[tag].telcounts[ptn_count]+=1;

            if(gc >= ScanParameters::GC_LOWERBOUND && gc <= ScanParameters::GC_UPPERBOUND){
            	// get index for GC bin.
            	int idx = floor((gc-ScanParameters::GC_LOWERBOUND)/ScanParameters::GC_BINSIZE);
            	assert(idx >=0 && idx <= ScanParameters::GC_BIN_N-1);
//            	std::cerr << c << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC idx{" << idx << "}\n";
            	if(idx > ScanParameters::GC_BIN_N-1){
            		std::cerr << nprocessed << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC bin index out of bound:" << idx << "\n";
            		exit(EXIT_FAILURE);
            	}
            	resultmap[tag].gccounts[idx]+=1;
            }

            // if(resultmap[tag].n_exreadsChrUnmatched > 1000){
            // 	std::cerr<<"too many reads found with unmatched chromosome ID between BAM and exome BED. \n" << std::endl;
            // }

            nprocessed++;

            if( nprocessed%10000000 == 0){
            	std::cerr << "[scan] processed " << nprocessed << " reads \n" ;
            }
        }
        
		pBamReader->Close();
        delete pBamReader;
        
        // consider each BAM separately
        resultlist.push_back(resultmap);

        std::cerr << "[scan] total reads in BAM scanned " << ntotal << std::endl;
        std::cerr << "Completed scanning BAM\n";

    }

    if(opt::onebam){
        merge_results_by_readgroup(resultlist);
    }
    
    outputresults(resultlist);

    if(isExome){
    	printlog(resultlist);
    }
    
    std::cerr << "Completed writing results\n";

    return 0;
}