void InputStructures::DetectFlowOrderzAndKeyFromBam(const SamHeader &samHeader){
  
//TODO Need to handle multiple BAM files with different flowOrders, at least throw an error for now.
    for (BamTools::SamReadGroupConstIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {
        if (itr->HasFlowOrder()) {
            string tmpflowOrder = itr->FlowOrder;
            if (bamFlowOrderVector.empty()){
                bamFlowOrderVector.push_back(tmpflowOrder);
                flowOrder = tmpflowOrder; // first one free
            }else { //check if the flowOrder is the same if not throw an error, for now we dont support bams with different flow orders
                vector<string>::iterator it = std::find(bamFlowOrderVector.begin(), bamFlowOrderVector.end(), tmpflowOrder);
                if (it == bamFlowOrderVector.end()) {
                   // check to see if flowOrder is a substring/superstring first
                  std::size_t found_me = std::string::npos; // assume bad
                  if (tmpflowOrder.length()>flowOrder.length()){
                    found_me = tmpflowOrder.find(flowOrder);
                    if (found_me==0){ // must find at first position
                      flowOrder = tmpflowOrder; // longer superstring
                      bamFlowOrderVector.push_back(tmpflowOrder);
                      //cout<< "Super: " << tmpflowOrder.length() << " " << tmpflowOrder << endl;
                    } else
                      found_me = std::string::npos;

                  }else{
                    found_me = flowOrder.find(tmpflowOrder);
                    if (found_me==0){ // must find at first position
                      // substring, so no need to update flowOrder
                      bamFlowOrderVector.push_back(tmpflowOrder);
                      //cout << "Sub: " << tmpflowOrder.length() << " "<< tmpflowOrder << endl;
                    } else
                      found_me = std::string::npos;
                  }

                  if (found_me==std::string::npos){
                    cerr << "FATAL ERROR: BAM files specified as input have different flow orders. Currently tvc supports only BAM files with same flow order. " << endl;
                    exit(-1);
                  }
                }
            }
            flowKey = itr->KeySequence;

        }

    }
    if (bamFlowOrderVector.size()>1)
      cout << "Compatibly nested flow orders found: " << bamFlowOrderVector.size() << " using longest, nFlows=  " << flowOrder.length() << endl;
    //cout << "Final: " << flowOrder.length() << " " << flowOrder << endl;
    nFlows = flowOrder.length();

    if (nFlows > 0) {
        flowSigPresent = true;
        treePhaserFlowOrder.SetFlowOrder(flowOrder, nFlows);
        key.Set(treePhaserFlowOrder, flowKey, "key");
    }

}
Пример #2
0
int scanBam()
{

	std::vector< std::map<std::string, ScanResults> > resultlist;
//	std::ostream* pWriter;
//	pWriter = &std::cout;
    bool isExome = opt::exomebedfile.size()==0? false: true;

    std::cout << opt::bamlist << "\n";
    std::cout << opt::bamlist.size() << " BAMs" <<  std::endl;

    for(std::size_t i=0; i<opt::bamlist.size(); i++) {

        // storing results for each read group (RG tag). use
        // read group ID as key.
        std::map<std::string, ScanResults> resultmap;
        // store where the overlap was last found in the case of exome seq
    	std::map<std::string, std::vector<range>::iterator> lastfound;
    	std::vector<range>::iterator searchhint;
        
        std::cerr << "Start analysing BAM " << opt::bamlist[i] << "\n";

        // Open the bam files for reading/writing
        BamTools::BamReader* pBamReader = new BamTools::BamReader;

        pBamReader->Open(opt::bamlist[i]);

        // get bam headers
        const BamTools::SamHeader header = pBamReader ->GetHeader();

//        for(BamTools::SamSequenceConstIterator it = header.Sequences.Begin();
//        						it != header.Sequences.End();++it){
//        	std::cout << "Assembly ID:" << it->AssemblyID << ", Name:" << it->Name << std::endl;
//        }
//        exit(0);

        bool rggroups=false;
        
        if(opt::ignorerg){ // ignore read groups
        	std::cerr << "Treat all reads in BAM as if they were from a same sample" << std::endl;
        	ScanResults results;
        	results.sample = opt::unknown;
        	resultmap[opt::unknown]=results;
        }else{
			std::map <std::string, std::string> readgroups;
			std::map <std::string, std::string> readlibs;

			rggroups = header.HasReadGroups();

			if(rggroups){
				for(BamTools::SamReadGroupConstIterator it = header.ReadGroups.Begin();
						it != header.ReadGroups.End();++it){
					readgroups[it->ID]= it->Sample;
					if(it->HasLibrary()){
						readlibs[it->ID] = it -> Library;
					}else{
						readlibs[it->ID] = opt::unknown;
					}
				}
				std::cerr<<"Specified BAM has "<< readgroups.size()<< " read groups" << std::endl;

				for(std::map<std::string, std::string>::iterator it = readgroups.begin(); it != readgroups.end(); ++it){
					ScanResults results;
					std::string rgid = it -> first;
					results.sample = it -> second;
					results.lib = readlibs[rgid];
					resultmap[rgid]=results; //results are identified by RG tag.
				}

			}else{
				std::cerr << "Warning: can't find RG tag in the BAM header" << std::endl;
				std::cerr << "Warning: treat all reads in BAM as if they were from a same sample" << std::endl;
				ScanResults results;
				results.sample = opt::unknown;
				results.lib = opt::unknown;
				resultmap[opt::unknown]=results;
			}
        }

        BamTools::BamAlignment record1;
        bool done = false;
        
        int nprocessed=0; // number of reads analyzed
        int ntotal=0; // number of reads scanned in bam (we skip some reads, see below)
        while(!done)
        {
            ntotal ++;
            done = !pBamReader -> GetNextAlignment(record1);
            std::string tag = opt::unknown;
            if(rggroups){
                
                // skip reads that do not have read group tag
            	if(record1.HasTag("RG")){
					record1.GetTag("RG", tag);
	//            	std::cerr << c << " reads:{" << record1.QueryBases << "} tag:{" << tag << "}\n";
				}else{
					std::cerr << "can't find RG tag for read at position {" << record1.RefID << ":" << record1.Position << "}" << std::endl;
					std::cerr << "skip this read" << std::endl;
					continue;
				}
            }
            
            // skip reads with readgroup not defined in BAM header
            if(resultmap.find(tag) == resultmap.end()){
				std::cerr << "RG tag {" << tag << "} for read at position ";
				std::cerr << "{" << record1.RefID << ":" << record1.Position << "} doesn't exist in BAM header.";
				continue;
            }

            // for exome, exclude reads mapped to the exome regions.
            if(isExome){
				range rg;
				rg.first = record1.Position;
				rg.second = record1.Position + record1.Length;
				std::string chrm =  refID2Name(record1.RefID);

				if(chrm != "-1"){ // check if overlap exome when the read is mapped to chr1-22, X, Y
					// std::cerr << "read: " << chrm << " " << rg << "\n" << std::endl;
					std::map<std::string, std::vector<range> >::iterator chrmit = opt::exomebed.find(chrm);
					if(chrmit == opt::exomebed.end())
					{
						// std::cerr<<"chromosome or reference sequence: " << chrm << " is not present in the specified exome bed file." <<std::endl;
						// std::cerr<<"please check sequence name encoding, i.e. for chromosome one, is it chr1 or 1" << std::endl;
                        // unmapped reads can have chr names as a star (*). We also don't consider MT reads. 
						resultmap[tag].n_exreadsChrUnmatched +=1; 
					}else{
						std::vector<range>::iterator itend = opt::exomebed[chrm].end();
						std::map<std::string, std::vector<range>::iterator>::iterator lastfoundchrmit = lastfound.find(chrm);
						if(lastfoundchrmit == lastfound.end()){ // first entry to this chrm
							lastfound[chrm] = chrmit->second.begin();// start from begining
						}

						// set the hint to where the previous found is
						searchhint = lastfound[chrm];
						std::vector<range>::iterator itsearch = searchRange(searchhint, itend, rg);
						// if found
						if(itsearch != itend){// if found
							searchhint = itsearch;
							resultmap[tag].n_exreadsExcluded +=1;
							lastfound[chrm] = searchhint; // update search hint
							continue;
						}
					}

				}
            }

            resultmap[tag].numTotal +=1;

            if(record1.IsMapped())
            {
            	resultmap[tag].numMapped += 1;
            }
            if(record1.IsDuplicate()){
            	resultmap[tag].numDuplicates +=1;
            }

            double gc = calcGC(record1.QueryBases);
            int ptn_count = countMotif(record1.QueryBases, opt::PATTERN, opt::PATTERN_REV);
            // when the read length exceeds 100bp, number of patterns might exceed the boundary
            if (ptn_count > ScanParameters::TEL_MOTIF_N-1){
                continue;
            }
            resultmap[tag].telcounts[ptn_count]+=1;

            if(gc >= ScanParameters::GC_LOWERBOUND && gc <= ScanParameters::GC_UPPERBOUND){
            	// get index for GC bin.
            	int idx = floor((gc-ScanParameters::GC_LOWERBOUND)/ScanParameters::GC_BINSIZE);
            	assert(idx >=0 && idx <= ScanParameters::GC_BIN_N-1);
//            	std::cerr << c << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC idx{" << idx << "}\n";
            	if(idx > ScanParameters::GC_BIN_N-1){
            		std::cerr << nprocessed << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC bin index out of bound:" << idx << "\n";
            		exit(EXIT_FAILURE);
            	}
            	resultmap[tag].gccounts[idx]+=1;
            }

            // if(resultmap[tag].n_exreadsChrUnmatched > 1000){
            // 	std::cerr<<"too many reads found with unmatched chromosome ID between BAM and exome BED. \n" << std::endl;
            // }

            nprocessed++;

            if( nprocessed%10000000 == 0){
            	std::cerr << "[scan] processed " << nprocessed << " reads \n" ;
            }
        }
        
		pBamReader->Close();
        delete pBamReader;
        
        // consider each BAM separately
        resultlist.push_back(resultmap);

        std::cerr << "[scan] total reads in BAM scanned " << ntotal << std::endl;
        std::cerr << "Completed scanning BAM\n";

    }

    if(opt::onebam){
        merge_results_by_readgroup(resultlist);
    }
    
    outputresults(resultlist);

    if(isExome){
    	printlog(resultlist);
    }
    
    std::cerr << "Completed writing results\n";

    return 0;
}
Пример #3
0
// Collects flow orders and key sequences from the read groups of a BAM header.
//
// We only store flow orders that are different from each other in the flow order vector.
// The same flow order but different total numbers of flows map to the same flow order object,
// so multiple runs, even with different max flows, point to the same flow order object.
// We assume that the read group name is written in the form <run_id>.<Barcode Name>
//
// Populates:
//   key_by_read_group          read group ID -> key sequence
//   flow_order_index_by_run_id run id        -> index into flow_order_vector
//   num_flows_by_run_id        run id        -> number of flows of that run
//   flow_order_vector          one ion::FlowOrder per distinct (longest) flow order
//
// Prints an error and exits with EXIT_FAILURE when a read group has no ID or no
// flow order, when a read group ID occurs twice, when no run id can be extracted,
// when a run id reappears with a different flow order, or when the header yields
// no flow order at all.
//
// NOTE(review): only flow_order_vector is cleared here; the three maps keep
// whatever they held before the call - confirm this function is called once.
void InputStructures::DetectFlowOrderzAndKeyFromBam(const SamHeader &samHeader){

    flow_order_vector.clear();
    vector<string> temp_flow_order_vector;

    for (BamTools::SamReadGroupConstIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {

      if (itr->ID.empty()){
        cerr << "TVC ERROR: BAM file has a read group without ID." << endl;
        exit(EXIT_FAILURE);
      }
      // We need a flow order to do variant calling so throw an error if there is none.
      if (not itr->HasFlowOrder()) {
        cerr << "TVC ERROR: read group " << itr->ID << " does not have a flow order." << endl;
        exit(EXIT_FAILURE);
      }

      // Check for duplicate read group ID entries and throw an error if one is found
      if (key_by_read_group.find(itr->ID) != key_by_read_group.end()) {
        cerr << "TVC ERROR: Multiple read group entries with ID " << itr->ID << endl;
        exit(EXIT_FAILURE);
      }

      // Store Key Sequence for each read group
      // The read group key in the BAM file contains the full prefix: key sequence + barcode + barcode adapter
      key_by_read_group[itr->ID] = itr->KeySequence;

      // Get run id from read group name: convention <read group name> = <run_id>.<Barcode Name>
      string run_id = itr->ID.substr(0,itr->ID.find('.'));
      if (run_id.empty()) {
        cerr << "TVC ERROR: Unable to extract run id from read group name " << itr->ID << endl;
        exit(EXIT_FAILURE);
      }

      const string &new_flow_order = itr->FlowOrder;

      // Check whether an entry already exists for this run id and whether it is compatible
      std::map<string,int>::const_iterator fo_it = flow_order_index_by_run_id.find(run_id);
      if (fo_it != flow_order_index_by_run_id.end()) {
        const string &stored_flow_order = temp_flow_order_vector.at(fo_it->second);
        // The new flow order must be a prefix of the stored one and have the
        // same number of flows that was recorded for this run id.
        if (stored_flow_order.length() < new_flow_order.length()
            or stored_flow_order.compare(0, new_flow_order.length(), new_flow_order) != 0
            or num_flows_by_run_id.at(run_id) != static_cast<int>(new_flow_order.length()))
        {
          cerr << "TVC ERROR: Flow order information extracted from read group name " << itr->ID
               << " does not match previous entry for run id " << run_id << ": " << endl;
          cerr << "Existing entry : " << stored_flow_order << endl;
          cerr << "Newly extracted: " << new_flow_order << endl;
          exit(EXIT_FAILURE);
        }
        // Found matching entry and everything is OK.
        continue;
      }

      // New run id: Check whether this flow order is the same or a sub/ superset of an existing flow order
      unsigned int iFO = 0;
      for (; iFO < temp_flow_order_vector.size(); iFO++){
        string &stored_flow_order = temp_flow_order_vector.at(iFO);

        if (stored_flow_order.length() >= new_flow_order.length()) {
          // Is the new flow order a subset (prefix) of an existing flow order?
          if (stored_flow_order.compare(0, new_flow_order.length(), new_flow_order) != 0)
            continue;
        } else {
          // Is the new flow order a superset (extension) of an existing flow order?
          if (new_flow_order.compare(0, stored_flow_order.length(), stored_flow_order) != 0)
            continue;
          // Keep the longer flow order as the representative of this group.
          stored_flow_order = new_flow_order;
        }

        flow_order_index_by_run_id[run_id] = iFO;
        num_flows_by_run_id[run_id] = new_flow_order.length();
        break;
      }

      // Do we have a new flow order? (the loop above ran to the end without a match)
      if (iFO == temp_flow_order_vector.size()) {
        temp_flow_order_vector.push_back(new_flow_order);
        flow_order_index_by_run_id[run_id] = iFO;
        num_flows_by_run_id[run_id] = new_flow_order.length();
      }

    } // --- end loop over read groups

    // A header without read groups yields no flow order at all. Fail with a
    // clear message instead of running into an out-of-range access below.
    if (temp_flow_order_vector.empty()) {
      cerr << "TVC ERROR: BAM header does not contain any read group with a flow order." << endl;
      exit(EXIT_FAILURE);
    }

    // Now we have amassed all the unique flow orders and can construct the FlowOrder objects
    for (unsigned int iFO=0; iFO < temp_flow_order_vector.size(); iFO++){
      ion::FlowOrder tempIonFlowOrder(temp_flow_order_vector.at(iFO), temp_flow_order_vector.at(iFO).length());
      flow_order_vector.push_back(tempIonFlowOrder);
    }

    // Verbose output: comma separated list of the number of flows per flow order
    cout << "TVC found a total of " << flow_order_vector.size() << " different flow orders of max flow lengths: ";
    for (unsigned int iFO=0; iFO < flow_order_vector.size(); iFO++) {
      if (iFO > 0)
        cout << ',';
      cout << flow_order_vector.at(iFO).num_flows();
    }
    cout << endl;

}