void InputStructures::DetectFlowOrderzAndKeyFromBam(const SamHeader &samHeader){
  
//TODO Need to handle multiple BAM files with different flowOrders, at least throw an error for now.
    for (BamTools::SamReadGroupConstIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {
        if (itr->HasFlowOrder()) {
            string tmpflowOrder = itr->FlowOrder;
            if (bamFlowOrderVector.empty()){
                bamFlowOrderVector.push_back(tmpflowOrder);
                flowOrder = tmpflowOrder; // first one free
            }else { //check if the flowOrder is the same if not throw an error, for now we dont support bams with different flow orders
                vector<string>::iterator it = std::find(bamFlowOrderVector.begin(), bamFlowOrderVector.end(), tmpflowOrder);
                if (it == bamFlowOrderVector.end()) {
                   // check to see if flowOrder is a substring/superstring first
                  std::size_t found_me = std::string::npos; // assume bad
                  if (tmpflowOrder.length()>flowOrder.length()){
                    found_me = tmpflowOrder.find(flowOrder);
                    if (found_me==0){ // must find at first position
                      flowOrder = tmpflowOrder; // longer superstring
                      bamFlowOrderVector.push_back(tmpflowOrder);
                      //cout<< "Super: " << tmpflowOrder.length() << " " << tmpflowOrder << endl;
                    } else
                      found_me = std::string::npos;

                  }else{
                    found_me = flowOrder.find(tmpflowOrder);
                    if (found_me==0){ // must find at first position
                      // substring, so no need to update flowOrder
                      bamFlowOrderVector.push_back(tmpflowOrder);
                      //cout << "Sub: " << tmpflowOrder.length() << " "<< tmpflowOrder << endl;
                    } else
                      found_me = std::string::npos;
                  }

                  if (found_me==std::string::npos){
                    cerr << "FATAL ERROR: BAM files specified as input have different flow orders. Currently tvc supports only BAM files with same flow order. " << endl;
                    exit(-1);
                  }
                }
            }
            flowKey = itr->KeySequence;

        }

    }
    if (bamFlowOrderVector.size()>1)
      cout << "Compatibly nested flow orders found: " << bamFlowOrderVector.size() << " using longest, nFlows=  " << flowOrder.length() << endl;
    //cout << "Final: " << flowOrder.length() << " " << flowOrder << endl;
    nFlows = flowOrder.length();

    if (nFlows > 0) {
        flowSigPresent = true;
        treePhaserFlowOrder.SetFlowOrder(flowOrder, nFlows);
        key.Set(treePhaserFlowOrder, flowKey, "key");
    }

}
Exemplo n.º 2
0
void InputStructures::DetectFlowOrderzAndKeyFromBam(const SamHeader &samHeader){

    // We only store flow orders that are different from each other in the flow order vector.
    // The same flow order but different total numbers of flow map to the  same flow order object
    // So multiple runs, even with different max flows, point to the same flow order object
    // We assume that the read group name is written in the form <run_id>.<Barcode Name>

    flow_order_vector.clear();
    vector<string> temp_flow_order_vector;
    int num_read_groups = 0;

    for (BamTools::SamReadGroupConstIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {

      num_read_groups++;
      if (itr->ID.empty()){
        cerr << "TVC ERROR: BAM file has a read group without ID." << endl;
        exit(EXIT_FAILURE);
      }
      // We need a flow order to do variant calling so throw an error if there is none.
      if (not itr->HasFlowOrder()) {
        cerr << "TVC ERROR: read group " << itr->ID << " does not have a flow order." << endl;
        exit(EXIT_FAILURE);
      }

      // Check for duplicate read group ID entries and throw an error if one is found
      std::map<string,string>::const_iterator key_it = key_by_read_group.find(itr->ID);
      if (key_it != key_by_read_group.end()) {
        cerr << "TVC ERROR: Multiple read group entries with ID " << itr->ID << endl;
        exit(EXIT_FAILURE);
      }

      // Store Key Sequence for each read group
      // The read group key in the BAM file contains the full prefix: key sequence + barcode + barcode adapter
      key_by_read_group[itr->ID] = itr->KeySequence;

      // Get run id from read group name: convention <read group name> = <run_id>.<Barcode Name>
      string run_id = itr->ID.substr(0,itr->ID.find('.'));
      if (run_id.empty()) {
        cerr << "TVC ERROR: Unable to extract run id from read group name " << itr->ID << endl;
        exit(EXIT_FAILURE);
      }

      // Check whether an entry already exists for this run id and whether it is compatible
      std::map<string,int>::const_iterator fo_it = flow_order_index_by_run_id.find(run_id);
      if (fo_it != flow_order_index_by_run_id.end()) {
    	// Flow order for this run id may be equal or a subset of the stored one
        if (temp_flow_order_vector.at(fo_it->second).length() < itr->FlowOrder.length()
            or temp_flow_order_vector.at(fo_it->second).substr(0, itr->FlowOrder.length()) != itr->FlowOrder
            or num_flows_by_run_id.at(run_id) != (int)(itr->FlowOrder).length())
        {
          cerr << "TVC ERROR: Flow order information extracted from read group name " << itr->ID
               << " does not match previous entry for run id " << run_id << ": " << endl;
          cerr << "Exiting entry  : " << temp_flow_order_vector.at(fo_it->second) << endl;
          cerr << "Newly extracted: " << itr->FlowOrder << endl;
          cerr << temp_flow_order_vector.at(fo_it->second) << endl;
          exit(EXIT_FAILURE);
        }
        // Found matching entry and everything is OK.
        continue;
      }

      // New run id: Check whether this flow order is the same or a sub/ superset of an existing flow order
      unsigned int iFO = 0;
      for (; iFO< temp_flow_order_vector.size(); iFO++){

        // Is the new flow order a subset of an existing flow order?
        if ( temp_flow_order_vector.at(iFO).length() >= itr->FlowOrder.length() ) {
          if (temp_flow_order_vector.at(iFO).substr(0, itr->FlowOrder.length()) == itr->FlowOrder ) {
            flow_order_index_by_run_id[run_id] = iFO;
            num_flows_by_run_id[run_id] = itr->FlowOrder.length();
            break;
          }
          else
            continue;
        }

        // Is the new flow order a superset of an existing flow order?
        if ( temp_flow_order_vector.at(iFO).length() < itr->FlowOrder.length() ) {
          if ( itr->FlowOrder.substr(0, temp_flow_order_vector.at(iFO).length()) == temp_flow_order_vector.at(iFO) ) {
            temp_flow_order_vector.at(iFO) = itr->FlowOrder;
            flow_order_index_by_run_id[run_id] = iFO;
            num_flows_by_run_id[run_id] = itr->FlowOrder.length();
            break;
          }
        }
      }

      // Do we have a new flow order?
      if (iFO == temp_flow_order_vector.size()) {
    	temp_flow_order_vector.push_back(itr->FlowOrder);
    	flow_order_index_by_run_id[run_id] = iFO;
    	num_flows_by_run_id[run_id] = itr->FlowOrder.length();
      }

    } // --- end loop over read groups

    // Now we have amassed all the unique flow orders and can construct the FlowOrder objects
    for (unsigned int iFO=0; iFO < temp_flow_order_vector.size(); iFO++){
      ion::FlowOrder tempIonFlowOrder(temp_flow_order_vector.at(iFO), temp_flow_order_vector.at(iFO).length());
      flow_order_vector.push_back(tempIonFlowOrder);
    }

    // Verbose output
    cout << "TVC found a total of " << flow_order_vector.size() << " different flow orders of max flow lengths: ";
    int iFO=0;
    for (; iFO<(int)flow_order_vector.size()-1; iFO++)
      cout << flow_order_vector.at(iFO).num_flows() << ',';
    cout << flow_order_vector.at(iFO).num_flows() << endl;

}