void InputStructures::DetectFlowOrderzAndKeyFromBam(const SamHeader &samHeader){ //TODO Need to handle multiple BAM files with different flowOrders, at least throw an error for now. for (BamTools::SamReadGroupConstIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) { if (itr->HasFlowOrder()) { string tmpflowOrder = itr->FlowOrder; if (bamFlowOrderVector.empty()){ bamFlowOrderVector.push_back(tmpflowOrder); flowOrder = tmpflowOrder; // first one free }else { //check if the flowOrder is the same if not throw an error, for now we dont support bams with different flow orders vector<string>::iterator it = std::find(bamFlowOrderVector.begin(), bamFlowOrderVector.end(), tmpflowOrder); if (it == bamFlowOrderVector.end()) { // check to see if flowOrder is a substring/superstring first std::size_t found_me = std::string::npos; // assume bad if (tmpflowOrder.length()>flowOrder.length()){ found_me = tmpflowOrder.find(flowOrder); if (found_me==0){ // must find at first position flowOrder = tmpflowOrder; // longer superstring bamFlowOrderVector.push_back(tmpflowOrder); //cout<< "Super: " << tmpflowOrder.length() << " " << tmpflowOrder << endl; } else found_me = std::string::npos; }else{ found_me = flowOrder.find(tmpflowOrder); if (found_me==0){ // must find at first position // substring, so no need to update flowOrder bamFlowOrderVector.push_back(tmpflowOrder); //cout << "Sub: " << tmpflowOrder.length() << " "<< tmpflowOrder << endl; } else found_me = std::string::npos; } if (found_me==std::string::npos){ cerr << "FATAL ERROR: BAM files specified as input have different flow orders. Currently tvc supports only BAM files with same flow order. " << endl; exit(-1); } } } flowKey = itr->KeySequence; } } if (bamFlowOrderVector.size()>1) cout << "Compatibly nested flow orders found: " << bamFlowOrderVector.size() << " using longest, nFlows= " << flowOrder.length() << endl; //cout << "Final: " << flowOrder.length() << " " << flowOrder << endl; nFlows = flowOrder.length(); if (nFlows > 0) { flowSigPresent = true; treePhaserFlowOrder.SetFlowOrder(flowOrder, nFlows); key.Set(treePhaserFlowOrder, flowKey, "key"); } }
void InputStructures::DetectFlowOrderzAndKeyFromBam(const SamHeader &samHeader){ // We only store flow orders that are different from each other in the flow order vector. // The same flow order but different total numbers of flow map to the same flow order object // So multiple runs, even with different max flows, point to the same flow order object // We assume that the read group name is written in the form <run_id>.<Barcode Name> flow_order_vector.clear(); vector<string> temp_flow_order_vector; int num_read_groups = 0; for (BamTools::SamReadGroupConstIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) { num_read_groups++; if (itr->ID.empty()){ cerr << "TVC ERROR: BAM file has a read group without ID." << endl; exit(EXIT_FAILURE); } // We need a flow order to do variant calling so throw an error if there is none. if (not itr->HasFlowOrder()) { cerr << "TVC ERROR: read group " << itr->ID << " does not have a flow order." << endl; exit(EXIT_FAILURE); } // Check for duplicate read group ID entries and throw an error if one is found std::map<string,string>::const_iterator key_it = key_by_read_group.find(itr->ID); if (key_it != key_by_read_group.end()) { cerr << "TVC ERROR: Multiple read group entries with ID " << itr->ID << endl; exit(EXIT_FAILURE); } // Store Key Sequence for each read group // The read group key in the BAM file contains the full prefix: key sequence + barcode + barcode adapter key_by_read_group[itr->ID] = itr->KeySequence; // Get run id from read group name: convention <read group name> = <run_id>.<Barcode Name> string run_id = itr->ID.substr(0,itr->ID.find('.')); if (run_id.empty()) { cerr << "TVC ERROR: Unable to extract run id from read group name " << itr->ID << endl; exit(EXIT_FAILURE); } // Check whether an entry already exists for this run id and whether it is compatible std::map<string,int>::const_iterator fo_it = flow_order_index_by_run_id.find(run_id); if (fo_it != flow_order_index_by_run_id.end()) { // Flow order for this run id may be equal or a subset of the stored one if (temp_flow_order_vector.at(fo_it->second).length() < itr->FlowOrder.length() or temp_flow_order_vector.at(fo_it->second).substr(0, itr->FlowOrder.length()) != itr->FlowOrder or num_flows_by_run_id.at(run_id) != (int)(itr->FlowOrder).length()) { cerr << "TVC ERROR: Flow order information extracted from read group name " << itr->ID << " does not match previous entry for run id " << run_id << ": " << endl; cerr << "Exiting entry : " << temp_flow_order_vector.at(fo_it->second) << endl; cerr << "Newly extracted: " << itr->FlowOrder << endl; cerr << temp_flow_order_vector.at(fo_it->second) << endl; exit(EXIT_FAILURE); } // Found matching entry and everything is OK. continue; } // New run id: Check whether this flow order is the same or a sub/ superset of an existing flow order unsigned int iFO = 0; for (; iFO< temp_flow_order_vector.size(); iFO++){ // Is the new flow order a subset of an existing flow order? if ( temp_flow_order_vector.at(iFO).length() >= itr->FlowOrder.length() ) { if (temp_flow_order_vector.at(iFO).substr(0, itr->FlowOrder.length()) == itr->FlowOrder ) { flow_order_index_by_run_id[run_id] = iFO; num_flows_by_run_id[run_id] = itr->FlowOrder.length(); break; } else continue; } // Is the new flow order a superset of an existing flow order? if ( temp_flow_order_vector.at(iFO).length() < itr->FlowOrder.length() ) { if ( itr->FlowOrder.substr(0, temp_flow_order_vector.at(iFO).length()) == temp_flow_order_vector.at(iFO) ) { temp_flow_order_vector.at(iFO) = itr->FlowOrder; flow_order_index_by_run_id[run_id] = iFO; num_flows_by_run_id[run_id] = itr->FlowOrder.length(); break; } } } // Do we have a new flow order? if (iFO == temp_flow_order_vector.size()) { temp_flow_order_vector.push_back(itr->FlowOrder); flow_order_index_by_run_id[run_id] = iFO; num_flows_by_run_id[run_id] = itr->FlowOrder.length(); } } // --- end loop over read groups // Now we have amassed all the unique flow orders and can construct the FlowOrder objects for (unsigned int iFO=0; iFO < temp_flow_order_vector.size(); iFO++){ ion::FlowOrder tempIonFlowOrder(temp_flow_order_vector.at(iFO), temp_flow_order_vector.at(iFO).length()); flow_order_vector.push_back(tempIonFlowOrder); } // Verbose output cout << "TVC found a total of " << flow_order_vector.size() << " different flow orders of max flow lengths: "; int iFO=0; for (; iFO<(int)flow_order_vector.size()-1; iFO++) cout << flow_order_vector.at(iFO).num_flows() << ','; cout << flow_order_vector.at(iFO).num_flows() << endl; }