Beispiel #1
0
// Example driver: opens the BAM file named by argv[1], lists its reference
// sequences with their lengths, then dumps every read (string form, bases,
// CIGAR operations, and the query/match/target alignment strings).
//
// Returns 0 on success, 1 when no BAM path was supplied or the reader could
// not be opened.
int main(int argc, char* argv[])
{
	// A BAM path is mandatory; without this guard argv[1] would be read
	// past the end of the argument list.
	if (argc < 2) {
		std::cerr << "Usage: " << (argc > 0 ? argv[0] : "bamreader") << " <in.bam>" << std::endl;
		return 1;
	}

	// Open a BAM file:
	char* bamFile  = argv[1];
	//	char* bamIndex = argv[2];

	BAMReader reader(bamFile); // , bamIndex);
	reader.open();
	// Checked at runtime rather than with assert(), which is compiled out
	// when NDEBUG is defined and would let a failed open go unnoticed.
	if (!reader) {
		std::cerr << "Could not open BAM file: " << bamFile << std::endl;
		return 1;
	}

	// Print out list of reference sequences, and their lengths:
	const int numRefs = reader.num_refs();
	cout << "Found " << numRefs << " reference sequences:" << endl;
	for(int i=0; i<numRefs; ++i)
		cout << setw(9) << reader.refs()[i] << "    " << reader.lens()[i] << endl;

	// Print out list of reads:
	for (BAMReader::iterator i = reader.get_iterator(); i.good(); i.next()) {
	  BAMRead read = i.get();
	  cout << read.to_string();

	  // Bases of the read, one nucleotide at a time (nuc from SEQ).
	  for (Sequence::iterator s_iter = read.get_seq().get_iterator(); s_iter.good(); s_iter.next())
	    cout << s_iter.get();
	  cout << endl;

	  // CIGAR operations, printed as "length:op; ".
	  for (Cigar::iterator c_iter = read.get_cigar().get_iterator(); c_iter.good(); c_iter.next())
	    cout << c_iter.len() << ":" << c_iter.op() << "; ";
	  cout << endl;

	  // Don't use MD directly.  Use BAMUtils.
	  BAMUtils utils(read);
	  cout << utils.get_qdna() << endl << utils.get_matcha() << endl << utils.get_tdna() << endl << endl;

	}

	return 0;
}
Beispiel #2
0
int main(int argc, const char *argv[])
{
#ifdef _DEBUG
  atexit(memstatus);
  dbgmemInit();
#endif /* _DEBUG */

  printf ("%s - %s-%s (%s)\n", argv[0], IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str());

  string bamInputFilename;
  string fastaInputFilename;
  string jsonOutputFilename;
  bool help;

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  opts.GetOption(bamInputFilename,    "",             '-',  "bam");
  opts.GetOption(fastaInputFilename,  "",             '-',  "ref");
  opts.GetOption(jsonOutputFilename,  "TFStats.json", '-',  "output-json");
  opts.GetOption(help,                "false",        'h',  "help");
  opts.CheckNoLeftovers();

  if (help || bamInputFilename.empty() || fastaInputFilename.empty())
    return showHelp();


  // Parse BAM header

  BAMReader bamReader(bamInputFilename);
  bamReader.open();
  bam_header_t *header = (bam_header_t *)bamReader.get_header_ptr();

  int numFlows = 0;
  string flowOrder;
  string key;

  if (header->l_text >= 3) {
    if (header->dict == 0)
      header->dict = sam_header_parse2(header->text);
    int nEntries = 0;
    char **tmp = sam_header2list(header->dict, "RG", "FO", &nEntries);
    if (nEntries) {
      flowOrder = tmp[0];
      numFlows = flowOrder.length();
    }
    if (tmp)
      free(tmp);
    nEntries = 0;
    tmp = sam_header2list(header->dict, "RG", "KS", &nEntries);
    if (nEntries) {
      key = tmp[0];
    }
    if (tmp)
      free(tmp);
  }

  if (numFlows <= 0) {
    fprintf(stderr, "[TFMapper] Could not retrieve flow order from FO BAM tag. SFF-specific tags absent?\n");
    exit(1);
  }
  if (key.empty()) {
    fprintf(stderr, "[TFMapper] Could not retrieve key sequence from KS BAM tag. SFF-specific tags absent?\n");
    exit(1);
  }
  //printf("Retrieved flow order from bam: %s (%d)\n", flowOrder.c_str(), numFlows);
  //printf("Retrieved key from bam: %s\n", key.c_str());


  // Retrieve test fragment sequences

  vector<string>  referenceSequences;
  PopulateReferenceSequences(referenceSequences, fastaInputFilename, header->n_targets, header->target_name, string(""));


  //  Process the BAM reads and generate metrics

  int numTFs = header->n_targets;
  vector<int>     TFCount(numTFs,0);
  MetricGeneratorQualityHistograms  metricGeneratorQualityHistograms[numTFs];
  MetricGeneratorHPAccuracy         metricGeneratorHPAccuracy[numTFs];
  MetricGeneratorSNR                metricGeneratorSNR[numTFs];
  MetricGeneratorAvgIonogram        metricGeneratorAvgIonogram[numTFs];

  for (BAMReader::iterator i = bamReader.get_iterator(); i.good(); i.next()) {

    BAMRead bamRead = i.get();
    int bestTF = bamRead.get_tid();
    if (bestTF < 0)
      continue;
    BAMUtils bamUtil(bamRead);
    TFCount[bestTF]++;

    // Extract flowspace signal from FZ BAM tag

    uint16_t *bam_flowgram = NULL;
    uint8_t *fz = bam_aux_get(bamRead.get_bam_ptr(), "FZ");
    if (fz != NULL) {
      if (fz[0] == (uint8_t)'B' && fz[1] == (uint8_t)'S' && *((uint32_t *)(fz+2)) == (uint32_t)numFlows)
        bam_flowgram = (uint16_t *)(fz+6);
    }
    if (bam_flowgram == NULL) {
      fprintf(stderr, "[TFMapper] Could not retrieve flow signal from FZ BAM tag. SFF-specific tags absent?\n");
      exit(1);
    }


    // Use alignments to generate "synchronized" flowspace reference and read ionograms
    // TODO: Do proper flowspace alignment

    string genome = key + bamUtil.get_tdna();
    string calls = key + bamUtil.get_qdna();

    int numBases = min(genome.length(),calls.length());
    vector<int> refIonogram(numFlows, 0);
    vector<int> readIonogram(numFlows, 0);

    int numFlowsRead = 0;
    int numFlowsRef = 0;
    char gC = flowOrder[0];
    int gBC = 0;

    for (int iBase = 0; (iBase < numBases) && (numFlowsRead < numFlows) && (numFlowsRef < numFlows); iBase++) {

      // Conversion for reads (independent of reference)
      if (calls[iBase] != '-') {
        while ((calls[iBase] != flowOrder[numFlowsRead]) && (numFlowsRead < numFlows))
          numFlowsRead++;
        if (numFlowsRead < numFlows)
          readIonogram[numFlowsRead]++;
      }

      if (genome[iBase] != '-') {

        if (genome[iBase] != gC) {
          // Since a new homopolymer begins, need to drop off the old one
          while ((gC != flowOrder[numFlowsRef]) && (numFlowsRef < numFlows)) {
            numFlowsRef++;
            if (numFlowsRef < numFlows)
              refIonogram[numFlowsRef] = 0;
          }
          if (numFlowsRef < numFlows)
            refIonogram[numFlowsRef] = gBC;

          gC = genome[iBase];
          gBC = 0;
        }
        gBC++;

        if (genome[iBase] == calls[iBase])
          numFlowsRef = numFlowsRead;
      }
    }

    int validFlows = min(numFlowsRef, numFlowsRead);


    metricGeneratorSNR[bestTF].AddElement(bam_flowgram ,key.c_str(), flowOrder);
    metricGeneratorAvgIonogram[bestTF].AddElement(bam_flowgram, numFlows);
    metricGeneratorQualityHistograms[bestTF].AddElement(bamUtil.get_phred_len(10),bamUtil.get_phred_len(17));
    for (int iFlow = 0; iFlow < validFlows-20; iFlow++)
      metricGeneratorHPAccuracy[bestTF].AddElement(refIonogram[iFlow],readIonogram[iFlow]);
  }


  // Save stats to a json file

  Json::Value outputJson(Json::objectValue);

  for(int i = 0; i < numTFs; i++) {
    if (TFCount[i] < minTFCount)
      continue;

    Json::Value currentTFJson(Json::objectValue);
    currentTFJson["TF Name"] = header->target_name[i];
    currentTFJson["TF Seq"] = referenceSequences[i];
    currentTFJson["Num"] = TFCount[i];
    currentTFJson["Top Reads"] = Json::Value(Json::arrayValue); // Obsolete

    metricGeneratorSNR[i].PrintSNR(currentTFJson);
    metricGeneratorHPAccuracy[i].PrintHPAccuracy(currentTFJson);
    metricGeneratorQualityHistograms[i].PrintMetrics(currentTFJson);
    metricGeneratorAvgIonogram[i].PrintIonograms(currentTFJson);

    outputJson[header->target_name[i]] = currentTFJson;
  }

  bamReader.close();  // Closing invalidates the header pointers

  if (!jsonOutputFilename.empty()) {
    ofstream out(jsonOutputFilename.c_str(), ios::out);
    if (out.good())
      out << outputJson.toStyledString();
  }

  return 0;
}